From 81a71f51b89e84f39df2a3b1daf4274ae6b7b194 Mon Sep 17 00:00:00 2001
From: Fei Li <fei1.li@intel.com>
Date: Thu, 4 Aug 2022 13:59:03 +0800
Subject: x86/acrn: Set up timekeeping

ACRN Hypervisor reports timing information via CPUID leaf 0x40000010.
Get the TSC and CPU frequency via CPUID leaf 0x40000010 and set the
kernel values accordingly.

Signed-off-by: Fei Li <fei1.li@intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Conghui <conghui.chen@intel.com>
Link: https://lore.kernel.org/r/20220804055903.365211-1-fei1.li@intel.com
---
 arch/x86/include/asm/acrn.h | 14 ++++++++++++++
 arch/x86/kernel/cpu/acrn.c  |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
index e003a01b7c67..1dd14381bcb6 100644
--- a/arch/x86/include/asm/acrn.h
+++ b/arch/x86/include/asm/acrn.h
@@ -10,6 +10,15 @@
 /* Bit 0 indicates whether guest VM is privileged */
 #define	ACRN_FEATURE_PRIVILEGED_VM	BIT(0)
 
+/*
+ * Timing Information.
+ * This leaf returns the current TSC frequency in kHz.
+ *
+ * EAX: (Virtual) TSC frequency in kHz.
+ * EBX, ECX, EDX: RESERVED (reserved fields are set to zero).
+ */
+#define ACRN_CPUID_TIMING_INFO		0x40000010
+
 void acrn_setup_intr_handler(void (*handler)(void));
 void acrn_remove_intr_handler(void);
 
@@ -21,6 +30,11 @@ static inline u32 acrn_cpuid_base(void)
 	return 0;
 }
 
+static inline unsigned long acrn_get_tsc_khz(void)
+{
+	return cpuid_eax(ACRN_CPUID_TIMING_INFO);
+}
+
 /*
  * Hypercalls for ACRN
  *
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 23f5f27b5a02..485441b7f030 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -28,6 +28,9 @@ static void __init acrn_init_platform(void)
 {
 	/* Setup the IDT for ACRN hypervisor callback */
 	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
+
+	x86_platform.calibrate_tsc = acrn_get_tsc_khz;
+	x86_platform.calibrate_cpu = acrn_get_tsc_khz;
 }
 
 static bool acrn_x2apic_available(void)
-- 
cgit v1.2.3


From fc04b2ccf0edc49e53d2e1251d122e40285233e6 Mon Sep 17 00:00:00 2001
From: Mateusz Jończyk <mat.jonczyk@o2.pl>
Date: Sat, 13 Aug 2022 15:10:33 +0200
Subject: x86/rtc: Rewrite & simplify mach_get_cmos_time() by deleting
 duplicated functionality
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are functions in drivers/rtc/rtc-mc146818-lib.c that handle
reading from / writing to the CMOS RTC clock. mach_get_cmos_time() in
arch/x86/kernel/rtc.c did not use them and was mostly a duplicate of
mc146818_get_time(). Modify mach_get_cmos_time() to use
mc146818_get_time() and remove the duplicated functionality.

mach_get_cmos_time() used a different algorithm than
mc146818_get_time(), but these functions are equivalent. The major
differences are:

- mc146818_get_time() is better refined and handles various edge
  conditions,

- when the UIP ("Update in progress") bit of the RTC is set,
  mach_get_cmos_time() was busy waiting with cpu_relax() while
  mc146818_get_time() is using mdelay(1) in every loop iteration.
  (However, there is my commit merged for Linux 5.20 / 6.0 to decrease
  this period to 100us:
    commit d2a632a8a117 ("rtc: mc146818-lib: reduce RTC_UIP polling period")
  ),

- mach_get_cmos_time() assumed that the RTC year is >= 2000, which
  may not be true on some old boxes with a dead battery,

- mach_get_cmos_time() was holding the rtc_lock for a long time
  and could hang if the RTC is broken or not present.

The RTC writing counterpart, mach_set_rtc_mmss() is already using
mc146818_get_time() from drivers/rtc. This was done in
        commit 3195ef59cb42 ("x86: Do full rtc synchronization with ntp")
It appears that mach_get_cmos_time() was simply forgotten.

mach_get_cmos_time() is really used only in read_persistent_clock64(),
which is called only in a few places in kernel/time/timekeeping.c .

[ mingo: These changes are not supposed to change behavior, but they are
         not identity transformations either, as mc146818_get_time() is a
	 better but different implementation of the same logic - so
	 regressions are possible in principle. ]

Signed-off-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
Link: https://lore.kernel.org/r/20220813131034.768527-1-mat.jonczyk@o2.pl
---
 arch/x86/kernel/rtc.c | 59 ++++++---------------------------------------------
 1 file changed, 7 insertions(+), 52 deletions(-)

diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 586f718b8e95..1cadc8a15267 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -4,11 +4,8 @@
  */
 #include <linux/platform_device.h>
 #include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/bcd.h>
 #include <linux/export.h>
 #include <linux/pnp.h>
-#include <linux/of.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
@@ -20,15 +17,12 @@
 /*
  * This is a special lock that is owned by the CPU and holds the index
  * register we are working with.  It is required for NMI access to the
- * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
+ * CMOS/RTC registers.  See arch/x86/include/asm/mc146818rtc.h for details.
  */
 volatile unsigned long cmos_lock;
 EXPORT_SYMBOL(cmos_lock);
 #endif /* CONFIG_X86_32 */
 
-/* For two digit years assume time is always after that */
-#define CMOS_YEARS_OFFS 2000
-
 DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
@@ -62,8 +56,7 @@ int mach_set_rtc_mmss(const struct timespec64 *now)
 
 void mach_get_cmos_time(struct timespec64 *now)
 {
-	unsigned int status, year, mon, day, hour, min, sec, century = 0;
-	unsigned long flags;
+	struct rtc_time tm;
 
 	/*
 	 * If pm_trace abused the RTC as storage, set the timespec to 0,
@@ -74,51 +67,13 @@ void mach_get_cmos_time(struct timespec64 *now)
 		return;
 	}
 
-	spin_lock_irqsave(&rtc_lock, flags);
-
-	/*
-	 * If UIP is clear, then we have >= 244 microseconds before
-	 * RTC registers will be updated.  Spec sheet says that this
-	 * is the reliable way to read RTC - registers. If UIP is set
-	 * then the register access might be invalid.
-	 */
-	while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
-		cpu_relax();
-
-	sec = CMOS_READ(RTC_SECONDS);
-	min = CMOS_READ(RTC_MINUTES);
-	hour = CMOS_READ(RTC_HOURS);
-	day = CMOS_READ(RTC_DAY_OF_MONTH);
-	mon = CMOS_READ(RTC_MONTH);
-	year = CMOS_READ(RTC_YEAR);
-
-#ifdef CONFIG_ACPI
-	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
-	    acpi_gbl_FADT.century)
-		century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-
-	status = CMOS_READ(RTC_CONTROL);
-	WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
-
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
-		sec = bcd2bin(sec);
-		min = bcd2bin(min);
-		hour = bcd2bin(hour);
-		day = bcd2bin(day);
-		mon = bcd2bin(mon);
-		year = bcd2bin(year);
+	if (mc146818_get_time(&tm)) {
+		pr_err("Unable to read current time from RTC\n");
+		now->tv_sec = now->tv_nsec = 0;
+		return;
 	}
 
-	if (century) {
-		century = bcd2bin(century);
-		year += century * 100;
-	} else
-		year += CMOS_YEARS_OFFS;
-
-	now->tv_sec = mktime64(year, mon, day, hour, min, sec);
+	now->tv_sec = rtc_tm_to_time64(&tm);
 	now->tv_nsec = 0;
 }
 
-- 
cgit v1.2.3


From e1a6bc7c6969527dbe0afa4801a0237e41e26b1b Mon Sep 17 00:00:00 2001
From: Mateusz Jończyk <mat.jonczyk@o2.pl>
Date: Sat, 13 Aug 2022 15:10:34 +0200
Subject: x86/rtc: Rename mach_set_rtc_mmss() to mach_set_cmos_time()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Once upon a time, before this commit in 2013:

   3195ef59cb42 ("x86: Do full rtc synchronization with ntp")

... the mach_set_rtc_mmss() function set only the minutes and seconds
registers of the CMOS RTC - hence the '_mmss' postfix.

This is no longer true, so rename the function to mach_set_cmos_time().

[ mingo: Expanded changelog a bit. ]

Signed-off-by: Mateusz Jończyk <mat.jonczyk@o2.pl>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20220813131034.768527-2-mat.jonczyk@o2.pl
---
 arch/x86/include/asm/mc146818rtc.h | 2 +-
 arch/x86/kernel/rtc.c              | 4 ++--
 arch/x86/kernel/x86_init.c         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 97198001e567..6115bb3d5795 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -95,7 +95,7 @@ static inline unsigned char current_lock_cmos_reg(void)
 unsigned char rtc_cmos_read(unsigned char addr);
 void rtc_cmos_write(unsigned char val, unsigned char addr);
 
-extern int mach_set_rtc_mmss(const struct timespec64 *now);
+extern int mach_set_cmos_time(const struct timespec64 *now);
 extern void mach_get_cmos_time(struct timespec64 *now);
 
 #define RTC_IRQ 8
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cadc8a15267..349046434513 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -27,13 +27,13 @@ DEFINE_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
 /*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * In order to set the CMOS clock precisely, mach_set_cmos_time has to be
  * called 500 ms after the second nowtime has started, because when
  * nowtime is written into the registers of the CMOS clock, it will
  * jump to the next second precisely 500 ms later. Check the Motorola
  * MC146818A or Dallas DS12887 data sheet for details.
  */
-int mach_set_rtc_mmss(const struct timespec64 *now)
+int mach_set_cmos_time(const struct timespec64 *now)
 {
 	unsigned long long nowtime = now->tv_sec;
 	struct rtc_time tm;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index e84ee5cdbd8c..57353519bc11 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,7 +138,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
 	.calibrate_cpu			= native_calibrate_cpu_early,
 	.calibrate_tsc			= native_calibrate_tsc,
 	.get_wallclock			= mach_get_cmos_time,
-	.set_wallclock			= mach_set_rtc_mmss,
+	.set_wallclock			= mach_set_cmos_time,
 	.iommu_shutdown			= iommu_shutdown_noop,
 	.is_untracked_pat_range		= is_ISA_range,
 	.nmi_init			= default_nmi_init,
-- 
cgit v1.2.3


From ee56a283988d739c25d2d00ffb22707cb487ab47 Mon Sep 17 00:00:00 2001
From: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Date: Fri, 12 Aug 2022 11:18:03 -0700
Subject: x86/sgx: Improve comments for sgx_encl_lookup/alloc_backing()

Modify the comments for sgx_encl_lookup_backing() and for
sgx_encl_alloc_backing() to indicate that they take a reference
which must be dropped with a call to sgx_encl_put_backing().
Make sgx_encl_lookup_backing() static for now, and change the
name of sgx_encl_get_backing() to __sgx_encl_get_backing() to
make it more clear that sgx_encl_get_backing() is an internal
function.

Signed-off-by: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/all/YtUs3MKLzFg+rqEV@zn.tnic/
---
 arch/x86/kernel/cpu/sgx/encl.c | 21 ++++++++++++++-------
 arch/x86/kernel/cpu/sgx/encl.h |  2 --
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 24c1bb8eb196..f40d64206ded 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,9 @@
 #include "encls.h"
 #include "sgx.h"
 
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+			    struct sgx_backing *backing);
+
 #define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
 /*
  * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
@@ -914,7 +917,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
 }
 
 /**
- * sgx_encl_get_backing() - Pin the backing storage
+ * __sgx_encl_get_backing() - Pin the backing storage
  * @encl:	an enclave pointer
  * @page_index:	enclave page index
  * @backing:	data for accessing backing storage for the page
@@ -926,7 +929,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
  *   0 on success,
  *   -errno otherwise.
  */
-static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
 	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
@@ -1001,7 +1004,7 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
 }
 
 /**
- * sgx_encl_alloc_backing() - allocate a new backing storage page
+ * sgx_encl_alloc_backing() - create a new backing storage page
  * @encl:	an enclave pointer
  * @page_index:	enclave page index
  * @backing:	data for accessing backing storage for the page
@@ -1009,7 +1012,9 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
  * When called from ksgxd, sets the active memcg from one of the
  * mms in the enclave's mm_list prior to any backing page allocation,
  * in order to ensure that shmem page allocations are charged to the
- * enclave.
+ * enclave.  Create a backing page for loading data back into an EPC page with
+ * ELDU.  This function takes a reference on a new backing page which
+ * must be dropped with a corresponding call to sgx_encl_put_backing().
  *
  * Return:
  *   0 on success,
@@ -1022,7 +1027,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
 	struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
 	int ret;
 
-	ret = sgx_encl_get_backing(encl, page_index, backing);
+	ret = __sgx_encl_get_backing(encl, page_index, backing);
 
 	set_active_memcg(memcg);
 	mem_cgroup_put(encl_memcg);
@@ -1040,15 +1045,17 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
  * It is the caller's responsibility to ensure that it is appropriate to use
  * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
  * not used correctly, this will cause an allocation which is not accounted for.
+ * This function takes a reference on an existing backing page which must be
+ * dropped with a corresponding call to sgx_encl_put_backing().
  *
  * Return:
  *   0 on success,
  *   -errno otherwise.
  */
-int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
 			   struct sgx_backing *backing)
 {
-	return sgx_encl_get_backing(encl, page_index, backing);
+	return __sgx_encl_get_backing(encl, page_index, backing);
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index a65a952116fd..f94ff14c9486 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -107,8 +107,6 @@ bool current_is_ksgxd(void);
 void sgx_encl_release(struct kref *ref);
 int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
 const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
-int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
-			    struct sgx_backing *backing);
 int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
 			   struct sgx_backing *backing);
 void sgx_encl_put_backing(struct sgx_backing *backing);
-- 
cgit v1.2.3


From 039f0e054a29d06970892240d70143150d2aaec2 Mon Sep 17 00:00:00 2001
From: Li kunyu <kunyu@nfschina.com>
Date: Mon, 25 Jul 2022 12:23:58 +0800
Subject: x86/boot: Remove superfluous type casting from arch/x86/boot/bitops.h

'const void *' will auto-type-convert to just about any other const
pointer type, no need to force it.

  [ mingo: Rewrote the changelog. ]

Signed-off-by: Li kunyu <kunyu@nfschina.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220725042358.3377-1-kunyu@nfschina.com
---
 arch/x86/boot/bitops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
index 02e1dea11d94..8518ae214c9b 100644
--- a/arch/x86/boot/bitops.h
+++ b/arch/x86/boot/bitops.h
@@ -19,13 +19,13 @@
 
 static inline bool constant_test_bit(int nr, const void *addr)
 {
-	const u32 *p = (const u32 *)addr;
+	const u32 *p = addr;
 	return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
 }
 static inline bool variable_test_bit(int nr, const void *addr)
 {
 	bool v;
-	const u32 *p = (const u32 *)addr;
+	const u32 *p = addr;
 
 	asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr));
 	return v;
-- 
cgit v1.2.3


From 3163600cab67ffd276a3bb97891404c59c8595a4 Mon Sep 17 00:00:00 2001
From: Jason Wang <wangborong@cdjrlc.com>
Date: Fri, 15 Jul 2022 12:48:09 +0800
Subject: x86: Fix various duplicate-word comment typos

  [ mingo: Consolidated 4 very similar patches into one, it's silly to spread this out. ]

Signed-off-by: Jason Wang <wangborong@cdjrlc.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220715044809.20572-1-wangborong@cdjrlc.com
---
 arch/x86/kernel/amd_gart_64.c | 2 +-
 arch/x86/kernel/aperture_64.c | 2 +-
 arch/x86/kvm/vmx/nested.c     | 2 +-
 arch/x86/platform/efi/efi.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 194d54eed537..19a0207e529f 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -53,7 +53,7 @@ static u32 *iommu_gatt_base;		/* Remapping table */
  * of only flushing when an mapping is reused. With it true the GART is
  * flushed for every mapping. Problem is that doing the lazy flush seems
  * to trigger bugs with some popular PCI cards, in particular 3ware (but
- * has been also also seen with Qlogic at least).
+ * has been also seen with Qlogic at least).
  */
 static int iommu_fullflush = 1;
 
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 7a5630d904b2..4feaa670d578 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -36,7 +36,7 @@
 /*
  * Using 512M as goal, in case kexec will load kernel_big
  * that will do the on-position decompress, and could overlap with
- * with the gart aperture that is used.
+ * the gart aperture that is used.
  * Sequence:
  * kernel_small
  * ==> kexec (with kdump trigger path or gart still enabled)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ddd4367d4826..7eaf96064cb0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2570,7 +2570,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	 * bits which we consider mandatory enabled.
 	 * The CR0_READ_SHADOW is what L2 should have expected to read given
 	 * the specifications by L1; It's not enough to take
-	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
 	 * have more bits than L1 expected.
 	 */
 	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 6e598bd78eef..ebc98a68c400 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -169,7 +169,7 @@ static void __init do_add_efi_memmap(void)
 }
 
 /*
- * Given add_efi_memmap defaults to 0 and there there is no alternative
+ * Given add_efi_memmap defaults to 0 and there is no alternative
  * e820 mechanism for soft-reserved memory, import the full EFI memory
  * map if soft reservations are present and enabled. Otherwise, the
  * mechanism to disable the kernel's consideration of EFI_MEMORY_SP is
-- 
cgit v1.2.3


From 4630535c64725bcd722eb59fb8d92be8db943d3d Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Tue, 28 Jun 2022 18:16:12 +0200
Subject: x86/uaccess: Improve __try_cmpxchg64_user_asm() for x86_32

Improve __try_cmpxcgh64_user_asm() for !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
by relaxing the output register constraint from "c" to "q" constraint,
which allows the compiler to choose between %ecx or %ebx register.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20220628161612.7993-1-ubizjak@gmail.com
---
 arch/x86/include/asm/uaccess.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 913e593a3b45..b0583c1da14f 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -448,7 +448,7 @@ do {									\
 
 #ifdef CONFIG_X86_32
 /*
- * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
+ * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
  * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
  * hardcoded by CMPXCHG8B, leaving only ESI and EDI.  If the compiler uses
  * both ESI and EDI for the memory operand, compilation will fail if the error
@@ -461,11 +461,12 @@ do {									\
 	__typeof__(*(_ptr)) __new = (_new);				\
 	asm volatile("\n"						\
 		     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"		\
-		     "mov $0, %%ecx\n\t"				\
-		     "setz %%cl\n"					\
+		     "mov $0, %[result]\n\t"				\
+		     "setz %b[result]\n"				\
 		     "2:\n"						\
-		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
-		     : [result]"=c" (__result),				\
+		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG,	\
+					   %[result])			\
+		     : [result] "=q" (__result),			\
 		       "+A" (__old),					\
 		       [ptr] "+m" (*_ptr)				\
 		     : "b" ((u32)__new),				\
-- 
cgit v1.2.3


From 4847c0eb663ab431b56cd82c9c2627967f09f2ef Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Mon, 15 Aug 2022 22:44:01 -0400
Subject: lsm: clean up redundant NULL pointer check

The implements of {ip,tcp,udp,dccp,sctp,ipv6}_hdr(skb) guarantee that
they will never return NULL, and elsewhere users don't do the check
as well, so remove the check here.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
[PM: subject line tweaks]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/lsm_audit.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 78a278f28e49..75cc3f8d2a42 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -44,9 +44,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb,
 	struct iphdr *ih;
 
 	ih = ip_hdr(skb);
-	if (ih == NULL)
-		return -EINVAL;
-
 	ad->u.net->v4info.saddr = ih->saddr;
 	ad->u.net->v4info.daddr = ih->daddr;
 
@@ -59,8 +56,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb,
 	switch (ih->protocol) {
 	case IPPROTO_TCP: {
 		struct tcphdr *th = tcp_hdr(skb);
-		if (th == NULL)
-			break;
 
 		ad->u.net->sport = th->source;
 		ad->u.net->dport = th->dest;
@@ -68,8 +63,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb,
 	}
 	case IPPROTO_UDP: {
 		struct udphdr *uh = udp_hdr(skb);
-		if (uh == NULL)
-			break;
 
 		ad->u.net->sport = uh->source;
 		ad->u.net->dport = uh->dest;
@@ -77,8 +70,6 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb,
 	}
 	case IPPROTO_DCCP: {
 		struct dccp_hdr *dh = dccp_hdr(skb);
-		if (dh == NULL)
-			break;
 
 		ad->u.net->sport = dh->dccph_sport;
 		ad->u.net->dport = dh->dccph_dport;
@@ -86,8 +77,7 @@ int ipv4_skb_to_auditdata(struct sk_buff *skb,
 	}
 	case IPPROTO_SCTP: {
 		struct sctphdr *sh = sctp_hdr(skb);
-		if (sh == NULL)
-			break;
+
 		ad->u.net->sport = sh->source;
 		ad->u.net->dport = sh->dest;
 		break;
@@ -115,8 +105,6 @@ int ipv6_skb_to_auditdata(struct sk_buff *skb,
 	__be16 frag_off;
 
 	ip6 = ipv6_hdr(skb);
-	if (ip6 == NULL)
-		return -EINVAL;
 	ad->u.net->v6info.saddr = ip6->saddr;
 	ad->u.net->v6info.daddr = ip6->daddr;
 	/* IPv6 can have several extension header before the Transport header
-- 
cgit v1.2.3


From 0351dc57b95b8b56f2a467122c13b6b16e0dc53f Mon Sep 17 00:00:00 2001
From: Jilin Yuan <yuanjilin@cdjrlc.com>
Date: Sun, 14 Aug 2022 17:39:41 +0800
Subject: audit: fix repeated words in comments

Delete the redundant word 'doesn't'.

Signed-off-by: Jilin Yuan <yuanjilin@cdjrlc.com>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index dd8d9ab747c3..9226746dcf0a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1833,7 +1833,7 @@ void __audit_free(struct task_struct *tsk)
 
 	/* We are called either by do_exit() or the fork() error handling code;
 	 * in the former case tsk == current and in the latter tsk is a
-	 * random task_struct that doesn't doesn't have any meaningful data we
+	 * random task_struct that doesn't have any meaningful data we
 	 * need to log via audit_log_exit().
 	 */
 	if (tsk == current && !context->dummy) {
-- 
cgit v1.2.3


From 0612d928b7ff8c87043115f74f08cb4af1f0c4cf Mon Sep 17 00:00:00 2001
From: Adam Skladowski <a39.skl@gmail.com>
Date: Mon, 15 Aug 2022 12:09:42 +0200
Subject: cpufreq: Add SM6115 to cpufreq-dt-platdev blocklist

The Qualcomm SM6115 platform uses the
qcom-cpufreq-hw driver, so add it to the cpufreq-dt-platdev driver's
blocklist.

Signed-off-by: Adam Skladowski <a39.skl@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 2c96de3f2d83..6ac3800db450 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -146,6 +146,7 @@ static const struct of_device_id blocklist[] __initconst = {
 	{ .compatible = "qcom,sc8180x", },
 	{ .compatible = "qcom,sc8280xp", },
 	{ .compatible = "qcom,sdm845", },
+	{ .compatible = "qcom,sm6115", },
 	{ .compatible = "qcom,sm6350", },
 	{ .compatible = "qcom,sm8150", },
 	{ .compatible = "qcom,sm8250", },
-- 
cgit v1.2.3


From 93d7c52a6eb93e58e4569bd4de95ba3b19e3cf20 Mon Sep 17 00:00:00 2001
From: Gautam Menghani <gautammenghani201@gmail.com>
Date: Sat, 6 Aug 2022 22:05:30 +0530
Subject: selftests/net: Refactor xfrm_fill_key() to use array of structs

A TODO in net/ipsec.c asks to refactor the code in xfrm_fill_key() to
use set/map to avoid manually comparing each algorithm with the "name"
parameter passed to the function as an argument. This patch refactors
the code to create an array of structs where each struct contains the
algorithm name and its corresponding key length.

Signed-off-by: Gautam Menghani <gautammenghani201@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 tools/testing/selftests/net/ipsec.c | 104 ++++++++++++++++--------------------
 1 file changed, 45 insertions(+), 59 deletions(-)

diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
index cc10c10c5ed9..9a8229abfa02 100644
--- a/tools/testing/selftests/net/ipsec.c
+++ b/tools/testing/selftests/net/ipsec.c
@@ -58,6 +58,8 @@
 #define VETH_FMT	"ktst-%d"
 #define VETH_LEN	12
 
+#define XFRM_ALGO_NR_KEYS 29
+
 static int nsfd_parent	= -1;
 static int nsfd_childa	= -1;
 static int nsfd_childb	= -1;
@@ -75,6 +77,43 @@ const unsigned int ping_timeout		= 300;
 const unsigned int ping_count		= 100;
 const unsigned int ping_success		= 80;
 
+struct xfrm_key_entry {
+	char algo_name[35];
+	int key_len;
+};
+
+struct xfrm_key_entry xfrm_key_entries[] = {
+	{"digest_null", 0},
+	{"ecb(cipher_null)", 0},
+	{"cbc(des)", 64},
+	{"hmac(md5)", 128},
+	{"cmac(aes)", 128},
+	{"xcbc(aes)", 128},
+	{"cbc(cast5)", 128},
+	{"cbc(serpent)", 128},
+	{"hmac(sha1)", 160},
+	{"hmac(rmd160)", 160},
+	{"cbc(des3_ede)", 192},
+	{"hmac(sha256)", 256},
+	{"cbc(aes)", 256},
+	{"cbc(camellia)", 256},
+	{"cbc(twofish)", 256},
+	{"rfc3686(ctr(aes))", 288},
+	{"hmac(sha384)", 384},
+	{"cbc(blowfish)", 448},
+	{"hmac(sha512)", 512},
+	{"rfc4106(gcm(aes))-128", 160},
+	{"rfc4543(gcm(aes))-128", 160},
+	{"rfc4309(ccm(aes))-128", 152},
+	{"rfc4106(gcm(aes))-192", 224},
+	{"rfc4543(gcm(aes))-192", 224},
+	{"rfc4309(ccm(aes))-192", 216},
+	{"rfc4106(gcm(aes))-256", 288},
+	{"rfc4543(gcm(aes))-256", 288},
+	{"rfc4309(ccm(aes))-256", 280},
+	{"rfc7539(chacha20,poly1305)-128", 0}
+};
+
 static void randomize_buffer(void *buf, size_t buflen)
 {
 	int *p = (int *)buf;
@@ -767,65 +806,12 @@ static int do_ping(int cmd_fd, char *buf, size_t buf_len, struct in_addr from,
 static int xfrm_fill_key(char *name, char *buf,
 		size_t buf_len, unsigned int *key_len)
 {
-	/* TODO: use set/map instead */
-	if (strncmp(name, "digest_null", ALGO_LEN) == 0)
-		*key_len = 0;
-	else if (strncmp(name, "ecb(cipher_null)", ALGO_LEN) == 0)
-		*key_len = 0;
-	else if (strncmp(name, "cbc(des)", ALGO_LEN) == 0)
-		*key_len = 64;
-	else if (strncmp(name, "hmac(md5)", ALGO_LEN) == 0)
-		*key_len = 128;
-	else if (strncmp(name, "cmac(aes)", ALGO_LEN) == 0)
-		*key_len = 128;
-	else if (strncmp(name, "xcbc(aes)", ALGO_LEN) == 0)
-		*key_len = 128;
-	else if (strncmp(name, "cbc(cast5)", ALGO_LEN) == 0)
-		*key_len = 128;
-	else if (strncmp(name, "cbc(serpent)", ALGO_LEN) == 0)
-		*key_len = 128;
-	else if (strncmp(name, "hmac(sha1)", ALGO_LEN) == 0)
-		*key_len = 160;
-	else if (strncmp(name, "hmac(rmd160)", ALGO_LEN) == 0)
-		*key_len = 160;
-	else if (strncmp(name, "cbc(des3_ede)", ALGO_LEN) == 0)
-		*key_len = 192;
-	else if (strncmp(name, "hmac(sha256)", ALGO_LEN) == 0)
-		*key_len = 256;
-	else if (strncmp(name, "cbc(aes)", ALGO_LEN) == 0)
-		*key_len = 256;
-	else if (strncmp(name, "cbc(camellia)", ALGO_LEN) == 0)
-		*key_len = 256;
-	else if (strncmp(name, "cbc(twofish)", ALGO_LEN) == 0)
-		*key_len = 256;
-	else if (strncmp(name, "rfc3686(ctr(aes))", ALGO_LEN) == 0)
-		*key_len = 288;
-	else if (strncmp(name, "hmac(sha384)", ALGO_LEN) == 0)
-		*key_len = 384;
-	else if (strncmp(name, "cbc(blowfish)", ALGO_LEN) == 0)
-		*key_len = 448;
-	else if (strncmp(name, "hmac(sha512)", ALGO_LEN) == 0)
-		*key_len = 512;
-	else if (strncmp(name, "rfc4106(gcm(aes))-128", ALGO_LEN) == 0)
-		*key_len = 160;
-	else if (strncmp(name, "rfc4543(gcm(aes))-128", ALGO_LEN) == 0)
-		*key_len = 160;
-	else if (strncmp(name, "rfc4309(ccm(aes))-128", ALGO_LEN) == 0)
-		*key_len = 152;
-	else if (strncmp(name, "rfc4106(gcm(aes))-192", ALGO_LEN) == 0)
-		*key_len = 224;
-	else if (strncmp(name, "rfc4543(gcm(aes))-192", ALGO_LEN) == 0)
-		*key_len = 224;
-	else if (strncmp(name, "rfc4309(ccm(aes))-192", ALGO_LEN) == 0)
-		*key_len = 216;
-	else if (strncmp(name, "rfc4106(gcm(aes))-256", ALGO_LEN) == 0)
-		*key_len = 288;
-	else if (strncmp(name, "rfc4543(gcm(aes))-256", ALGO_LEN) == 0)
-		*key_len = 288;
-	else if (strncmp(name, "rfc4309(ccm(aes))-256", ALGO_LEN) == 0)
-		*key_len = 280;
-	else if (strncmp(name, "rfc7539(chacha20,poly1305)-128", ALGO_LEN) == 0)
-		*key_len = 0;
+	int i;
+
+	for (i = 0; i < XFRM_ALGO_NR_KEYS; i++) {
+		if (strncmp(name, xfrm_key_entries[i].algo_name, ALGO_LEN) == 0)
+			*key_len = xfrm_key_entries[i].key_len;
+	}
 
 	if (*key_len > buf_len) {
 		printk("Can't pack a key - too big for buffer");
-- 
cgit v1.2.3


From 7cd4c5c2101cb092db00f61f69d24380cf7a0ee8 Mon Sep 17 00:00:00 2001
From: Frederick Lawler <fred@cloudflare.com>
Date: Mon, 15 Aug 2022 11:20:25 -0500
Subject: security, lsm: Introduce security_create_user_ns()

User namespaces are an effective tool to allow programs to run with
permission without requiring the need for a program to run as root. User
namespaces may also be used as a sandboxing technique. However, attackers
sometimes leverage user namespaces as an initial attack vector to perform
some exploit. [1,2,3]

While it is not the unprivileged user namespace functionality, which
causes the kernel to be exploitable, users/administrators might want to
more granularly limit or at least monitor how various processes use this
functionality, while vulnerable kernel subsystems are being patched.

Preventing user namespace already creation comes in a few of forms in
order of granularity:

        1. /proc/sys/user/max_user_namespaces sysctl
        2. Distro specific patch(es)
        3. CONFIG_USER_NS

To block a task based on its attributes, the LSM hook cred_prepare is a
decent candidate for use because it provides more granular control, and
it is called before create_user_ns():

        cred = prepare_creds()
                security_prepare_creds()
                        call_int_hook(cred_prepare, ...
        if (cred)
                create_user_ns(cred)

Since security_prepare_creds() is meant for LSMs to copy and prepare
credentials, access control is an unintended use of the hook. [4]
Further, security_prepare_creds() will always return a ENOMEM if the
hook returns any non-zero error code.

This hook also does not handle the clone3 case which requires us to
access a user space pointer to know if we're in the CLONE_NEW_USER
call path which may be subject to a TOCTTOU attack.

Lastly, cred_prepare is called in many call paths, and a targeted hook
further limits the frequency of calls which is a beneficial outcome.
Therefore introduce a new function security_create_user_ns() with an
accompanying userns_create LSM hook.

With the new userns_create hook, users will have more control over the
observability and access control over user namespace creation. Users
should expect that normal operation of user namespaces will behave as
usual, and only be impacted when controls are implemented by users or
administrators.

This hook takes the prepared creds for LSM authors to write policy
against. On success, the new namespace is applied to credentials,
otherwise an error is returned.

Links:
1. https://nvd.nist.gov/vuln/detail/CVE-2022-0492
2. https://nvd.nist.gov/vuln/detail/CVE-2022-25636
3. https://nvd.nist.gov/vuln/detail/CVE-2022-34918
4. https://lore.kernel.org/all/1c4b1c0d-12f6-6e9e-a6a3-cdce7418110c@schaufler-ca.com/

Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Frederick Lawler <fred@cloudflare.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/lsm_hook_defs.h | 1 +
 include/linux/lsm_hooks.h     | 4 ++++
 include/linux/security.h      | 6 ++++++
 kernel/user_namespace.c       | 5 +++++
 security/security.c           | 5 +++++
 5 files changed, 21 insertions(+)

diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 806448173033..aa7272e83626 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -224,6 +224,7 @@ LSM_HOOK(int, -ENOSYS, task_prctl, int option, unsigned long arg2,
 	 unsigned long arg3, unsigned long arg4, unsigned long arg5)
 LSM_HOOK(void, LSM_RET_VOID, task_to_inode, struct task_struct *p,
 	 struct inode *inode)
+LSM_HOOK(int, 0, userns_create, const struct cred *cred)
 LSM_HOOK(int, 0, ipc_permission, struct kern_ipc_perm *ipcp, short flag)
 LSM_HOOK(void, LSM_RET_VOID, ipc_getsecid, struct kern_ipc_perm *ipcp,
 	 u32 *secid)
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index 84a0d7e02176..2e11a2a22ed1 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -806,6 +806,10 @@
  *	security attributes, e.g. for /proc/pid inodes.
  *	@p contains the task_struct for the task.
  *	@inode contains the inode structure for the inode.
+ * @userns_create:
+ *	Check permission prior to creating a new user namespace.
+ *	@cred points to prepared creds.
+ *	Return 0 if successful, otherwise < 0 error code.
  *
  * Security hooks for Netlink messaging.
  *
diff --git a/include/linux/security.h b/include/linux/security.h
index 1bc362cb413f..767802fe9bfa 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -437,6 +437,7 @@ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
 int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
 			unsigned long arg4, unsigned long arg5);
 void security_task_to_inode(struct task_struct *p, struct inode *inode);
+int security_create_user_ns(const struct cred *cred);
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag);
 void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid);
 int security_msg_msg_alloc(struct msg_msg *msg);
@@ -1194,6 +1195,11 @@ static inline int security_task_prctl(int option, unsigned long arg2,
 static inline void security_task_to_inode(struct task_struct *p, struct inode *inode)
 { }
 
+static inline int security_create_user_ns(const struct cred *cred)
+{
+	return 0;
+}
+
 static inline int security_ipc_permission(struct kern_ipc_perm *ipcp,
 					  short flag)
 {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 5481ba44a8d6..3f464bbda0e9 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
 #include <linux/highuid.h>
 #include <linux/cred.h>
 #include <linux/securebits.h>
+#include <linux/security.h>
 #include <linux/keyctl.h>
 #include <linux/key-type.h>
 #include <keys/user-type.h>
@@ -113,6 +114,10 @@ int create_user_ns(struct cred *new)
 	    !kgid_has_mapping(parent_ns, group))
 		goto fail_dec;
 
+	ret = security_create_user_ns(new);
+	if (ret < 0)
+		goto fail_dec;
+
 	ret = -ENOMEM;
 	ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
 	if (!ns)
diff --git a/security/security.c b/security/security.c
index 14d30fec8a00..1e60c4b570ec 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1909,6 +1909,11 @@ void security_task_to_inode(struct task_struct *p, struct inode *inode)
 	call_void_hook(task_to_inode, p, inode);
 }
 
+int security_create_user_ns(const struct cred *cred)
+{
+	return call_int_hook(userns_create, 0, cred);
+}
+
 int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
 {
 	return call_int_hook(ipc_permission, 0, ipcp, flag);
-- 
cgit v1.2.3


From 401e64b3a4af4c7a2f6a00337232a3cf0bb757ed Mon Sep 17 00:00:00 2001
From: Frederick Lawler <fred@cloudflare.com>
Date: Mon, 15 Aug 2022 11:20:26 -0500
Subject: bpf-lsm: Make bpf_lsm_userns_create() sleepable

Users may want to audit calls to security_create_user_ns() and access
user space memory. Also create_user_ns() runs without
pagefault_disabled(). Therefore, make bpf_lsm_userns_create() sleepable
for mandatory access control policies.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Frederick Lawler <fred@cloudflare.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/bpf/bpf_lsm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index fa71d58b7ded..761998fda762 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -335,6 +335,7 @@ BTF_ID(func, bpf_lsm_task_getsecid_obj)
 BTF_ID(func, bpf_lsm_task_prctl)
 BTF_ID(func, bpf_lsm_task_setscheduler)
 BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_ID(func, bpf_lsm_userns_create)
 BTF_SET_END(sleepable_lsm_hooks)
 
 bool bpf_lsm_is_sleepable_hook(u32 btf_id)
-- 
cgit v1.2.3


From d5810139cca39cf2854728b465f8bada4a445302 Mon Sep 17 00:00:00 2001
From: Frederick Lawler <fred@cloudflare.com>
Date: Mon, 15 Aug 2022 11:20:27 -0500
Subject: selftests/bpf: Add tests verifying bpf lsm userns_create hook

The LSM hook userns_create was introduced to provide LSM's an
opportunity to block or allow unprivileged user namespace creation. This
test serves two purposes: it provides a test eBPF implementation, and
tests the hook successfully blocks or allows user namespace creation.

This tests 3 cases:

        1. Unattached bpf program does not block unpriv user namespace
           creation.
        2. Attached bpf program allows user namespace creation given
           CAP_SYS_ADMIN privileges.
        3. Attached bpf program denies user namespace creation for a
           user without CAP_SYS_ADMIN.

Acked-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Frederick Lawler <fred@cloudflare.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 .../selftests/bpf/prog_tests/deny_namespace.c      | 102 +++++++++++++++++++++
 .../selftests/bpf/progs/test_deny_namespace.c      |  33 +++++++
 2 files changed, 135 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/deny_namespace.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_deny_namespace.c

diff --git a/tools/testing/selftests/bpf/prog_tests/deny_namespace.c b/tools/testing/selftests/bpf/prog_tests/deny_namespace.c
new file mode 100644
index 000000000000..1bc6241b755b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/deny_namespace.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include "test_deny_namespace.skel.h"
+#include <sched.h>
+#include "cap_helpers.h"
+#include <stdio.h>
+
+static int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+again:
+	ret = waitpid(pid, &status, 0);
+	if (ret == -1) {
+		if (errno == EINTR)
+			goto again;
+
+		return -1;
+	}
+
+	if (!WIFEXITED(status))
+		return -1;
+
+	return WEXITSTATUS(status);
+}
+
+/* negative return value -> some internal error
+ * positive return value -> userns creation failed
+ * 0                     -> userns creation succeeded
+ */
+static int create_user_ns(void)
+{
+	pid_t pid;
+
+	pid = fork();
+	if (pid < 0)
+		return -1;
+
+	if (pid == 0) {
+		if (unshare(CLONE_NEWUSER))
+			_exit(EXIT_FAILURE);
+		_exit(EXIT_SUCCESS);
+	}
+
+	return wait_for_pid(pid);
+}
+
+static void test_userns_create_bpf(void)
+{
+	__u32 cap_mask = 1ULL << CAP_SYS_ADMIN;
+	__u64 old_caps = 0;
+
+	cap_enable_effective(cap_mask, &old_caps);
+
+	ASSERT_OK(create_user_ns(), "priv new user ns");
+
+	cap_disable_effective(cap_mask, &old_caps);
+
+	ASSERT_EQ(create_user_ns(), EPERM, "unpriv new user ns");
+
+	if (cap_mask & old_caps)
+		cap_enable_effective(cap_mask, NULL);
+}
+
+static void test_unpriv_userns_create_no_bpf(void)
+{
+	__u32 cap_mask = 1ULL << CAP_SYS_ADMIN;
+	__u64 old_caps = 0;
+
+	cap_disable_effective(cap_mask, &old_caps);
+
+	ASSERT_OK(create_user_ns(), "no-bpf unpriv new user ns");
+
+	if (cap_mask & old_caps)
+		cap_enable_effective(cap_mask, NULL);
+}
+
+void test_deny_namespace(void)
+{
+	struct test_deny_namespace *skel = NULL;
+	int err;
+
+	if (test__start_subtest("unpriv_userns_create_no_bpf"))
+		test_unpriv_userns_create_no_bpf();
+
+	skel = test_deny_namespace__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel load"))
+		goto close_prog;
+
+	err = test_deny_namespace__attach(skel);
+	if (!ASSERT_OK(err, "attach"))
+		goto close_prog;
+
+	if (test__start_subtest("userns_create_bpf"))
+		test_userns_create_bpf();
+
+	test_deny_namespace__detach(skel);
+
+close_prog:
+	test_deny_namespace__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_deny_namespace.c b/tools/testing/selftests/bpf/progs/test_deny_namespace.c
new file mode 100644
index 000000000000..09ad5a4ebd1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_deny_namespace.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <errno.h>
+#include <linux/capability.h>
+
+struct kernel_cap_struct {
+	__u32 cap[_LINUX_CAPABILITY_U32S_3];
+} __attribute__((preserve_access_index));
+
+struct cred {
+	struct kernel_cap_struct cap_effective;
+} __attribute__((preserve_access_index));
+
+char _license[] SEC("license") = "GPL";
+
+SEC("lsm.s/userns_create")
+int BPF_PROG(test_userns_create, const struct cred *cred, int ret)
+{
+	struct kernel_cap_struct caps = cred->cap_effective;
+	int cap_index = CAP_TO_INDEX(CAP_SYS_ADMIN);
+	__u32 cap_mask = CAP_TO_MASK(CAP_SYS_ADMIN);
+
+	if (ret)
+		return 0;
+
+	ret = -EPERM;
+	if (caps.cap[cap_index] & cap_mask)
+		return 0;
+
+	return -EPERM;
+}
-- 
cgit v1.2.3


From ed5d44d42c95e8a13bb54e614d2269c8740667f9 Mon Sep 17 00:00:00 2001
From: Frederick Lawler <fred@cloudflare.com>
Date: Mon, 15 Aug 2022 11:20:28 -0500
Subject: selinux: Implement userns_create hook

Unprivileged user namespace creation is an intended feature to enable
sandboxing, however this feature is often used to as an initial step to
perform a privilege escalation attack.

This patch implements a new user_namespace { create } access control
permission to restrict which domains allow or deny user namespace
creation. This is necessary for system administrators to quickly protect
their systems while waiting for vulnerability patches to be applied.

This permission can be used in the following way:

        allow domA_t domA_t : user_namespace { create };

Signed-off-by: Frederick Lawler <fred@cloudflare.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/hooks.c            | 9 +++++++++
 security/selinux/include/classmap.h | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 79573504783b..b9f1078450b3 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -4221,6 +4221,14 @@ static void selinux_task_to_inode(struct task_struct *p,
 	spin_unlock(&isec->lock);
 }
 
+static int selinux_userns_create(const struct cred *cred)
+{
+	u32 sid = current_sid();
+
+	return avc_has_perm(&selinux_state, sid, sid, SECCLASS_USER_NAMESPACE,
+						USER_NAMESPACE__CREATE, NULL);
+}
+
 /* Returns error only if unable to parse addresses */
 static int selinux_parse_skb_ipv4(struct sk_buff *skb,
 			struct common_audit_data *ad, u8 *proto)
@@ -7111,6 +7119,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
 	LSM_HOOK_INIT(task_kill, selinux_task_kill),
 	LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),
+	LSM_HOOK_INIT(userns_create, selinux_userns_create),
 
 	LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
 	LSM_HOOK_INIT(ipc_getsecid, selinux_ipc_getsecid),
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index ff757ae5f253..0bff55bb9cde 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -254,6 +254,8 @@ const struct security_class_mapping secclass_map[] = {
 	  { COMMON_FILE_PERMS, NULL } },
 	{ "io_uring",
 	  { "override_creds", "sqpoll", NULL } },
+	{ "user_namespace",
+	  { "create", NULL } },
 	{ NULL }
   };
 
-- 
cgit v1.2.3


From 3fd6d6e2b4e80fe45bfd1c8f01dff7d30a0f9b53 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:17 +0200
Subject: thermal/of: Rework the thermal device tree initialization

The following changes are reworking entirely the thermal device tree
initialization. The old version is kept until the different drivers
using it are converted to the new API.

The old approach creates the different actors independently. This
approach is the source of the code duplication in the thermal OF
because a thermal zone is created but a sensor is registered
after. The thermal zones are created unconditionnaly with a fake
sensor at init time, thus forcing to provide fake ops and store all
the thermal zone related information in duplicated structures. Then
the sensor is initialized and the code looks up the thermal zone name
using the device tree. Then the sensor is associated to the thermal
zone, and the sensor specific ops are called with a second level of
indirection from the thermal zone ops.

When a sensor is removed (with a module unload), the thermal zone
stays there with the fake sensor.

The cooling device associated with a thermal zone and a trip point is
stored in a list, again duplicating information, using the node name
of the device tree to match afterwards the cooling devices.

The new approach is simpler, it creates a thermal zone when the sensor
is registered and destroys it when the sensor is removed. All the
matching between the cooling device, trip points and thermal zones are
done using the device tree, as well as bindings. The ops are no longer
specific but uses the generic ones provided by the thermal framework.

When the old code won't have any users, it can be removed and the
remaining thermal OF code will be much simpler.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-2-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_of.c | 460 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/thermal.h      |  18 ++
 2 files changed, 468 insertions(+), 10 deletions(-)

diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 802c30b72a92..82236fec7c65 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -811,16 +811,6 @@ static int thermal_of_get_trip_type(struct device_node *np,
 	return -ENODEV;
 }
 
-/**
- * thermal_of_populate_trip - parse and fill one trip point data
- * @np: DT node containing a trip point node
- * @trip: trip point data structure to be filled up
- *
- * This function parses a trip point type of node represented by
- * @np parameter and fills the read data into @trip data structure.
- *
- * Return: 0 on success, proper error code otherwise
- */
 static int thermal_of_populate_trip(struct device_node *np,
 				    struct thermal_trip *trip)
 {
@@ -1065,6 +1055,456 @@ static __init void of_thermal_destroy_zones(void)
 	of_node_put(np);
 }
 
+static struct device_node *of_thermal_zone_find(struct device_node *sensor, int id)
+{
+	struct device_node *np, *tz;
+	struct of_phandle_args sensor_specs;
+
+	np = of_find_node_by_name(NULL, "thermal-zones");
+	if (!np) {
+		pr_err("Unable to find thermal zones description\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * Search for each thermal zone, a defined sensor
+	 * corresponding to the one passed as parameter
+	 */
+	for_each_available_child_of_node(np, tz) {
+
+		int count, i;
+
+		count = of_count_phandle_with_args(tz, "thermal-sensors",
+						   "#thermal-sensor-cells");
+		if (count <= 0) {
+			pr_err("%pOFn: missing thermal sensor\n", tz);
+			tz = ERR_PTR(-EINVAL);
+			goto out;
+		}
+
+		for (i = 0; i < count; i++) {
+
+			int ret;
+
+			ret = of_parse_phandle_with_args(tz, "thermal-sensors",
+							 "#thermal-sensor-cells",
+							 i, &sensor_specs);
+			if (ret < 0) {
+				pr_err("%pOFn: Failed to read thermal-sensors cells: %d\n", tz, ret);
+				tz = ERR_PTR(ret);
+				goto out;
+			}
+
+			if ((sensor == sensor_specs.np) && id == (sensor_specs.args_count ?
+								  sensor_specs.args[0] : 0)) {
+				pr_debug("sensor %pOFn id=%d belongs to %pOFn\n", sensor, id, tz);
+				goto out;
+			}
+		}
+	}
+out:
+	of_node_put(np);
+	return tz;
+}
+
+static int thermal_of_monitor_init(struct device_node *np, int *delay, int *pdelay)
+{
+	int ret;
+
+	ret = of_property_read_u32(np, "polling-delay-passive", pdelay);
+	if (ret < 0) {
+		pr_err("%pOFn: missing polling-delay-passive property\n", np);
+		return ret;
+	}
+
+	ret = of_property_read_u32(np, "polling-delay", delay);
+	if (ret < 0) {
+		pr_err("%pOFn: missing polling-delay property\n", np);
+		return ret;
+	}
+
+	return 0;
+}
+
+static struct thermal_zone_params *thermal_of_parameters_init(struct device_node *np)
+{
+	struct thermal_zone_params *tzp;
+	int coef[2];
+	int ncoef = ARRAY_SIZE(coef);
+	int prop, ret;
+
+	tzp = kzalloc(sizeof(*tzp), GFP_KERNEL);
+	if (!tzp)
+		return ERR_PTR(-ENOMEM);
+
+	tzp->no_hwmon = true;
+
+	if (!of_property_read_u32(np, "sustainable-power", &prop))
+		tzp->sustainable_power = prop;
+
+	/*
+	 * For now, the thermal framework supports only one sensor per
+	 * thermal zone. Thus, we are considering only the first two
+	 * values as slope and offset.
+	 */
+	ret = of_property_read_u32_array(np, "coefficients", coef, ncoef);
+	if (ret) {
+		coef[0] = 1;
+		coef[1] = 0;
+	}
+
+	tzp->slope = coef[0];
+	tzp->offset = coef[1];
+
+	return tzp;
+}
+
+static struct device_node *thermal_of_zone_get_by_name(struct thermal_zone_device *tz)
+{
+	struct device_node *np, *tz_np;
+
+	np = of_find_node_by_name(NULL, "thermal-zones");
+	if (!np)
+		return ERR_PTR(-ENODEV);
+
+	tz_np = of_get_child_by_name(np, tz->type);
+
+	of_node_put(np);
+
+	if (!tz_np)
+		return ERR_PTR(-ENODEV);
+
+	return tz_np;
+}
+
+static int __thermal_of_unbind(struct device_node *map_np, int index, int trip_id,
+			       struct thermal_zone_device *tz, struct thermal_cooling_device *cdev)
+{
+	struct of_phandle_args cooling_spec;
+	int ret;
+
+	ret = of_parse_phandle_with_args(map_np, "cooling-device", "#cooling-cells",
+					 index, &cooling_spec);
+
+	of_node_put(cooling_spec.np);
+
+	if (ret < 0) {
+		pr_err("Invalid cooling-device entry\n");
+		return ret;
+	}
+
+	if (cooling_spec.args_count < 2) {
+		pr_err("wrong reference to cooling device, missing limits\n");
+		return -EINVAL;
+	}
+
+	if (cooling_spec.np != cdev->np)
+		return 0;
+
+	ret = thermal_zone_unbind_cooling_device(tz, trip_id, cdev);
+	if (ret)
+		pr_err("Failed to unbind '%s' with '%s': %d\n", tz->type, cdev->type, ret);
+
+	return ret;
+}
+
+static int __thermal_of_bind(struct device_node *map_np, int index, int trip_id,
+			     struct thermal_zone_device *tz, struct thermal_cooling_device *cdev)
+{
+	struct of_phandle_args cooling_spec;
+	int ret, weight = THERMAL_WEIGHT_DEFAULT;
+
+	of_property_read_u32(map_np, "contribution", &weight);
+
+	ret = of_parse_phandle_with_args(map_np, "cooling-device", "#cooling-cells",
+					 index, &cooling_spec);
+
+	of_node_put(cooling_spec.np);
+
+	if (ret < 0) {
+		pr_err("Invalid cooling-device entry\n");
+		return ret;
+	}
+
+	if (cooling_spec.args_count < 2) {
+		pr_err("wrong reference to cooling device, missing limits\n");
+		return -EINVAL;
+	}
+
+	if (cooling_spec.np != cdev->np)
+		return 0;
+
+	ret = thermal_zone_bind_cooling_device(tz, trip_id, cdev, cooling_spec.args[1],
+					       cooling_spec.args[0],
+					       weight);
+	if (ret)
+		pr_err("Failed to bind '%s' with '%s': %d\n", tz->type, cdev->type, ret);
+
+	return ret;
+}
+
+static int thermal_of_for_each_cooling_device(struct device_node *tz_np, struct device_node *map_np,
+					      struct thermal_zone_device *tz, struct thermal_cooling_device *cdev,
+					      int (*action)(struct device_node *, int, int,
+							    struct thermal_zone_device *, struct thermal_cooling_device *))
+{
+	struct device_node *tr_np;
+	int count, i, trip_id;
+
+	tr_np = of_parse_phandle(map_np, "trip", 0);
+	if (!tr_np)
+		return -ENODEV;
+
+	trip_id = of_find_trip_id(tz_np, tr_np);
+	if (trip_id < 0)
+		return trip_id;
+
+	count = of_count_phandle_with_args(map_np, "cooling-device", "#cooling-cells");
+	if (count <= 0) {
+		pr_err("Add a cooling_device property with at least one device\n");
+		return -ENOENT;
+	}
+
+	/*
+	 * At this point, we don't want to bail out when there is an
+	 * error, we will try to bind/unbind as many as possible
+	 * cooling devices
+	 */
+	for (i = 0; i < count; i++)
+		action(map_np, i, trip_id, tz, cdev);
+
+	return 0;
+}
+
+static int thermal_of_for_each_cooling_maps(struct thermal_zone_device *tz,
+					    struct thermal_cooling_device *cdev,
+					    int (*action)(struct device_node *, int, int,
+							  struct thermal_zone_device *, struct thermal_cooling_device *))
+{
+	struct device_node *tz_np, *cm_np, *child;
+	int ret = 0;
+
+	tz_np = thermal_of_zone_get_by_name(tz);
+	if (IS_ERR(tz_np)) {
+		pr_err("Failed to get node tz by name\n");
+		return PTR_ERR(tz_np);
+	}
+
+	cm_np = of_get_child_by_name(tz_np, "cooling-maps");
+	if (!cm_np)
+		goto out;
+
+	for_each_child_of_node(cm_np, child) {
+		ret = thermal_of_for_each_cooling_device(tz_np, child, tz, cdev, action);
+		if (ret)
+			break;
+	}
+
+	of_node_put(cm_np);
+out:
+	of_node_put(tz_np);
+
+	return ret;
+}
+
+static int thermal_of_bind(struct thermal_zone_device *tz,
+			   struct thermal_cooling_device *cdev)
+{
+	return thermal_of_for_each_cooling_maps(tz, cdev, __thermal_of_bind);
+}
+
+static int thermal_of_unbind(struct thermal_zone_device *tz,
+			     struct thermal_cooling_device *cdev)
+{
+	return thermal_of_for_each_cooling_maps(tz, cdev, __thermal_of_unbind);
+}
+
+/**
+ * thermal_of_zone_unregister - Cleanup the specific allocated ressources
+ *
+ * This function disables the thermal zone and frees the different
+ * ressources allocated specific to the thermal OF.
+ *
+ * @tz: a pointer to the thermal zone structure
+ */
+void thermal_of_zone_unregister(struct thermal_zone_device *tz)
+{
+	thermal_zone_device_disable(tz);
+	thermal_zone_device_unregister(tz);
+	kfree(tz->trips);
+	kfree(tz->tzp);
+	kfree(tz->ops);
+}
+EXPORT_SYMBOL_GPL(thermal_of_zone_unregister);
+
+/**
+ * thermal_of_zone_register - Register a thermal zone with device node
+ * sensor
+ *
+ * The thermal_of_zone_register() parses a device tree given a device
+ * node sensor and identifier. It searches for the thermal zone
+ * associated to the couple sensor/id and retrieves all the thermal
+ * zone properties and registers new thermal zone with those
+ * properties.
+ *
+ * @sensor: A device node pointer corresponding to the sensor in the device tree
+ * @id: An integer as sensor identifier
+ * @data: A private data to be stored in the thermal zone dedicated private area
+ * @ops: A set of thermal sensor ops
+ *
+ * Return: a valid thermal zone structure pointer on success.
+ * 	- EINVAL: if the device tree thermal description is malformed
+ *	- ENOMEM: if one structure can not be allocated
+ *	- Other negative errors are returned by the underlying called functions
+ */
+struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
+						     const struct thermal_zone_device_ops *ops)
+{
+	struct thermal_zone_device *tz;
+	struct thermal_trip *trips;
+	struct thermal_zone_params *tzp;
+	struct thermal_zone_device_ops *of_ops;
+	struct device_node *np;
+	int delay, pdelay;
+	int ntrips, mask;
+	int ret;
+
+	of_ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL);
+	if (!of_ops)
+		return ERR_PTR(-ENOMEM);
+
+	np = of_thermal_zone_find(sensor, id);
+	if (IS_ERR(np)) {
+		pr_err("Failed to find thermal zone for %pOFn id=%d\n", sensor, id);
+		return ERR_CAST(np);
+	}
+
+	trips = thermal_of_trips_init(np, &ntrips);
+	if (IS_ERR(trips)) {
+		pr_err("Failed to find trip points for %pOFn id=%d\n", sensor, id);
+		return ERR_CAST(trips);
+	}
+
+	ret = thermal_of_monitor_init(np, &delay, &pdelay);
+	if (ret) {
+		pr_err("Failed to initialize monitoring delays from %pOFn\n", np);
+		goto out_kfree_trips;
+	}
+
+	tzp = thermal_of_parameters_init(np);
+	if (IS_ERR(tzp)) {
+		ret = PTR_ERR(tzp);
+		pr_err("Failed to initialize parameter from %pOFn: %d\n", np, ret);
+		goto out_kfree_trips;
+	}
+
+	of_ops->get_trip_type = of_ops->get_trip_type ? : of_thermal_get_trip_type;
+	of_ops->get_trip_temp = of_ops->get_trip_temp ? : of_thermal_get_trip_temp;
+	of_ops->get_trip_hyst = of_ops->get_trip_hyst ? : of_thermal_get_trip_hyst;
+	of_ops->set_trip_hyst = of_ops->set_trip_hyst ? : of_thermal_set_trip_hyst;
+	of_ops->get_crit_temp = of_ops->get_crit_temp ? : of_thermal_get_crit_temp;
+	of_ops->bind = thermal_of_bind;
+	of_ops->unbind = thermal_of_unbind;
+
+	mask = GENMASK_ULL((ntrips) - 1, 0);
+
+	tz = thermal_zone_device_register_with_trips(np->name, trips, ntrips,
+						     mask, data, of_ops, tzp,
+						     pdelay, delay);
+	if (IS_ERR(tz)) {
+		ret = PTR_ERR(tz);
+		pr_err("Failed to register thermal zone %pOFn: %d\n", np, ret);
+		goto out_kfree_tzp;
+	}
+
+	ret = thermal_zone_device_enable(tz);
+	if (ret) {
+		pr_err("Failed to enabled thermal zone '%s', id=%d: %d\n",
+		       tz->type, tz->id, ret);
+		thermal_of_zone_unregister(tz);
+		return ERR_PTR(ret);
+	}
+
+	return tz;
+
+out_kfree_tzp:
+	kfree(tzp);
+out_kfree_trips:
+	kfree(trips);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(thermal_of_zone_register);
+
+static void devm_thermal_of_zone_release(struct device *dev, void *res)
+{
+	thermal_of_zone_unregister(*(struct thermal_zone_device **)res);
+}
+
+static int devm_thermal_of_zone_match(struct device *dev, void *res,
+				      void *data)
+{
+	struct thermal_zone_device **r = res;
+
+	if (WARN_ON(!r || !*r))
+		return 0;
+
+	return *r == data;
+}
+
+/**
+ * devm_thermal_of_zone_register - register a thermal tied with the sensor life cycle
+ *
+ * This function is the device version of the thermal_of_zone_register() function.
+ *
+ * @dev: a device structure pointer to sensor to be tied with the thermal zone OF life cycle
+ * @sensor_id: the sensor identifier
+ * @data: a pointer to a private data to be stored in the thermal zone 'devdata' field
+ * @ops: a pointer to the ops structure associated with the sensor
+ */
+struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int sensor_id, void *data,
+							  const struct thermal_zone_device_ops *ops)
+{
+	struct thermal_zone_device **ptr, *tzd;
+
+	ptr = devres_alloc(devm_thermal_of_zone_release, sizeof(*ptr),
+			   GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	tzd = thermal_of_zone_register(dev->of_node, sensor_id, data, ops);
+	if (IS_ERR(tzd)) {
+		devres_free(ptr);
+		return tzd;
+	}
+
+	*ptr = tzd;
+	devres_add(dev, ptr);
+
+	return tzd;
+}
+EXPORT_SYMBOL_GPL(devm_thermal_of_zone_register);
+
+/**
+ * devm_thermal_of_zone_unregister - Resource managed version of
+ *				thermal_of_zone_unregister().
+ * @dev: Device for which which resource was allocated.
+ * @tz: a pointer to struct thermal_zone where the sensor is registered.
+ *
+ * This function removes the sensor callbacks and private data from the
+ * thermal zone device registered with devm_thermal_zone_of_sensor_register()
+ * API. It will also silent the zone by remove the .get_temp() and .get_trend()
+ * thermal zone device callbacks.
+ * Normally this function will not need to be called and the resource
+ * management code will ensure that the resource is freed.
+ */
+void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz)
+{
+	WARN_ON(devres_release(dev, devm_thermal_zone_of_sensor_release,
+			       devm_thermal_of_zone_match, tz));
+}
+EXPORT_SYMBOL_GPL(devm_thermal_of_zone_unregister);
+
 /**
  * of_parse_thermal_zones - parse device tree thermal data
  *
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 1386c713885d..e2ac9d473bd6 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -325,6 +325,16 @@ struct thermal_zone_of_device_ops {
 
 /* Function declarations */
 #ifdef CONFIG_THERMAL_OF
+struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
+						     const struct thermal_zone_device_ops *ops);
+
+struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data,
+							  const struct thermal_zone_device_ops *ops);
+
+void thermal_of_zone_unregister(struct thermal_zone_device *tz);
+
+void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz);
+
 int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
 				  struct device_node *sensor_np,
 				  u32 *id);
@@ -366,6 +376,14 @@ static inline struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
 	return ERR_PTR(-ENODEV);
 }
 
+static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz)
+{
+}
+
+static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz)
+{
+}
+
 static inline
 void devm_thermal_zone_of_sensor_unregister(struct device *dev,
 					    struct thermal_zone_device *tz)
-- 
cgit v1.2.3


From 45b8850b3d3071d5ea9e19ad4a29ad5f0b5d1ec1 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 9 Aug 2022 10:56:26 +0200
Subject: thermal/of: Fix error code in of_thermal_zone_find()

Currently, if we cannot find the correct thermal zone then this error
path returns NULL and it would lead to an Oops in the caller.  Return
ERR_PTR(-EINVAL) instead.

Fixes: 3bd52ac87347 ("thermal/of: Rework the thermal device tree initialization")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/YvDzovkMCQecPDjz@kili
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_of.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 82236fec7c65..15b342fa81d6 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -1102,6 +1102,7 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int
 			}
 		}
 	}
+	tz = ERR_PTR(-EINVAL);
 out:
 	of_node_put(np);
 	return tz;
-- 
cgit v1.2.3


From 9d6792df07367aab42009d2b24c62c11a5968ee3 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 9 Aug 2022 10:56:27 +0200
Subject: thermal/of: Return -ENODEV instead of -EINVAL if registration fails

The previous version of the OF code was returning -ENODEV if no
thermal zones description was found or if the lookup of the sensor in
the thermal zones was not found.

The backend drivers are expecting this return value as an information
about skipping the sensor initialization and considered as normal.

Fix the return value by replacing -EINVAL by -ENODEV and remove the
error message as this missing is not considered as an error.

Fixes: 3bd52ac87347 ("thermal/of: Rework the thermal device tree initialization")
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Tested-by: Michael Walle <michael@walle.cc>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20220809085629.509116-2-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_of.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 15b342fa81d6..072e05477855 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -1062,8 +1062,8 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int
 
 	np = of_find_node_by_name(NULL, "thermal-zones");
 	if (!np) {
-		pr_err("Unable to find thermal zones description\n");
-		return ERR_PTR(-EINVAL);
+		pr_debug("No thermal zones description\n");
+		return ERR_PTR(-ENODEV);
 	}
 
 	/*
@@ -1102,7 +1102,7 @@ static struct device_node *of_thermal_zone_find(struct device_node *sensor, int
 			}
 		}
 	}
-	tz = ERR_PTR(-EINVAL);
+	tz = ERR_PTR(-ENODEV);
 out:
 	of_node_put(np);
 	return tz;
@@ -1376,7 +1376,8 @@ struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor,
 
 	np = of_thermal_zone_find(sensor, id);
 	if (IS_ERR(np)) {
-		pr_err("Failed to find thermal zone for %pOFn id=%d\n", sensor, id);
+		if (PTR_ERR(np) != -ENODEV)
+			pr_err("Failed to find thermal zone for %pOFn id=%d\n", sensor, id);
 		return ERR_CAST(np);
 	}
 
-- 
cgit v1.2.3


From 8fb5b71ed37dbe469eaa930e2ddc93ec9e305f3c Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Tue, 9 Aug 2022 10:56:29 +0200
Subject: thermal/of: Fix free after use in thermal_of_unregister()

The thermal zone is freed after being unregistered. The release method
devm_thermal_zone_device_register() calls
  -> thermal_of_zone_device_unregister()

This one calls thermal_zone_device_unregister() which frees the
thermal zone. However, thermal_of_zone_device_unregister() does access
this freed pointer to free different resources allocated by the
thermal_of framework which is invalid.

It results in a kernel panic:

[    1.915140] thermal_sys: Failed to find thermal zone for tmu id=2
[    1.921279] qoriq_thermal 1f80000.tmu: Failed to register sensors
[    1.927395] qoriq_thermal: probe of 1f80000.tmu failed with error -22
[    1.934189] Unable to handle kernel paging request at virtual address 01adadadadadad88
[    1.942146] Mem abort info:
[    1.944948]   ESR = 0x0000000096000004
[    1.948708]   EC = 0x25: DABT (current EL), IL = 32 bits
[    1.954042]   SET = 0, FnV = 0
[    1.957107]   EA = 0, S1PTW = 0
[    1.960253]   FSC = 0x04: level 0 translation fault
[    1.965147] Data abort info:
[    1.968030]   ISV = 0, ISS = 0x00000004
[    1.971878]   CM = 0, WnR = 0
[    1.974852] [01adadadadadad88] address between user and kernel address ranges
[    1.982016] Internal error: Oops: 96000004 [#1] SMP
[    1.986907] Modules linked in:
[    1.989969] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.19.0-next-20220808-00080-g1c46f44502e0 #1697
[    1.999135] Hardware name: Kontron KBox A-230-LS (DT)
[    2.004199] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    2.011185] pc : kfree+0x5c/0x3c0
[    2.014516] lr : devm_thermal_of_zone_release+0x38/0x60
[    2.019761] sp : ffff80000a22bad0
[    2.023081] x29: ffff80000a22bad0 x28: 0000000000000000 x27: ffff800009960464
[    2.030245] x26: ffff800009a16960 x25: 0000000000000006 x24: ffff800009f09a40
[    2.037407] x23: ffff800009ab9008 x22: ffff800008d0eea8 x21: 01adadadadadad80
[    2.044569] x20: 6b6b6b6b6b6b6b6b x19: ffff00200232b800 x18: 00000000fffffffb
[    2.051731] x17: ffff800008d0eea0 x16: ffff800008d07d44 x15: ffff800008d0d154
[    2.056647] usb 1-1: new high-speed USB device number 2 using xhci-hcd
[    2.058893] x14: ffff800008d0cddc x13: ffff8000088d1c2c x12: ffff8000088d5034
[    2.072597] x11: ffff8000088d46d4 x10: 0000000000000000 x9 : ffff800008d0eea8
[    2.079759] x8 : ffff002000b1a158 x7 : bbbbbbbbbbbbbbbb x6 : ffff80000a0f53b8
[    2.086921] x5 : ffff80000a22b960 x4 : 0000000000000000 x3 : 0000000000000000
[    2.094082] x2 : fffffc0000000000 x1 : ffff002000838040 x0 : 01adb1adadadad80
[    2.101244] Call trace:
[    2.103692]  kfree+0x5c/0x3c0
[    2.106666]  devm_thermal_of_zone_release+0x38/0x60
[    2.111561]  release_nodes+0x64/0xd0
[    2.115146]  devres_release_all+0xbc/0x350
[    2.119253]  device_unbind_cleanup+0x20/0x70
[    2.123536]  really_probe+0x1a0/0x2e4
[    2.127208]  __driver_probe_device+0x80/0xec
[    2.131490]  driver_probe_device+0x44/0x130
[    2.135685]  __driver_attach+0x104/0x1b4
[    2.139619]  bus_for_each_dev+0x7c/0xe0
[    2.143465]  driver_attach+0x30/0x40
[    2.147048]  bus_add_driver+0x160/0x210
[    2.150894]  driver_register+0x84/0x140
[    2.154741]  __platform_driver_register+0x34/0x40
[    2.159461]  qoriq_tmu_init+0x28/0x34
[    2.163133]  do_one_initcall+0x50/0x250
[    2.166979]  kernel_init_freeable+0x278/0x31c
[    2.171349]  kernel_init+0x30/0x140
[    2.174847]  ret_from_fork+0x10/0x20
[    2.178433] Code: b25657e2 d34cfc00 d37ae400 8b020015 (f94006a1)
[    2.184546] ---[ end trace 0000000000000000 ]---

Store the allocated resource pointers before the thermal zone is free
and use them to release the resource after unregistering the thermal
zone.

Fixes: 3bd52ac87347 ("thermal/of: Rework the thermal device tree initialization")
Reported-by: Michael Walle <michael@walle.cc>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Tested-by: Michael Walle <michael@walle.cc>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20220809085629.509116-4-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_of.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 072e05477855..c5cbe254a4f1 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -1330,11 +1330,15 @@ static int thermal_of_unbind(struct thermal_zone_device *tz,
  */
 void thermal_of_zone_unregister(struct thermal_zone_device *tz)
 {
+	struct thermal_trip *trips = tz->trips;
+	struct thermal_zone_params *tzp = tz->tzp;
+	struct thermal_zone_device_ops *ops = tz->ops;
+
 	thermal_zone_device_disable(tz);
 	thermal_zone_device_unregister(tz);
-	kfree(tz->trips);
-	kfree(tz->tzp);
-	kfree(tz->ops);
+	kfree(trips);
+	kfree(tzp);
+	kfree(ops);
 }
 EXPORT_SYMBOL_GPL(thermal_of_zone_unregister);
 
-- 
cgit v1.2.3


From 48ad3b104b9ec85de58c2b4e38fdad9a26446f99 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:18 +0200
Subject: thermal/of: Make new code and old code co-exist

This transient change allows to use old and new OF together until all
the drivers are converted to use the new OF API.

This will go away when the old OF code will be removed.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-3-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.c |  6 ------
 drivers/thermal/thermal_of.c   | 13 +++++++++++--
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 50d50cec7774..69447aba7e65 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1492,10 +1492,6 @@ static int __init thermal_init(void)
 	if (result)
 		goto unregister_governors;
 
-	result = of_parse_thermal_zones();
-	if (result)
-		goto unregister_class;
-
 	result = register_pm_notifier(&thermal_pm_nb);
 	if (result)
 		pr_warn("Thermal: Can not register suspend notifier, return %d\n",
@@ -1503,8 +1499,6 @@ static int __init thermal_init(void)
 
 	return 0;
 
-unregister_class:
-	class_unregister(&thermal_class);
 unregister_governors:
 	thermal_unregister_governors();
 error:
diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index c5cbe254a4f1..a17087c9295d 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -485,6 +485,15 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
 {
 	struct device_node *np, *child, *sensor_np;
 	struct thermal_zone_device *tzd = ERR_PTR(-ENODEV);
+	static int old_tz_initialized;
+	int ret;
+
+	if (!old_tz_initialized) {
+		ret = of_parse_thermal_zones();
+		if (ret)
+			return ERR_PTR(ret);
+		old_tz_initialized = 1;
+	}
 
 	np = of_find_node_by_name(NULL, "thermal-zones");
 	if (!np)
@@ -1004,7 +1013,7 @@ free_tz:
 	return ERR_PTR(ret);
 }
 
-static __init void of_thermal_free_zone(struct __thermal_zone *tz)
+static void of_thermal_free_zone(struct __thermal_zone *tz)
 {
 	struct __thermal_bind_params *tbp;
 	int i, j;
@@ -1523,7 +1532,7 @@ EXPORT_SYMBOL_GPL(devm_thermal_of_zone_unregister);
  * Return: 0 on success, proper error code otherwise
  *
  */
-int __init of_parse_thermal_zones(void)
+int of_parse_thermal_zones(void)
 {
 	struct device_node *np, *child;
 	struct __thermal_zone *tz;
-- 
cgit v1.2.3


From 90b2ca02a969963bb37c30b42510fc3dfb0a3ae7 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:19 +0200
Subject: thermal/drivers/rockchip: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-4-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/rockchip_thermal.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index dc3a9c276a09..819e059cde71 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -1211,9 +1211,9 @@ static irqreturn_t rockchip_thermal_alarm_irq_thread(int irq, void *dev)
 	return IRQ_HANDLED;
 }
 
-static int rockchip_thermal_set_trips(void *_sensor, int low, int high)
+static int rockchip_thermal_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct rockchip_thermal_sensor *sensor = _sensor;
+	struct rockchip_thermal_sensor *sensor = tz->devdata;
 	struct rockchip_thermal_data *thermal = sensor->thermal;
 	const struct rockchip_tsadc_chip *tsadc = thermal->chip;
 
@@ -1224,9 +1224,9 @@ static int rockchip_thermal_set_trips(void *_sensor, int low, int high)
 				     sensor->id, thermal->regs, high);
 }
 
-static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
+static int rockchip_thermal_get_temp(struct thermal_zone_device *tz, int *out_temp)
 {
-	struct rockchip_thermal_sensor *sensor = _sensor;
+	struct rockchip_thermal_sensor *sensor = tz->devdata;
 	struct rockchip_thermal_data *thermal = sensor->thermal;
 	const struct rockchip_tsadc_chip *tsadc = sensor->thermal->chip;
 	int retval;
@@ -1239,7 +1239,7 @@ static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 	return retval;
 }
 
-static const struct thermal_zone_of_device_ops rockchip_of_thermal_ops = {
+static const struct thermal_zone_device_ops rockchip_of_thermal_ops = {
 	.get_temp = rockchip_thermal_get_temp,
 	.set_trips = rockchip_thermal_set_trips,
 };
@@ -1326,8 +1326,8 @@ rockchip_thermal_register_sensor(struct platform_device *pdev,
 
 	sensor->thermal = thermal;
 	sensor->id = id;
-	sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev, id,
-					sensor, &rockchip_of_thermal_ops);
+	sensor->tzd = devm_thermal_of_zone_register(&pdev->dev, id, sensor,
+						    &rockchip_of_thermal_ops);
 	if (IS_ERR(sensor->tzd)) {
 		error = PTR_ERR(sensor->tzd);
 		dev_err(&pdev->dev, "failed to register sensor %d: %d\n",
-- 
cgit v1.2.3


From c5f12023ff1d5622c7499352786233399beab7f8 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:20 +0200
Subject: thermal/drivers/uniphier: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-5-daniel.lezcano@linexp.org
Reviewed-by: Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/uniphier_thermal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/uniphier_thermal.c b/drivers/thermal/uniphier_thermal.c
index 4cae5561a2a3..4111d99ef50e 100644
--- a/drivers/thermal/uniphier_thermal.c
+++ b/drivers/thermal/uniphier_thermal.c
@@ -187,9 +187,9 @@ static void uniphier_tm_disable_sensor(struct uniphier_tm_dev *tdev)
 	usleep_range(1000, 2000);	/* The spec note says at least 1ms */
 }
 
-static int uniphier_tm_get_temp(void *data, int *out_temp)
+static int uniphier_tm_get_temp(struct thermal_zone_device *tz, int *out_temp)
 {
-	struct uniphier_tm_dev *tdev = data;
+	struct uniphier_tm_dev *tdev = tz->devdata;
 	struct regmap *map = tdev->regmap;
 	int ret;
 	u32 temp;
@@ -204,7 +204,7 @@ static int uniphier_tm_get_temp(void *data, int *out_temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops uniphier_of_thermal_ops = {
+static const struct thermal_zone_device_ops uniphier_of_thermal_ops = {
 	.get_temp = uniphier_tm_get_temp,
 };
 
@@ -289,8 +289,8 @@ static int uniphier_tm_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, tdev);
 
-	tdev->tz_dev = devm_thermal_zone_of_sensor_register(dev, 0, tdev,
-						&uniphier_of_thermal_ops);
+	tdev->tz_dev = devm_thermal_of_zone_register(dev, 0, tdev,
+						     &uniphier_of_thermal_ops);
 	if (IS_ERR(tdev->tz_dev)) {
 		dev_err(dev, "failed to register sensor device\n");
 		return PTR_ERR(tdev->tz_dev);
-- 
cgit v1.2.3


From 2ff66cba5beb9302f5787fd34617c5f64ad98309 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:21 +0200
Subject: thermal/drivers/generic-adc: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-6-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal-generic-adc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/thermal-generic-adc.c b/drivers/thermal/thermal-generic-adc.c
index 73665c3ccfe0..323e273e3298 100644
--- a/drivers/thermal/thermal-generic-adc.c
+++ b/drivers/thermal/thermal-generic-adc.c
@@ -52,9 +52,9 @@ static int gadc_thermal_adc_to_temp(struct gadc_thermal_info *gti, int val)
 	return temp;
 }
 
-static int gadc_thermal_get_temp(void *data, int *temp)
+static int gadc_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct gadc_thermal_info *gti = data;
+	struct gadc_thermal_info *gti = tz->devdata;
 	int val;
 	int ret;
 
@@ -68,7 +68,7 @@ static int gadc_thermal_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops gadc_thermal_ops = {
+static const struct thermal_zone_device_ops gadc_thermal_ops = {
 	.get_temp = gadc_thermal_get_temp,
 };
 
@@ -143,8 +143,8 @@ static int gadc_thermal_probe(struct platform_device *pdev)
 	gti->dev = &pdev->dev;
 	platform_set_drvdata(pdev, gti);
 
-	gti->tz_dev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, gti,
-							   &gadc_thermal_ops);
+	gti->tz_dev = devm_thermal_of_zone_register(&pdev->dev, 0, gti,
+						    &gadc_thermal_ops);
 	if (IS_ERR(gti->tz_dev)) {
 		ret = PTR_ERR(gti->tz_dev);
 		if (ret != -EPROBE_DEFER)
-- 
cgit v1.2.3


From 44b5554d98d422a4411341d9aed5352c2ce34fc1 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:22 +0200
Subject: thermal/drivers/mmio: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Reviewed-by: Talel Shenhar <talel@amazon.com>
Link: https://lore.kernel.org/r/20220804224349.1926752-7-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_mmio.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c
index 360b0dfdc3b0..1bf1a16533da 100644
--- a/drivers/thermal/thermal_mmio.c
+++ b/drivers/thermal/thermal_mmio.c
@@ -20,11 +20,10 @@ static u32 thermal_mmio_readb(void __iomem *mmio_base)
 	return readb(mmio_base);
 }
 
-static int thermal_mmio_get_temperature(void *private, int *temp)
+static int thermal_mmio_get_temperature(struct thermal_zone_device *tz, int *temp)
 {
 	int t;
-	struct thermal_mmio *sensor =
-		(struct thermal_mmio *)private;
+	struct thermal_mmio *sensor = tz->devdata;
 
 	t = sensor->read_mmio(sensor->mmio_base) & sensor->mask;
 	t *= sensor->factor;
@@ -34,7 +33,7 @@ static int thermal_mmio_get_temperature(void *private, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops thermal_mmio_ops = {
+static const struct thermal_zone_device_ops thermal_mmio_ops = {
 	.get_temp = thermal_mmio_get_temperature,
 };
 
@@ -68,10 +67,10 @@ static int thermal_mmio_probe(struct platform_device *pdev)
 		}
 	}
 
-	thermal_zone = devm_thermal_zone_of_sensor_register(&pdev->dev,
-							    0,
-							    sensor,
-							    &thermal_mmio_ops);
+	thermal_zone = devm_thermal_of_zone_register(&pdev->dev,
+						     0,
+						     sensor,
+						     &thermal_mmio_ops);
 	if (IS_ERR(thermal_zone)) {
 		dev_err(&pdev->dev,
 			"failed to register sensor (%ld)\n",
@@ -79,7 +78,7 @@ static int thermal_mmio_probe(struct platform_device *pdev)
 		return PTR_ERR(thermal_zone);
 	}
 
-	thermal_mmio_get_temperature(sensor, &temperature);
+	thermal_mmio_get_temperature(thermal_zone, &temperature);
 	dev_info(&pdev->dev,
 		 "thermal mmio sensor %s registered, current temperature: %d\n",
 		 pdev->name, temperature);
-- 
cgit v1.2.3


From 6fc2e1a5f98feb9cf0698b69c90701e0b9de2bf5 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:23 +0200
Subject: thermal/drivers/tegra: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-8-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/tegra/soctherm.c           | 21 ++++++++++-----------
 drivers/thermal/tegra/tegra-bpmp-thermal.c | 19 ++++++++++++-------
 drivers/thermal/tegra/tegra30-tsensor.c    | 12 ++++++------
 3 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
index 825eab526619..1efe470f31e9 100644
--- a/drivers/thermal/tegra/soctherm.c
+++ b/drivers/thermal/tegra/soctherm.c
@@ -421,9 +421,9 @@ static int translate_temp(u16 val)
 	return t;
 }
 
-static int tegra_thermctl_get_temp(void *data, int *out_temp)
+static int tegra_thermctl_get_temp(struct thermal_zone_device *tz, int *out_temp)
 {
-	struct tegra_thermctl_zone *zone = data;
+	struct tegra_thermctl_zone *zone = tz->devdata;
 	u32 val;
 
 	val = readl(zone->reg);
@@ -582,10 +582,9 @@ static int tsensor_group_thermtrip_get(struct tegra_soctherm *ts, int id)
 	return temp;
 }
 
-static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
+static int tegra_thermctl_set_trip_temp(struct thermal_zone_device *tz, int trip, int temp)
 {
-	struct tegra_thermctl_zone *zone = data;
-	struct thermal_zone_device *tz = zone->tz;
+	struct tegra_thermctl_zone *zone = tz->devdata;
 	struct tegra_soctherm *ts = zone->ts;
 	const struct tegra_tsensor_group *sg = zone->sg;
 	struct device *dev = zone->dev;
@@ -657,9 +656,9 @@ static void thermal_irq_disable(struct tegra_thermctl_zone *zn)
 	mutex_unlock(&zn->ts->thermctl_lock);
 }
 
-static int tegra_thermctl_set_trips(void *data, int lo, int hi)
+static int tegra_thermctl_set_trips(struct thermal_zone_device *tz, int lo, int hi)
 {
-	struct tegra_thermctl_zone *zone = data;
+	struct tegra_thermctl_zone *zone = tz->devdata;
 	u32 r;
 
 	thermal_irq_disable(zone);
@@ -682,7 +681,7 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
+static const struct thermal_zone_device_ops tegra_of_thermal_ops = {
 	.get_temp = tegra_thermctl_get_temp,
 	.set_trip_temp = tegra_thermctl_set_trip_temp,
 	.set_trips = tegra_thermctl_set_trips,
@@ -2194,9 +2193,9 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
 		zone->sg = soc->ttgs[i];
 		zone->ts = tegra;
 
-		z = devm_thermal_zone_of_sensor_register(&pdev->dev,
-							 soc->ttgs[i]->id, zone,
-							 &tegra_of_thermal_ops);
+		z = devm_thermal_of_zone_register(&pdev->dev,
+						  soc->ttgs[i]->id, zone,
+						  &tegra_of_thermal_ops);
 		if (IS_ERR(z)) {
 			err = PTR_ERR(z);
 			dev_err(&pdev->dev, "failed to register sensor: %d\n",
diff --git a/drivers/thermal/tegra/tegra-bpmp-thermal.c b/drivers/thermal/tegra/tegra-bpmp-thermal.c
index 5affc3d196be..eb84f0b9dc7c 100644
--- a/drivers/thermal/tegra/tegra-bpmp-thermal.c
+++ b/drivers/thermal/tegra/tegra-bpmp-thermal.c
@@ -30,9 +30,9 @@ struct tegra_bpmp_thermal {
 	struct tegra_bpmp_thermal_zone **zones;
 };
 
-static int tegra_bpmp_thermal_get_temp(void *data, int *out_temp)
+static int __tegra_bpmp_thermal_get_temp(struct tegra_bpmp_thermal_zone *zone,
+					 int *out_temp)
 {
-	struct tegra_bpmp_thermal_zone *zone = data;
 	struct mrq_thermal_host_to_bpmp_request req;
 	union mrq_thermal_bpmp_to_host_response reply;
 	struct tegra_bpmp_message msg;
@@ -60,9 +60,14 @@ static int tegra_bpmp_thermal_get_temp(void *data, int *out_temp)
 	return 0;
 }
 
-static int tegra_bpmp_thermal_set_trips(void *data, int low, int high)
+static int tegra_bpmp_thermal_get_temp(struct thermal_zone_device *tz, int *out_temp)
 {
-	struct tegra_bpmp_thermal_zone *zone = data;
+	return __tegra_bpmp_thermal_get_temp(tz->devdata, out_temp);
+}
+
+static int tegra_bpmp_thermal_set_trips(struct thermal_zone_device *tz, int low, int high)
+{
+	struct tegra_bpmp_thermal_zone *zone = tz->devdata;
 	struct mrq_thermal_host_to_bpmp_request req;
 	struct tegra_bpmp_message msg;
 	int err;
@@ -157,7 +162,7 @@ static int tegra_bpmp_thermal_get_num_zones(struct tegra_bpmp *bpmp,
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops tegra_bpmp_of_thermal_ops = {
+static const struct thermal_zone_device_ops tegra_bpmp_of_thermal_ops = {
 	.get_temp = tegra_bpmp_thermal_get_temp,
 	.set_trips = tegra_bpmp_thermal_set_trips,
 };
@@ -200,13 +205,13 @@ static int tegra_bpmp_thermal_probe(struct platform_device *pdev)
 		zone->idx = i;
 		zone->tegra = tegra;
 
-		err = tegra_bpmp_thermal_get_temp(zone, &temp);
+		err = __tegra_bpmp_thermal_get_temp(zone, &temp);
 		if (err < 0) {
 			devm_kfree(&pdev->dev, zone);
 			continue;
 		}
 
-		tzd = devm_thermal_zone_of_sensor_register(
+		tzd = devm_thermal_of_zone_register(
 			&pdev->dev, i, zone, &tegra_bpmp_of_thermal_ops);
 		if (IS_ERR(tzd)) {
 			if (PTR_ERR(tzd) == -EPROBE_DEFER)
diff --git a/drivers/thermal/tegra/tegra30-tsensor.c b/drivers/thermal/tegra/tegra30-tsensor.c
index 05886684f429..c34501287e96 100644
--- a/drivers/thermal/tegra/tegra30-tsensor.c
+++ b/drivers/thermal/tegra/tegra30-tsensor.c
@@ -159,9 +159,9 @@ static void devm_tegra_tsensor_hw_disable(void *data)
 	tegra_tsensor_hw_disable(ts);
 }
 
-static int tegra_tsensor_get_temp(void *data, int *temp)
+static int tegra_tsensor_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	const struct tegra_tsensor_channel *tsc = data;
+	const struct tegra_tsensor_channel *tsc = tz->devdata;
 	const struct tegra_tsensor *ts = tsc->ts;
 	int err, c1, c2, c3, c4, counter;
 	u32 val;
@@ -217,9 +217,9 @@ static int tegra_tsensor_temp_to_counter(const struct tegra_tsensor *ts, int tem
 	return DIV_ROUND_CLOSEST(c2 * 1000000 - ts->calib.b, ts->calib.a);
 }
 
-static int tegra_tsensor_set_trips(void *data, int low, int high)
+static int tegra_tsensor_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	const struct tegra_tsensor_channel *tsc = data;
+	const struct tegra_tsensor_channel *tsc = tz->devdata;
 	const struct tegra_tsensor *ts = tsc->ts;
 	u32 val;
 
@@ -240,7 +240,7 @@ static int tegra_tsensor_set_trips(void *data, int low, int high)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops ops = {
+static const struct thermal_zone_device_ops ops = {
 	.get_temp = tegra_tsensor_get_temp,
 	.set_trips = tegra_tsensor_set_trips,
 };
@@ -516,7 +516,7 @@ static int tegra_tsensor_register_channel(struct tegra_tsensor *ts,
 	tsc->id = id;
 	tsc->regs = ts->regs + 0x40 * (hw_id + 1);
 
-	tsc->tzd = devm_thermal_zone_of_sensor_register(ts->dev, id, tsc, &ops);
+	tsc->tzd = devm_thermal_of_zone_register(ts->dev, id, tsc, &ops);
 	if (IS_ERR(tsc->tzd)) {
 		if (PTR_ERR(tsc->tzd) != -ENODEV)
 			return dev_err_probe(ts->dev, PTR_ERR(tsc->tzd),
-- 
cgit v1.2.3


From 2e2150c7946764f289bafd716cbd6721283dc9ce Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:24 +0200
Subject: thermal/drivers/sun8i: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-9-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/sun8i_thermal.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
index 212c87e63a66..e64d06d1328c 100644
--- a/drivers/thermal/sun8i_thermal.c
+++ b/drivers/thermal/sun8i_thermal.c
@@ -108,9 +108,9 @@ static int sun50i_h5_calc_temp(struct ths_device *tmdev,
 		return -1590 * reg / 10 + 276000;
 }
 
-static int sun8i_ths_get_temp(void *data, int *temp)
+static int sun8i_ths_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct tsensor *s = data;
+	struct tsensor *s = tz->devdata;
 	struct ths_device *tmdev = s->tmdev;
 	int val = 0;
 
@@ -135,7 +135,7 @@ static int sun8i_ths_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops ths_ops = {
+static const struct thermal_zone_device_ops ths_ops = {
 	.get_temp = sun8i_ths_get_temp,
 };
 
@@ -468,10 +468,10 @@ static int sun8i_ths_register(struct ths_device *tmdev)
 		tmdev->sensor[i].tmdev = tmdev;
 		tmdev->sensor[i].id = i;
 		tmdev->sensor[i].tzd =
-			devm_thermal_zone_of_sensor_register(tmdev->dev,
-							     i,
-							     &tmdev->sensor[i],
-							     &ths_ops);
+			devm_thermal_of_zone_register(tmdev->dev,
+						      i,
+						      &tmdev->sensor[i],
+						      &ths_ops);
 		if (IS_ERR(tmdev->sensor[i].tzd))
 			return PTR_ERR(tmdev->sensor[i].tzd);
 
-- 
cgit v1.2.3


From 7f689a2ef4f6422b200682f80be225c1f61218f7 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:25 +0200
Subject: thermal/drivers/sprd: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-10-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/sprd_thermal.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/thermal/sprd_thermal.c b/drivers/thermal/sprd_thermal.c
index fff80fc18002..ac884514f116 100644
--- a/drivers/thermal/sprd_thermal.c
+++ b/drivers/thermal/sprd_thermal.c
@@ -204,9 +204,9 @@ static int sprd_thm_temp_to_rawdata(int temp, struct sprd_thermal_sensor *sen)
 	return clamp(val, val, (u32)(SPRD_THM_RAW_DATA_HIGH - 1));
 }
 
-static int sprd_thm_read_temp(void *devdata, int *temp)
+static int sprd_thm_read_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct sprd_thermal_sensor *sen = devdata;
+	struct sprd_thermal_sensor *sen = tz->devdata;
 	u32 data;
 
 	data = readl(sen->data->base + SPRD_THM_TEMP(sen->id)) &
@@ -217,7 +217,7 @@ static int sprd_thm_read_temp(void *devdata, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops sprd_thm_ops = {
+static const struct thermal_zone_device_ops sprd_thm_ops = {
 	.get_temp = sprd_thm_read_temp,
 };
 
@@ -408,10 +408,10 @@ static int sprd_thm_probe(struct platform_device *pdev)
 
 		sprd_thm_sensor_init(thm, sen);
 
-		sen->tzd = devm_thermal_zone_of_sensor_register(sen->dev,
-								sen->id,
-								sen,
-								&sprd_thm_ops);
+		sen->tzd = devm_thermal_of_zone_register(sen->dev,
+							 sen->id,
+							 sen,
+							 &sprd_thm_ops);
 		if (IS_ERR(sen->tzd)) {
 			dev_err(&pdev->dev, "register thermal zone failed %d\n",
 				sen->id);
@@ -523,8 +523,8 @@ static int sprd_thm_remove(struct platform_device *pdev)
 
 	for (i = 0; i < thm->nr_sensors; i++) {
 		sprd_thm_toggle_sensor(thm->sensor[i], false);
-		devm_thermal_zone_of_sensor_unregister(&pdev->dev,
-						       thm->sensor[i]->tzd);
+		devm_thermal_of_zone_unregister(&pdev->dev,
+						thm->sensor[i]->tzd);
 	}
 
 	clk_disable_unprepare(thm->clk);
-- 
cgit v1.2.3


From 944441d878b0aebd87ec404fe86c322186da458d Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:26 +0200
Subject: thermal/drivers/broadcom: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20220804224349.1926752-11-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/broadcom/bcm2711_thermal.c | 14 ++++-----
 drivers/thermal/broadcom/bcm2835_thermal.c | 14 ++++-----
 drivers/thermal/broadcom/brcmstb_thermal.c | 20 ++++++------
 drivers/thermal/broadcom/ns-thermal.c      | 50 +++++++++++++-----------------
 drivers/thermal/broadcom/sr-thermal.c      | 16 +++++-----
 5 files changed, 53 insertions(+), 61 deletions(-)

diff --git a/drivers/thermal/broadcom/bcm2711_thermal.c b/drivers/thermal/broadcom/bcm2711_thermal.c
index e9bef5c3414b..1f8651d15160 100644
--- a/drivers/thermal/broadcom/bcm2711_thermal.c
+++ b/drivers/thermal/broadcom/bcm2711_thermal.c
@@ -31,11 +31,11 @@ struct bcm2711_thermal_priv {
 	struct thermal_zone_device *thermal;
 };
 
-static int bcm2711_get_temp(void *data, int *temp)
+static int bcm2711_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct bcm2711_thermal_priv *priv = data;
-	int slope = thermal_zone_get_slope(priv->thermal);
-	int offset = thermal_zone_get_offset(priv->thermal);
+	struct bcm2711_thermal_priv *priv = tz->devdata;
+	int slope = thermal_zone_get_slope(tz);
+	int offset = thermal_zone_get_offset(tz);
 	u32 val;
 	int ret;
 
@@ -54,7 +54,7 @@ static int bcm2711_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops bcm2711_thermal_of_ops = {
+static const struct thermal_zone_device_ops bcm2711_thermal_of_ops = {
 	.get_temp	= bcm2711_get_temp,
 };
 
@@ -88,8 +88,8 @@ static int bcm2711_thermal_probe(struct platform_device *pdev)
 	}
 	priv->regmap = regmap;
 
-	thermal = devm_thermal_zone_of_sensor_register(dev, 0, priv,
-						       &bcm2711_thermal_of_ops);
+	thermal = devm_thermal_of_zone_register(dev, 0, priv,
+						&bcm2711_thermal_of_ops);
 	if (IS_ERR(thermal)) {
 		ret = PTR_ERR(thermal);
 		dev_err(dev, "could not register sensor: %d\n", ret);
diff --git a/drivers/thermal/broadcom/bcm2835_thermal.c b/drivers/thermal/broadcom/bcm2835_thermal.c
index c8e4344d5a3d..2c67841a1115 100644
--- a/drivers/thermal/broadcom/bcm2835_thermal.c
+++ b/drivers/thermal/broadcom/bcm2835_thermal.c
@@ -88,9 +88,9 @@ static int bcm2835_thermal_temp2adc(int temp, int offset, int slope)
 	return temp;
 }
 
-static int bcm2835_thermal_get_temp(void *d, int *temp)
+static int bcm2835_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct bcm2835_thermal_data *data = d;
+	struct bcm2835_thermal_data *data = tz->devdata;
 	u32 val = readl(data->regs + BCM2835_TS_TSENSSTAT);
 
 	if (!(val & BCM2835_TS_TSENSSTAT_VALID))
@@ -135,7 +135,7 @@ static void bcm2835_thermal_debugfs(struct platform_device *pdev)
 	debugfs_create_regset32("regset", 0444, data->debugfsdir, regset);
 }
 
-static const struct thermal_zone_of_device_ops bcm2835_thermal_ops = {
+static const struct thermal_zone_device_ops bcm2835_thermal_ops = {
 	.get_temp = bcm2835_thermal_get_temp,
 };
 
@@ -206,8 +206,8 @@ static int bcm2835_thermal_probe(struct platform_device *pdev)
 			 data->clk, rate);
 
 	/* register of thermal sensor and get info from DT */
-	tz = thermal_zone_of_sensor_register(&pdev->dev, 0, data,
-					     &bcm2835_thermal_ops);
+	tz = devm_thermal_of_zone_register(&pdev->dev, 0, data,
+					   &bcm2835_thermal_ops);
 	if (IS_ERR(tz)) {
 		err = PTR_ERR(tz);
 		dev_err(&pdev->dev,
@@ -277,7 +277,7 @@ static int bcm2835_thermal_probe(struct platform_device *pdev)
 
 	return 0;
 err_tz:
-	thermal_zone_of_sensor_unregister(&pdev->dev, tz);
+	thermal_of_zone_unregister(tz);
 err_clk:
 	clk_disable_unprepare(data->clk);
 
@@ -290,7 +290,7 @@ static int bcm2835_thermal_remove(struct platform_device *pdev)
 	struct thermal_zone_device *tz = data->tz;
 
 	debugfs_remove_recursive(data->debugfsdir);
-	thermal_zone_of_sensor_unregister(&pdev->dev, tz);
+	thermal_of_zone_unregister(tz);
 	clk_disable_unprepare(data->clk);
 
 	return 0;
diff --git a/drivers/thermal/broadcom/brcmstb_thermal.c b/drivers/thermal/broadcom/brcmstb_thermal.c
index 0cedb8b4f00a..c79c6cfdd74d 100644
--- a/drivers/thermal/broadcom/brcmstb_thermal.c
+++ b/drivers/thermal/broadcom/brcmstb_thermal.c
@@ -105,7 +105,7 @@ static struct avs_tmon_trip avs_tmon_trips[] = {
 struct brcmstb_thermal_params {
 	unsigned int offset;
 	unsigned int mult;
-	const struct thermal_zone_of_device_ops *of_ops;
+	const struct thermal_zone_device_ops *of_ops;
 };
 
 struct brcmstb_thermal_priv {
@@ -150,9 +150,9 @@ static inline u32 avs_tmon_temp_to_code(struct brcmstb_thermal_priv *priv,
 		return (u32)((offset - temp) / mult);
 }
 
-static int brcmstb_get_temp(void *data, int *temp)
+static int brcmstb_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct brcmstb_thermal_priv *priv = data;
+	struct brcmstb_thermal_priv *priv = tz->devdata;
 	u32 val;
 	long t;
 
@@ -260,9 +260,9 @@ static irqreturn_t brcmstb_tmon_irq_thread(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static int brcmstb_set_trips(void *data, int low, int high)
+static int brcmstb_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct brcmstb_thermal_priv *priv = data;
+	struct brcmstb_thermal_priv *priv = tz->devdata;
 
 	dev_dbg(priv->dev, "set trips %d <--> %d\n", low, high);
 
@@ -288,7 +288,7 @@ static int brcmstb_set_trips(void *data, int low, int high)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops brcmstb_16nm_of_ops = {
+static const struct thermal_zone_device_ops brcmstb_16nm_of_ops = {
 	.get_temp	= brcmstb_get_temp,
 };
 
@@ -298,7 +298,7 @@ static const struct brcmstb_thermal_params brcmstb_16nm_params = {
 	.of_ops	= &brcmstb_16nm_of_ops,
 };
 
-static const struct thermal_zone_of_device_ops brcmstb_28nm_of_ops = {
+static const struct thermal_zone_device_ops brcmstb_28nm_of_ops = {
 	.get_temp	= brcmstb_get_temp,
 	.set_trips	= brcmstb_set_trips,
 };
@@ -318,7 +318,7 @@ MODULE_DEVICE_TABLE(of, brcmstb_thermal_id_table);
 
 static int brcmstb_thermal_probe(struct platform_device *pdev)
 {
-	const struct thermal_zone_of_device_ops *of_ops;
+	const struct thermal_zone_device_ops *of_ops;
 	struct thermal_zone_device *thermal;
 	struct brcmstb_thermal_priv *priv;
 	struct resource *res;
@@ -341,8 +341,8 @@ static int brcmstb_thermal_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, priv);
 	of_ops = priv->temp_params->of_ops;
 
-	thermal = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, priv,
-						       of_ops);
+	thermal = devm_thermal_of_zone_register(&pdev->dev, 0, priv,
+						of_ops);
 	if (IS_ERR(thermal)) {
 		ret = PTR_ERR(thermal);
 		dev_err(&pdev->dev, "could not register sensor: %d\n", ret);
diff --git a/drivers/thermal/broadcom/ns-thermal.c b/drivers/thermal/broadcom/ns-thermal.c
index c9468ba9d449..07a8a3f49bd0 100644
--- a/drivers/thermal/broadcom/ns-thermal.c
+++ b/drivers/thermal/broadcom/ns-thermal.c
@@ -14,19 +14,14 @@
 #define PVTMON_CONTROL0_SEL_TEST_MODE			0x0000000e
 #define PVTMON_STATUS					0x08
 
-struct ns_thermal {
-	struct thermal_zone_device *tz;
-	void __iomem *pvtmon;
-};
-
-static int ns_thermal_get_temp(void *data, int *temp)
+static int ns_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct ns_thermal *ns_thermal = data;
-	int offset = thermal_zone_get_offset(ns_thermal->tz);
-	int slope = thermal_zone_get_slope(ns_thermal->tz);
+	void __iomem *pvtmon = tz->devdata;
+	int offset = thermal_zone_get_offset(tz);
+	int slope = thermal_zone_get_slope(tz);
 	u32 val;
 
-	val = readl(ns_thermal->pvtmon + PVTMON_CONTROL0);
+	val = readl(pvtmon + PVTMON_CONTROL0);
 	if ((val & PVTMON_CONTROL0_SEL_MASK) != PVTMON_CONTROL0_SEL_TEMP_MONITOR) {
 		/* Clear current mode selection */
 		val &= ~PVTMON_CONTROL0_SEL_MASK;
@@ -34,50 +29,47 @@ static int ns_thermal_get_temp(void *data, int *temp)
 		/* Set temp monitor mode (it's the default actually) */
 		val |= PVTMON_CONTROL0_SEL_TEMP_MONITOR;
 
-		writel(val, ns_thermal->pvtmon + PVTMON_CONTROL0);
+		writel(val, pvtmon + PVTMON_CONTROL0);
 	}
 
-	val = readl(ns_thermal->pvtmon + PVTMON_STATUS);
+	val = readl(pvtmon + PVTMON_STATUS);
 	*temp = slope * val + offset;
 
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops ns_thermal_ops = {
+static const struct thermal_zone_device_ops ns_thermal_ops = {
 	.get_temp = ns_thermal_get_temp,
 };
 
 static int ns_thermal_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
-	struct ns_thermal *ns_thermal;
-
-	ns_thermal = devm_kzalloc(dev, sizeof(*ns_thermal), GFP_KERNEL);
-	if (!ns_thermal)
-		return -ENOMEM;
+	struct thermal_zone_device *tz;
+	void __iomem *pvtmon;
 
-	ns_thermal->pvtmon = of_iomap(dev_of_node(dev), 0);
-	if (WARN_ON(!ns_thermal->pvtmon))
+	pvtmon = of_iomap(dev_of_node(dev), 0);
+	if (WARN_ON(!pvtmon))
 		return -ENOENT;
 
-	ns_thermal->tz = devm_thermal_zone_of_sensor_register(dev, 0,
-							      ns_thermal,
-							      &ns_thermal_ops);
-	if (IS_ERR(ns_thermal->tz)) {
-		iounmap(ns_thermal->pvtmon);
-		return PTR_ERR(ns_thermal->tz);
+	tz = devm_thermal_of_zone_register(dev, 0,
+					   pvtmon,
+					   &ns_thermal_ops);
+	if (IS_ERR(tz)) {
+		iounmap(pvtmon);
+		return PTR_ERR(tz);
 	}
 
-	platform_set_drvdata(pdev, ns_thermal);
+	platform_set_drvdata(pdev, pvtmon);
 
 	return 0;
 }
 
 static int ns_thermal_remove(struct platform_device *pdev)
 {
-	struct ns_thermal *ns_thermal = platform_get_drvdata(pdev);
+	void __iomem *pvtmon = platform_get_drvdata(pdev);
 
-	iounmap(ns_thermal->pvtmon);
+	iounmap(pvtmon);
 
 	return 0;
 }
diff --git a/drivers/thermal/broadcom/sr-thermal.c b/drivers/thermal/broadcom/sr-thermal.c
index 85ab9edd580c..2b93502543ff 100644
--- a/drivers/thermal/broadcom/sr-thermal.c
+++ b/drivers/thermal/broadcom/sr-thermal.c
@@ -19,7 +19,6 @@
 #define SR_TMON_MAX_LIST        6
 
 struct sr_tmon {
-	struct thermal_zone_device *tz;
 	unsigned int crit_temp;
 	unsigned int tmon_id;
 	struct sr_thermal *priv;
@@ -31,9 +30,9 @@ struct sr_thermal {
 	struct sr_tmon tmon[SR_TMON_MAX_LIST];
 };
 
-static int sr_get_temp(void *data, int *temp)
+static int sr_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct sr_tmon *tmon = data;
+	struct sr_tmon *tmon = tz->devdata;
 	struct sr_thermal *sr_thermal = tmon->priv;
 
 	*temp = readl(sr_thermal->regs + SR_TMON_TEMP_BASE(tmon->tmon_id));
@@ -41,13 +40,14 @@ static int sr_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops sr_tz_ops = {
+static const struct thermal_zone_device_ops sr_tz_ops = {
 	.get_temp = sr_get_temp,
 };
 
 static int sr_thermal_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
+	struct thermal_zone_device *tz;
 	struct sr_thermal *sr_thermal;
 	struct sr_tmon *tmon;
 	struct resource *res;
@@ -84,10 +84,10 @@ static int sr_thermal_probe(struct platform_device *pdev)
 		writel(0, sr_thermal->regs + SR_TMON_TEMP_BASE(i));
 		tmon->tmon_id = i;
 		tmon->priv = sr_thermal;
-		tmon->tz = devm_thermal_zone_of_sensor_register(dev, i, tmon,
-								&sr_tz_ops);
-		if (IS_ERR(tmon->tz))
-			return PTR_ERR(tmon->tz);
+		tz = devm_thermal_of_zone_register(dev, i, tmon,
+						   &sr_tz_ops);
+		if (IS_ERR(tz))
+			return PTR_ERR(tz);
 
 		dev_dbg(dev, "thermal sensor %d registered\n", i);
 	}
-- 
cgit v1.2.3


From ca1b9a9eb3fdbb9aa39d0c174391af694ae77671 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:27 +0200
Subject: thermal/drivers/qcom: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-12-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/qcom/qcom-spmi-adc-tm5.c    | 19 +++++++++----------
 drivers/thermal/qcom/qcom-spmi-temp-alarm.c | 12 ++++++------
 drivers/thermal/qcom/tsens.c                | 16 ++++++++--------
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
index 073943cbcc2b..add6f40e5e2a 100644
--- a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
+++ b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
@@ -357,9 +357,9 @@ static irqreturn_t adc_tm5_gen2_isr(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static int adc_tm5_get_temp(void *data, int *temp)
+static int adc_tm5_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct adc_tm5_channel *channel = data;
+	struct adc_tm5_channel *channel = tz->devdata;
 	int ret;
 
 	if (!channel || !channel->iio)
@@ -639,9 +639,9 @@ config_fail:
 	return ret;
 }
 
-static int adc_tm5_set_trips(void *data, int low, int high)
+static int adc_tm5_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct adc_tm5_channel *channel = data;
+	struct adc_tm5_channel *channel = tz->devdata;
 	struct adc_tm5_chip *chip;
 	int ret;
 
@@ -660,7 +660,7 @@ static int adc_tm5_set_trips(void *data, int low, int high)
 	return ret;
 }
 
-static struct thermal_zone_of_device_ops adc_tm5_thermal_ops = {
+static const struct thermal_zone_device_ops adc_tm5_thermal_ops = {
 	.get_temp = adc_tm5_get_temp,
 	.set_trips = adc_tm5_set_trips,
 };
@@ -672,11 +672,10 @@ static int adc_tm5_register_tzd(struct adc_tm5_chip *adc_tm)
 
 	for (i = 0; i < adc_tm->nchannels; i++) {
 		adc_tm->channels[i].chip = adc_tm;
-
-		tzd = devm_thermal_zone_of_sensor_register(adc_tm->dev,
-							   adc_tm->channels[i].channel,
-							   &adc_tm->channels[i],
-							   &adc_tm5_thermal_ops);
+		tzd = devm_thermal_of_zone_register(adc_tm->dev,
+						    adc_tm->channels[i].channel,
+						    &adc_tm->channels[i],
+						    &adc_tm5_thermal_ops);
 		if (IS_ERR(tzd)) {
 			if (PTR_ERR(tzd) == -ENODEV) {
 				dev_warn(adc_tm->dev, "thermal sensor on channel %d is not used\n",
diff --git a/drivers/thermal/qcom/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom/qcom-spmi-temp-alarm.c
index 770f82cc9bca..be785ab37e53 100644
--- a/drivers/thermal/qcom/qcom-spmi-temp-alarm.c
+++ b/drivers/thermal/qcom/qcom-spmi-temp-alarm.c
@@ -186,9 +186,9 @@ static int qpnp_tm_update_temp_no_adc(struct qpnp_tm_chip *chip)
 	return 0;
 }
 
-static int qpnp_tm_get_temp(void *data, int *temp)
+static int qpnp_tm_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct qpnp_tm_chip *chip = data;
+	struct qpnp_tm_chip *chip = tz->devdata;
 	int ret, mili_celsius;
 
 	if (!temp)
@@ -263,9 +263,9 @@ skip:
 	return qpnp_tm_write(chip, QPNP_TM_REG_SHUTDOWN_CTRL1, reg);
 }
 
-static int qpnp_tm_set_trip_temp(void *data, int trip, int temp)
+static int qpnp_tm_set_trip_temp(struct thermal_zone_device *tz, int trip, int temp)
 {
-	struct qpnp_tm_chip *chip = data;
+	struct qpnp_tm_chip *chip = tz->devdata;
 	const struct thermal_trip *trip_points;
 	int ret;
 
@@ -283,7 +283,7 @@ static int qpnp_tm_set_trip_temp(void *data, int trip, int temp)
 	return ret;
 }
 
-static const struct thermal_zone_of_device_ops qpnp_tm_sensor_ops = {
+static const struct thermal_zone_device_ops qpnp_tm_sensor_ops = {
 	.get_temp = qpnp_tm_get_temp,
 	.set_trip_temp = qpnp_tm_set_trip_temp,
 };
@@ -446,7 +446,7 @@ static int qpnp_tm_probe(struct platform_device *pdev)
 	 * read the trip points. get_temp() returns the default temperature
 	 * before the hardware initialization is completed.
 	 */
-	chip->tz_dev = devm_thermal_zone_of_sensor_register(
+	chip->tz_dev = devm_thermal_of_zone_register(
 		&pdev->dev, 0, chip, &qpnp_tm_sensor_ops);
 	if (IS_ERR(chip->tz_dev)) {
 		dev_err(&pdev->dev, "failed to register sensor\n");
diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
index e49f58e83513..b1b10005fb28 100644
--- a/drivers/thermal/qcom/tsens.c
+++ b/drivers/thermal/qcom/tsens.c
@@ -532,9 +532,9 @@ static irqreturn_t tsens_irq_thread(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static int tsens_set_trips(void *_sensor, int low, int high)
+static int tsens_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct tsens_sensor *s = _sensor;
+	struct tsens_sensor *s = tz->devdata;
 	struct tsens_priv *priv = s->priv;
 	struct device *dev = priv->dev;
 	struct tsens_irq_data d;
@@ -925,9 +925,9 @@ err_put_device:
 	return ret;
 }
 
-static int tsens_get_temp(void *data, int *temp)
+static int tsens_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct tsens_sensor *s = data;
+	struct tsens_sensor *s = tz->devdata;
 	struct tsens_priv *priv = s->priv;
 
 	return priv->ops->get_temp(s, temp);
@@ -991,7 +991,7 @@ static const struct of_device_id tsens_table[] = {
 };
 MODULE_DEVICE_TABLE(of, tsens_table);
 
-static const struct thermal_zone_of_device_ops tsens_of_ops = {
+static const struct thermal_zone_device_ops tsens_of_ops = {
 	.get_temp = tsens_get_temp,
 	.set_trips = tsens_set_trips,
 };
@@ -1044,9 +1044,9 @@ static int tsens_register(struct tsens_priv *priv)
 
 	for (i = 0;  i < priv->num_sensors; i++) {
 		priv->sensor[i].priv = priv;
-		tzd = devm_thermal_zone_of_sensor_register(priv->dev, priv->sensor[i].hw_id,
-							   &priv->sensor[i],
-							   &tsens_of_ops);
+		tzd = devm_thermal_of_zone_register(priv->dev, priv->sensor[i].hw_id,
+						    &priv->sensor[i],
+						    &tsens_of_ops);
 		if (IS_ERR(tzd))
 			continue;
 		priv->sensor[i].tzd = tzd;
-- 
cgit v1.2.3


From 7e96f35408b6b196a3dc20db757878a7d26bf02d Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:28 +0200
Subject: thermal/drivers/st: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-13-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/st/stm_thermal.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/thermal/st/stm_thermal.c b/drivers/thermal/st/stm_thermal.c
index 5fd3fb8912a6..78feb802a87d 100644
--- a/drivers/thermal/st/stm_thermal.c
+++ b/drivers/thermal/st/stm_thermal.c
@@ -302,9 +302,9 @@ static int stm_disable_irq(struct stm_thermal_sensor *sensor)
 	return 0;
 }
 
-static int stm_thermal_set_trips(void *data, int low, int high)
+static int stm_thermal_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct stm_thermal_sensor *sensor = data;
+	struct stm_thermal_sensor *sensor = tz->devdata;
 	u32 itr1, th;
 	int ret;
 
@@ -350,9 +350,9 @@ static int stm_thermal_set_trips(void *data, int low, int high)
 }
 
 /* Callback to get temperature from HW */
-static int stm_thermal_get_temp(void *data, int *temp)
+static int stm_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct stm_thermal_sensor *sensor = data;
+	struct stm_thermal_sensor *sensor = tz->devdata;
 	u32 periods;
 	int freqM, ret;
 
@@ -474,7 +474,7 @@ static int stm_thermal_resume(struct device *dev)
 static SIMPLE_DEV_PM_OPS(stm_thermal_pm_ops,
 			 stm_thermal_suspend, stm_thermal_resume);
 
-static const struct thermal_zone_of_device_ops stm_tz_ops = {
+static const struct thermal_zone_device_ops stm_tz_ops = {
 	.get_temp	= stm_thermal_get_temp,
 	.set_trips	= stm_thermal_set_trips,
 };
@@ -539,9 +539,9 @@ static int stm_thermal_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	sensor->th_dev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0,
-							      sensor,
-							      &stm_tz_ops);
+	sensor->th_dev = devm_thermal_of_zone_register(&pdev->dev, 0,
+						       sensor,
+						       &stm_tz_ops);
 
 	if (IS_ERR(sensor->th_dev)) {
 		dev_err(&pdev->dev, "%s: thermal zone sensor registering KO\n",
@@ -572,7 +572,6 @@ static int stm_thermal_probe(struct platform_device *pdev)
 	return 0;
 
 err_tz:
-	thermal_zone_of_sensor_unregister(&pdev->dev, sensor->th_dev);
 	return ret;
 }
 
@@ -582,7 +581,6 @@ static int stm_thermal_remove(struct platform_device *pdev)
 
 	stm_thermal_sensor_off(sensor);
 	thermal_remove_hwmon_sysfs(sensor->th_dev);
-	thermal_zone_of_sensor_unregister(&pdev->dev, sensor->th_dev);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 1240fd6512b7df593b99ea777c846f0b59173a6b Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:29 +0200
Subject: thermal/drivers/amlogic: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-14-daniel.lezcano@linexp.org
Reviewed-by: Neil Armstrong <narmstrong@baylibre.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/amlogic_thermal.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/thermal/amlogic_thermal.c b/drivers/thermal/amlogic_thermal.c
index e61b91d14ad1..d30cb791e63c 100644
--- a/drivers/thermal/amlogic_thermal.c
+++ b/drivers/thermal/amlogic_thermal.c
@@ -179,12 +179,12 @@ static int amlogic_thermal_disable(struct amlogic_thermal *data)
 	return 0;
 }
 
-static int amlogic_thermal_get_temp(void *data, int *temp)
+static int amlogic_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
 	unsigned int tval;
-	struct amlogic_thermal *pdata = data;
+	struct amlogic_thermal *pdata = tz->devdata;
 
-	if (!data)
+	if (!pdata)
 		return -EINVAL;
 
 	regmap_read(pdata->regmap, TSENSOR_STAT0, &tval);
@@ -195,7 +195,7 @@ static int amlogic_thermal_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops amlogic_thermal_ops = {
+static const struct thermal_zone_device_ops amlogic_thermal_ops = {
 	.get_temp	= amlogic_thermal_get_temp,
 };
 
@@ -276,10 +276,10 @@ static int amlogic_thermal_probe(struct platform_device *pdev)
 		return PTR_ERR(pdata->sec_ao_map);
 	}
 
-	pdata->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
-							  0,
-							  pdata,
-							  &amlogic_thermal_ops);
+	pdata->tzd = devm_thermal_of_zone_register(&pdev->dev,
+						   0,
+						   pdata,
+						   &amlogic_thermal_ops);
 	if (IS_ERR(pdata->tzd)) {
 		ret = PTR_ERR(pdata->tzd);
 		dev_err(dev, "Failed to register tsensor: %d\n", ret);
-- 
cgit v1.2.3


From e4a1150e3e8d708e989c9f7056320fbff4a2d0c4 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:30 +0200
Subject: thermal/drivers/armada: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-15-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/armada_thermal.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/thermal/armada_thermal.c b/drivers/thermal/armada_thermal.c
index c2ebfb5be4b3..52d63b3997fe 100644
--- a/drivers/thermal/armada_thermal.c
+++ b/drivers/thermal/armada_thermal.c
@@ -420,9 +420,9 @@ static struct thermal_zone_device_ops legacy_ops = {
 	.get_temp = armada_get_temp_legacy,
 };
 
-static int armada_get_temp(void *_sensor, int *temp)
+static int armada_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct armada_thermal_sensor *sensor = _sensor;
+	struct armada_thermal_sensor *sensor = tz->devdata;
 	struct armada_thermal_priv *priv = sensor->priv;
 	int ret;
 
@@ -450,7 +450,7 @@ unlock_mutex:
 	return ret;
 }
 
-static const struct thermal_zone_of_device_ops of_ops = {
+static const struct thermal_zone_device_ops of_ops = {
 	.get_temp = armada_get_temp,
 };
 
@@ -928,9 +928,9 @@ static int armada_thermal_probe(struct platform_device *pdev)
 		/* Register the sensor */
 		sensor->priv = priv;
 		sensor->id = sensor_id;
-		tz = devm_thermal_zone_of_sensor_register(&pdev->dev,
-							  sensor->id, sensor,
-							  &of_ops);
+		tz = devm_thermal_of_zone_register(&pdev->dev,
+						   sensor->id, sensor,
+						   &of_ops);
 		if (IS_ERR(tz)) {
 			dev_info(&pdev->dev, "Thermal sensor %d unavailable\n",
 				 sensor_id);
-- 
cgit v1.2.3


From 2320be6032e1c1b17a3fcac98813947d1d28c32f Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:31 +0200
Subject: thermal/drivers/db8500: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-16-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/db8500_thermal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 121cf853e545..cb10e280681f 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -58,9 +58,9 @@ struct db8500_thermal_zone {
 };
 
 /* Callback to get current temperature */
-static int db8500_thermal_get_temp(void *data, int *temp)
+static int db8500_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct db8500_thermal_zone *th = data;
+	struct db8500_thermal_zone *th = tz->devdata;
 
 	/*
 	 * TODO: There is no PRCMU interface to get temperature data currently,
@@ -72,7 +72,7 @@ static int db8500_thermal_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static struct thermal_zone_of_device_ops thdev_ops = {
+static const struct thermal_zone_device_ops thdev_ops = {
 	.get_temp = db8500_thermal_get_temp,
 };
 
@@ -182,7 +182,7 @@ static int db8500_thermal_probe(struct platform_device *pdev)
 	}
 
 	/* register of thermal sensor and get info from DT */
-	th->tz = devm_thermal_zone_of_sensor_register(dev, 0, th, &thdev_ops);
+	th->tz = devm_thermal_of_zone_register(dev, 0, th, &thdev_ops);
 	if (IS_ERR(th->tz)) {
 		dev_err(dev, "register thermal zone sensor failed\n");
 		return PTR_ERR(th->tz);
-- 
cgit v1.2.3


From 32fb9a8a9d0db3edee50f9c2fcc74fcc26812b86 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:32 +0200
Subject: thermal/drivers/imx: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-17-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/imx8mm_thermal.c | 14 +++++++-------
 drivers/thermal/imx_sc_thermal.c | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c
index af666bd9e8d4..e2c2673025a7 100644
--- a/drivers/thermal/imx8mm_thermal.c
+++ b/drivers/thermal/imx8mm_thermal.c
@@ -96,15 +96,15 @@ static int imx8mp_tmu_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static int tmu_get_temp(void *data, int *temp)
+static int tmu_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct tmu_sensor *sensor = data;
+	struct tmu_sensor *sensor = tz->devdata;
 	struct imx8mm_tmu *tmu = sensor->priv;
 
-	return tmu->socdata->get_temp(data, temp);
+	return tmu->socdata->get_temp(sensor, temp);
 }
 
-static struct thermal_zone_of_device_ops tmu_tz_ops = {
+static const struct thermal_zone_device_ops tmu_tz_ops = {
 	.get_temp = tmu_get_temp,
 };
 
@@ -165,9 +165,9 @@ static int imx8mm_tmu_probe(struct platform_device *pdev)
 	for (i = 0; i < data->num_sensors; i++) {
 		tmu->sensors[i].priv = tmu;
 		tmu->sensors[i].tzd =
-			devm_thermal_zone_of_sensor_register(&pdev->dev, i,
-							     &tmu->sensors[i],
-							     &tmu_tz_ops);
+			devm_thermal_of_zone_register(&pdev->dev, i,
+						      &tmu->sensors[i],
+						      &tmu_tz_ops);
 		if (IS_ERR(tmu->sensors[i].tzd)) {
 			ret = PTR_ERR(tmu->sensors[i].tzd);
 			dev_err(&pdev->dev,
diff --git a/drivers/thermal/imx_sc_thermal.c b/drivers/thermal/imx_sc_thermal.c
index 331a241eb0ef..10bfa6507eb4 100644
--- a/drivers/thermal/imx_sc_thermal.c
+++ b/drivers/thermal/imx_sc_thermal.c
@@ -43,11 +43,11 @@ struct imx_sc_msg_misc_get_temp {
 	} data;
 } __packed __aligned(4);
 
-static int imx_sc_thermal_get_temp(void *data, int *temp)
+static int imx_sc_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
 	struct imx_sc_msg_misc_get_temp msg;
 	struct imx_sc_rpc_msg *hdr = &msg.hdr;
-	struct imx_sc_sensor *sensor = data;
+	struct imx_sc_sensor *sensor = tz->devdata;
 	int ret;
 
 	msg.data.req.resource_id = sensor->resource_id;
@@ -70,7 +70,7 @@ static int imx_sc_thermal_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops imx_sc_thermal_ops = {
+static const struct thermal_zone_device_ops imx_sc_thermal_ops = {
 	.get_temp = imx_sc_thermal_get_temp,
 };
 
@@ -109,10 +109,10 @@ static int imx_sc_thermal_probe(struct platform_device *pdev)
 			break;
 		}
 
-		sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
-								   sensor->resource_id,
-								   sensor,
-								   &imx_sc_thermal_ops);
+		sensor->tzd = devm_thermal_of_zone_register(&pdev->dev,
+							    sensor->resource_id,
+							    sensor,
+							    &imx_sc_thermal_ops);
 		if (IS_ERR(sensor->tzd)) {
 			dev_err(&pdev->dev, "failed to register thermal zone\n");
 			ret = PTR_ERR(sensor->tzd);
-- 
cgit v1.2.3


From 2ebd4f2f2ecfde86ce490f02b28b3282d93aa405 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:33 +0200
Subject: thermal/drivers/rcar: Switch to new of API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Tested-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://lore.kernel.org/r/20220804224349.1926752-18-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/rcar_gen3_thermal.c | 16 ++++++++--------
 drivers/thermal/rcar_thermal.c      | 13 +++----------
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c
index cda7c52f2319..4c1c6f89aa2f 100644
--- a/drivers/thermal/rcar_gen3_thermal.c
+++ b/drivers/thermal/rcar_gen3_thermal.c
@@ -164,9 +164,9 @@ static int rcar_gen3_thermal_round(int temp)
 	return result * RCAR3_THERMAL_GRAN;
 }
 
-static int rcar_gen3_thermal_get_temp(void *devdata, int *temp)
+static int rcar_gen3_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct rcar_gen3_thermal_tsc *tsc = devdata;
+	struct rcar_gen3_thermal_tsc *tsc = tz->devdata;
 	int mcelsius, val;
 	int reg;
 
@@ -203,9 +203,9 @@ static int rcar_gen3_thermal_mcelsius_to_temp(struct rcar_gen3_thermal_tsc *tsc,
 	return INT_FIXPT(val);
 }
 
-static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
+static int rcar_gen3_thermal_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct rcar_gen3_thermal_tsc *tsc = devdata;
+	struct rcar_gen3_thermal_tsc *tsc = tz->devdata;
 	u32 irqmsk = 0;
 
 	if (low != -INT_MAX) {
@@ -225,7 +225,7 @@ static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
 	return 0;
 }
 
-static struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
+static struct thermal_zone_device_ops rcar_gen3_tz_of_ops = {
 	.get_temp	= rcar_gen3_thermal_get_temp,
 	.set_trips	= rcar_gen3_thermal_set_trips,
 };
@@ -508,8 +508,8 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev)
 	for (i = 0; i < priv->num_tscs; i++) {
 		struct rcar_gen3_thermal_tsc *tsc = priv->tscs[i];
 
-		zone = devm_thermal_zone_of_sensor_register(dev, i, tsc,
-							    &rcar_gen3_tz_of_ops);
+		zone = devm_thermal_of_zone_register(dev, i, tsc,
+						     &rcar_gen3_tz_of_ops);
 		if (IS_ERR(zone)) {
 			dev_err(dev, "Sensor %u: Can't register thermal zone\n", i);
 			ret = PTR_ERR(zone);
@@ -560,7 +560,7 @@ static int __maybe_unused rcar_gen3_thermal_resume(struct device *dev)
 
 		priv->thermal_init(tsc);
 		if (zone->ops->set_trips)
-			rcar_gen3_thermal_set_trips(tsc, zone->prev_low_trip,
+			rcar_gen3_thermal_set_trips(zone, zone->prev_low_trip,
 						    zone->prev_high_trip);
 	}
 
diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c
index 1d729ed4d685..4df42d70d867 100644
--- a/drivers/thermal/rcar_thermal.c
+++ b/drivers/thermal/rcar_thermal.c
@@ -271,13 +271,6 @@ static int rcar_thermal_get_current_temp(struct rcar_thermal_priv *priv,
 	return 0;
 }
 
-static int rcar_thermal_of_get_temp(void *data, int *temp)
-{
-	struct rcar_thermal_priv *priv = data;
-
-	return rcar_thermal_get_current_temp(priv, temp);
-}
-
 static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp)
 {
 	struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone);
@@ -323,8 +316,8 @@ static int rcar_thermal_get_trip_temp(struct thermal_zone_device *zone,
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops rcar_thermal_zone_of_ops = {
-	.get_temp	= rcar_thermal_of_get_temp,
+static struct thermal_zone_device_ops rcar_thermal_zone_of_ops = {
+	.get_temp	= rcar_thermal_get_temp,
 };
 
 static struct thermal_zone_device_ops rcar_thermal_zone_ops = {
@@ -534,7 +527,7 @@ static int rcar_thermal_probe(struct platform_device *pdev)
 			goto error_unregister;
 
 		if (chip->use_of_thermal) {
-			priv->zone = devm_thermal_zone_of_sensor_register(
+			priv->zone = devm_thermal_of_zone_register(
 						dev, i, priv,
 						&rcar_thermal_zone_of_ops);
 		} else {
-- 
cgit v1.2.3


From 396cbbc6b711ef8d329303dc179a7a1c395f1f12 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:34 +0200
Subject: thermal/drivers/rzg2l: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-19-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/rzg2l_thermal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/rzg2l_thermal.c b/drivers/thermal/rzg2l_thermal.c
index 51ae80eda6af..2e0649f38506 100644
--- a/drivers/thermal/rzg2l_thermal.c
+++ b/drivers/thermal/rzg2l_thermal.c
@@ -73,9 +73,9 @@ static inline void rzg2l_thermal_write(struct rzg2l_thermal_priv *priv, u32 reg,
 	iowrite32(data, priv->base + reg);
 }
 
-static int rzg2l_thermal_get_temp(void *devdata, int *temp)
+static int rzg2l_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct rzg2l_thermal_priv *priv = devdata;
+	struct rzg2l_thermal_priv *priv = tz->devdata;
 	u32 result = 0, dsensor, ts_code_ave;
 	int val, i;
 
@@ -114,7 +114,7 @@ static int rzg2l_thermal_get_temp(void *devdata, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops rzg2l_tz_of_ops = {
+static const struct thermal_zone_device_ops rzg2l_tz_of_ops = {
 	.get_temp = rzg2l_thermal_get_temp,
 };
 
@@ -207,8 +207,8 @@ static int rzg2l_thermal_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	zone = devm_thermal_zone_of_sensor_register(dev, 0, priv,
-						    &rzg2l_tz_of_ops);
+	zone = devm_thermal_of_zone_register(dev, 0, priv,
+					     &rzg2l_tz_of_ops);
 	if (IS_ERR(zone)) {
 		dev_err(dev, "Can't register thermal zone");
 		ret = PTR_ERR(zone);
-- 
cgit v1.2.3


From 3e7494b41c41959cd68a3f652e286c1fb7c626fc Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:35 +0200
Subject: thermal/drivers/qoriq: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-20-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/qoriq_thermal.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
index 73049f9bea25..d111e218f362 100644
--- a/drivers/thermal/qoriq_thermal.c
+++ b/drivers/thermal/qoriq_thermal.c
@@ -82,9 +82,9 @@ static struct qoriq_tmu_data *qoriq_sensor_to_data(struct qoriq_sensor *s)
 	return container_of(s, struct qoriq_tmu_data, sensor[s->id]);
 }
 
-static int tmu_get_temp(void *p, int *temp)
+static int tmu_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct qoriq_sensor *qsensor = p;
+	struct qoriq_sensor *qsensor = tz->devdata;
 	struct qoriq_tmu_data *qdata = qoriq_sensor_to_data(qsensor);
 	u32 val;
 	/*
@@ -122,7 +122,7 @@ static int tmu_get_temp(void *p, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops tmu_tz_ops = {
+static const struct thermal_zone_device_ops tmu_tz_ops = {
 	.get_temp = tmu_get_temp,
 };
 
@@ -146,9 +146,9 @@ static int qoriq_tmu_register_tmu_zone(struct device *dev,
 
 		sensor->id = id;
 
-		tzd = devm_thermal_zone_of_sensor_register(dev, id,
-							   sensor,
-							   &tmu_tz_ops);
+		tzd = devm_thermal_of_zone_register(dev, id,
+						    sensor,
+						    &tmu_tz_ops);
 		ret = PTR_ERR_OR_ZERO(tzd);
 		if (ret) {
 			if (ret == -ENODEV)
-- 
cgit v1.2.3


From ab7e865db9a54abd775327f87f32f4d0e6e24109 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:36 +0200
Subject: thermal/drivers/mtk: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20220804224349.1926752-21-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/mtk_thermal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c
index ede94eadddda..8440692e3890 100644
--- a/drivers/thermal/mtk_thermal.c
+++ b/drivers/thermal/mtk_thermal.c
@@ -679,9 +679,9 @@ static int mtk_thermal_bank_temperature(struct mtk_thermal_bank *bank)
 	return max;
 }
 
-static int mtk_read_temp(void *data, int *temperature)
+static int mtk_read_temp(struct thermal_zone_device *tz, int *temperature)
 {
-	struct mtk_thermal *mt = data;
+	struct mtk_thermal *mt = tz->devdata;
 	int i;
 	int tempmax = INT_MIN;
 
@@ -700,7 +700,7 @@ static int mtk_read_temp(void *data, int *temperature)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops mtk_thermal_ops = {
+static const struct thermal_zone_device_ops mtk_thermal_ops = {
 	.get_temp = mtk_read_temp,
 };
 
@@ -1082,8 +1082,8 @@ static int mtk_thermal_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, mt);
 
-	tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
-						     &mtk_thermal_ops);
+	tzdev = devm_thermal_of_zone_register(&pdev->dev, 0, mt,
+					      &mtk_thermal_ops);
 	if (IS_ERR(tzdev)) {
 		ret = PTR_ERR(tzdev);
 		goto err_disable_clk_peri_therm;
-- 
cgit v1.2.3


From b86105ed9f3bfead2aaf3daefa99b694ba5da443 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:37 +0200
Subject: thermal/drivers/banggap: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-22-daniel.lezcano@linexp.org
Reviewed-by: Bryan Brattlof <bb@ti.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/k3_bandgap.c       | 12 ++++++------
 drivers/thermal/k3_j72xx_bandgap.c | 12 +++++-------
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/drivers/thermal/k3_bandgap.c b/drivers/thermal/k3_bandgap.c
index 5d0b3ffc6f46..22c9bcb899c3 100644
--- a/drivers/thermal/k3_bandgap.c
+++ b/drivers/thermal/k3_bandgap.c
@@ -139,9 +139,9 @@ static int k3_bgp_read_temp(struct k3_thermal_data *devdata,
 	return 0;
 }
 
-static int k3_thermal_get_temp(void *devdata, int *temp)
+static int k3_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct k3_thermal_data *data = devdata;
+	struct k3_thermal_data *data = tz->devdata;
 	int ret = 0;
 
 	ret = k3_bgp_read_temp(data, temp);
@@ -151,7 +151,7 @@ static int k3_thermal_get_temp(void *devdata, int *temp)
 	return ret;
 }
 
-static const struct thermal_zone_of_device_ops k3_of_thermal_ops = {
+static const struct thermal_zone_device_ops k3_of_thermal_ops = {
 	.get_temp = k3_thermal_get_temp,
 };
 
@@ -213,9 +213,9 @@ static int k3_bandgap_probe(struct platform_device *pdev)
 		writel(val, data[id].bgp->base + data[id].ctrl_offset);
 
 		data[id].tzd =
-		devm_thermal_zone_of_sensor_register(dev, id,
-						     &data[id],
-						     &k3_of_thermal_ops);
+		devm_thermal_of_zone_register(dev, id,
+					      &data[id],
+					      &k3_of_thermal_ops);
 		if (IS_ERR(data[id].tzd)) {
 			dev_err(dev, "thermal zone device is NULL\n");
 			ret = PTR_ERR(data[id].tzd);
diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c
index 115a44eb4fbf..16b6bcf1bf4f 100644
--- a/drivers/thermal/k3_j72xx_bandgap.c
+++ b/drivers/thermal/k3_j72xx_bandgap.c
@@ -247,9 +247,9 @@ static inline int k3_bgp_read_temp(struct k3_thermal_data *devdata,
 }
 
 /* Get temperature callback function for thermal zone */
-static int k3_thermal_get_temp(void *devdata, int *temp)
+static int k3_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct k3_thermal_data *data = devdata;
+	struct k3_thermal_data *data = tz->devdata;
 	int ret = 0;
 
 	ret = k3_bgp_read_temp(data, temp);
@@ -259,7 +259,7 @@ static int k3_thermal_get_temp(void *devdata, int *temp)
 	return ret;
 }
 
-static const struct thermal_zone_of_device_ops k3_of_thermal_ops = {
+static const struct thermal_zone_device_ops k3_of_thermal_ops = {
 	.get_temp = k3_thermal_get_temp,
 };
 
@@ -474,10 +474,8 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev)
 		writel(val, data[id].bgp->cfg2_base + data[id].ctrl_offset);
 
 		bgp->ts_data[id] = &data[id];
-		ti_thermal =
-		devm_thermal_zone_of_sensor_register(bgp->dev, id,
-						     &data[id],
-						     &k3_of_thermal_ops);
+		ti_thermal = devm_thermal_of_zone_register(bgp->dev, id, &data[id],
+							   &k3_of_thermal_ops);
 		if (IS_ERR(ti_thermal)) {
 			dev_err(bgp->dev, "thermal zone device is NULL\n");
 			ret = PTR_ERR(ti_thermal);
-- 
cgit v1.2.3


From ae11d6a87c3e742418baa591be1e719a95788059 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:38 +0200
Subject: thermal/drivers/maxim: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-23-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/max77620_thermal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/thermal/max77620_thermal.c b/drivers/thermal/max77620_thermal.c
index 82d06c7411eb..6451a55eb582 100644
--- a/drivers/thermal/max77620_thermal.c
+++ b/drivers/thermal/max77620_thermal.c
@@ -44,9 +44,9 @@ struct max77620_therm_info {
  * Return 0 on success otherwise error number to show reason of failure.
  */
 
-static int max77620_thermal_read_temp(void *data, int *temp)
+static int max77620_thermal_read_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct max77620_therm_info *mtherm = data;
+	struct max77620_therm_info *mtherm = tz->devdata;
 	unsigned int val;
 	int ret;
 
@@ -66,7 +66,7 @@ static int max77620_thermal_read_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops max77620_thermal_ops = {
+static const struct thermal_zone_device_ops max77620_thermal_ops = {
 	.get_temp = max77620_thermal_read_temp,
 };
 
@@ -114,7 +114,7 @@ static int max77620_thermal_probe(struct platform_device *pdev)
 	 */
 	device_set_of_node_from_dev(&pdev->dev, pdev->dev.parent);
 
-	mtherm->tz_device = devm_thermal_zone_of_sensor_register(&pdev->dev, 0,
+	mtherm->tz_device = devm_thermal_of_zone_register(&pdev->dev, 0,
 				mtherm, &max77620_thermal_ops);
 	if (IS_ERR(mtherm->tz_device)) {
 		ret = PTR_ERR(mtherm->tz_device);
-- 
cgit v1.2.3


From 5ee7811e9afa4f6a1e6bf9231d096c9e483444a2 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:39 +0200
Subject: thermal/drivers/hisilicon: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-24-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/hisi_thermal.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index 19a242c69ce6..d6974db7aaf7 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -434,9 +434,9 @@ static int hi3660_thermal_probe(struct hisi_thermal_data *data)
 	return 0;
 }
 
-static int hisi_thermal_get_temp(void *__data, int *temp)
+static int hisi_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct hisi_thermal_sensor *sensor = __data;
+	struct hisi_thermal_sensor *sensor = tz->devdata;
 	struct hisi_thermal_data *data = sensor->data;
 
 	*temp = data->ops->get_temp(sensor);
@@ -447,7 +447,7 @@ static int hisi_thermal_get_temp(void *__data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
+static const struct thermal_zone_device_ops hisi_of_thermal_ops = {
 	.get_temp = hisi_thermal_get_temp,
 };
 
@@ -459,7 +459,7 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
 
 	data->ops->irq_handler(sensor);
 
-	hisi_thermal_get_temp(sensor, &temp);
+	temp = data->ops->get_temp(sensor);
 
 	if (temp >= sensor->thres_temp) {
 		dev_crit(&data->pdev->dev,
@@ -484,9 +484,9 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
 	int ret, i;
 	const struct thermal_trip *trip;
 
-	sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
-							   sensor->id, sensor,
-							   &hisi_of_thermal_ops);
+	sensor->tzd = devm_thermal_of_zone_register(&pdev->dev,
+						    sensor->id, sensor,
+						    &hisi_of_thermal_ops);
 	if (IS_ERR(sensor->tzd)) {
 		ret = PTR_ERR(sensor->tzd);
 		sensor->tzd = NULL;
-- 
cgit v1.2.3


From 2cf3c72a3ffba080b8188a07c19514cd43df6097 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:40 +0200
Subject: thermal/drivers/ti-soc: Switch to new of API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-25-daniel.lezcano@linexp.org
Acked-by: Keerthy <j-keerthy@ti.com>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/ti-soc-thermal/ti-thermal-common.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index 703039d8b937..8a9055bd376e 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -65,10 +65,10 @@ static inline int ti_thermal_hotspot_temperature(int t, int s, int c)
 
 /* thermal zone ops */
 /* Get temperature callback function for thermal zone */
-static inline int __ti_thermal_get_temp(void *devdata, int *temp)
+static inline int __ti_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
 	struct thermal_zone_device *pcb_tz = NULL;
-	struct ti_thermal_data *data = devdata;
+	struct ti_thermal_data *data = tz->devdata;
 	struct ti_bandgap *bgp;
 	const struct ti_temp_sensor *s;
 	int ret, tmp, slope, constant;
@@ -85,8 +85,8 @@ static inline int __ti_thermal_get_temp(void *devdata, int *temp)
 		return ret;
 
 	/* Default constants */
-	slope = thermal_zone_get_slope(data->ti_thermal);
-	constant = thermal_zone_get_offset(data->ti_thermal);
+	slope = thermal_zone_get_slope(tz);
+	constant = thermal_zone_get_offset(tz);
 
 	pcb_tz = data->pcb_tz;
 	/* In case pcb zone is available, use the extrapolation rule with it */
@@ -107,9 +107,9 @@ static inline int __ti_thermal_get_temp(void *devdata, int *temp)
 	return ret;
 }
 
-static int __ti_thermal_get_trend(void *p, int trip, enum thermal_trend *trend)
+static int __ti_thermal_get_trend(struct thermal_zone_device *tz, int trip, enum thermal_trend *trend)
 {
-	struct ti_thermal_data *data = p;
+	struct ti_thermal_data *data = tz->devdata;
 	struct ti_bandgap *bgp;
 	int id, tr, ret = 0;
 
@@ -130,7 +130,7 @@ static int __ti_thermal_get_trend(void *p, int trip, enum thermal_trend *trend)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops ti_of_thermal_ops = {
+static const struct thermal_zone_device_ops ti_of_thermal_ops = {
 	.get_temp = __ti_thermal_get_temp,
 	.get_trend = __ti_thermal_get_trend,
 };
@@ -170,7 +170,7 @@ int ti_thermal_expose_sensor(struct ti_bandgap *bgp, int id,
 		return -EINVAL;
 
 	/* in case this is specified by DT */
-	data->ti_thermal = devm_thermal_zone_of_sensor_register(bgp->dev, id,
+	data->ti_thermal = devm_thermal_of_zone_register(bgp->dev, id,
 					data, &ti_of_thermal_ops);
 	if (IS_ERR(data->ti_thermal)) {
 		dev_err(bgp->dev, "thermal zone device is NULL\n");
-- 
cgit v1.2.3


From f1d8b5042ecf9e99294109bb5a1566f6a2039c89 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:41 +0200
Subject: ata/drivers/ahci_imx: Switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

sata_ahci_read_temperature() is used by sata_ahci_show_temp() also.

So in order to change the function prototype for the get_temp ops which
does not take a void* but a thermal_zone_device* structure, this
function wraps the call.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Link: https://lore.kernel.org/r/20220804224349.1926752-26-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/ata/ahci_imx.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c
index 79aa9f285312..b734e069034d 100644
--- a/drivers/ata/ahci_imx.c
+++ b/drivers/ata/ahci_imx.c
@@ -327,7 +327,7 @@ static int read_adc_sum(void *dev, u16 rtune_ctl_reg, void __iomem * mmio)
 }
 
 /* SATA AHCI temperature monitor */
-static int sata_ahci_read_temperature(void *dev, int *temp)
+static int __sata_ahci_read_temperature(void *dev, int *temp)
 {
 	u16 mpll_test_reg, rtune_ctl_reg, dac_ctl_reg, read_sum;
 	u32 str1, str2, str3, str4;
@@ -416,6 +416,11 @@ static int sata_ahci_read_temperature(void *dev, int *temp)
 	return 0;
 }
 
+static int sata_ahci_read_temperature(struct thermal_zone_device *tz, int *temp)
+{
+	return __sata_ahci_read_temperature(tz->devdata, temp);
+}
+
 static ssize_t sata_ahci_show_temp(struct device *dev,
 				   struct device_attribute *da,
 				   char *buf)
@@ -423,14 +428,14 @@ static ssize_t sata_ahci_show_temp(struct device *dev,
 	unsigned int temp = 0;
 	int err;
 
-	err = sata_ahci_read_temperature(dev, &temp);
+	err = __sata_ahci_read_temperature(dev, &temp);
 	if (err < 0)
 		return err;
 
 	return sprintf(buf, "%u\n", temp);
 }
 
-static const struct thermal_zone_of_device_ops fsl_sata_ahci_of_thermal_ops = {
+static const struct thermal_zone_device_ops fsl_sata_ahci_of_thermal_ops = {
 	.get_temp = sata_ahci_read_temperature,
 };
 
@@ -1131,8 +1136,8 @@ static int imx_ahci_probe(struct platform_device *pdev)
 			ret = PTR_ERR(hwmon_dev);
 			goto disable_clk;
 		}
-		devm_thermal_zone_of_sensor_register(hwmon_dev, 0, hwmon_dev,
-					     &fsl_sata_ahci_of_thermal_ops);
+		devm_thermal_of_zone_register(hwmon_dev, 0, hwmon_dev,
+					      &fsl_sata_ahci_of_thermal_ops);
 		dev_info(dev, "%s: sensor 'sata_ahci'\n", dev_name(hwmon_dev));
 	}
 
-- 
cgit v1.2.3


From 613ed3f67609291c9b757d62f1f6734e40cb60d8 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:42 +0200
Subject: hwmon: pm_bus: core: Switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-27-daniel.lezcano@linexp.org
Acked-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/hwmon/pmbus/pmbus_core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/hwmon/pmbus/pmbus_core.c b/drivers/hwmon/pmbus/pmbus_core.c
index f10bac8860fc..5541d26e8623 100644
--- a/drivers/hwmon/pmbus/pmbus_core.c
+++ b/drivers/hwmon/pmbus/pmbus_core.c
@@ -1270,9 +1270,9 @@ struct pmbus_thermal_data {
 	struct pmbus_sensor *sensor;
 };
 
-static int pmbus_thermal_get_temp(void *data, int *temp)
+static int pmbus_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct pmbus_thermal_data *tdata = data;
+	struct pmbus_thermal_data *tdata = tz->devdata;
 	struct pmbus_sensor *sensor = tdata->sensor;
 	struct pmbus_data *pmbus_data = tdata->pmbus_data;
 	struct i2c_client *client = to_i2c_client(pmbus_data->dev);
@@ -1296,7 +1296,7 @@ static int pmbus_thermal_get_temp(void *data, int *temp)
 	return ret;
 }
 
-static const struct thermal_zone_of_device_ops pmbus_thermal_ops = {
+static const struct thermal_zone_device_ops pmbus_thermal_ops = {
 	.get_temp = pmbus_thermal_get_temp,
 };
 
@@ -1314,8 +1314,8 @@ static int pmbus_thermal_add_sensor(struct pmbus_data *pmbus_data,
 	tdata->sensor = sensor;
 	tdata->pmbus_data = pmbus_data;
 
-	tzd = devm_thermal_zone_of_sensor_register(dev, index, tdata,
-						   &pmbus_thermal_ops);
+	tzd = devm_thermal_of_zone_register(dev, index, tdata,
+					    &pmbus_thermal_ops);
 	/*
 	 * If CONFIG_THERMAL_OF is disabled, this returns -ENODEV,
 	 * so ignore that error but forward any other error.
-- 
cgit v1.2.3


From e5181331359d9311b3cc7e09d9d1cb2ffe87f602 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:43 +0200
Subject: hwmon/drivers/core: Switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20220804224349.1926752-28-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/hwmon/hwmon.c      | 14 +++++++-------
 drivers/hwmon/scpi-hwmon.c | 14 +++++++-------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 2e2cd79d89eb..4218750d5a66 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -151,9 +151,9 @@ static DEFINE_IDA(hwmon_ida);
  * between hwmon and thermal_sys modules.
  */
 #ifdef CONFIG_THERMAL_OF
-static int hwmon_thermal_get_temp(void *data, int *temp)
+static int hwmon_thermal_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct hwmon_thermal_data *tdata = data;
+	struct hwmon_thermal_data *tdata = tz->devdata;
 	struct hwmon_device *hwdev = to_hwmon_device(tdata->dev);
 	int ret;
 	long t;
@@ -168,9 +168,9 @@ static int hwmon_thermal_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static int hwmon_thermal_set_trips(void *data, int low, int high)
+static int hwmon_thermal_set_trips(struct thermal_zone_device *tz, int low, int high)
 {
-	struct hwmon_thermal_data *tdata = data;
+	struct hwmon_thermal_data *tdata = tz->devdata;
 	struct hwmon_device *hwdev = to_hwmon_device(tdata->dev);
 	const struct hwmon_chip_info *chip = hwdev->chip;
 	const struct hwmon_channel_info **info = chip->info;
@@ -203,7 +203,7 @@ static int hwmon_thermal_set_trips(void *data, int low, int high)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops hwmon_thermal_ops = {
+static const struct thermal_zone_device_ops hwmon_thermal_ops = {
 	.get_temp = hwmon_thermal_get_temp,
 	.set_trips = hwmon_thermal_set_trips,
 };
@@ -227,8 +227,8 @@ static int hwmon_thermal_add_sensor(struct device *dev, int index)
 	tdata->dev = dev;
 	tdata->index = index;
 
-	tzd = devm_thermal_zone_of_sensor_register(dev, index, tdata,
-						   &hwmon_thermal_ops);
+	tzd = devm_thermal_of_zone_register(dev, index, tdata,
+					    &hwmon_thermal_ops);
 	if (IS_ERR(tzd)) {
 		if (PTR_ERR(tzd) != -ENODEV)
 			return PTR_ERR(tzd);
diff --git a/drivers/hwmon/scpi-hwmon.c b/drivers/hwmon/scpi-hwmon.c
index 5187c6dd5a4f..4d75385f7d5e 100644
--- a/drivers/hwmon/scpi-hwmon.c
+++ b/drivers/hwmon/scpi-hwmon.c
@@ -62,9 +62,9 @@ static void scpi_scale_reading(u64 *value, struct sensor_data *sensor)
 	}
 }
 
-static int scpi_read_temp(void *dev, int *temp)
+static int scpi_read_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct scpi_thermal_zone *zone = dev;
+	struct scpi_thermal_zone *zone = tz->devdata;
 	struct scpi_sensors *scpi_sensors = zone->scpi_sensors;
 	struct scpi_ops *scpi_ops = scpi_sensors->scpi_ops;
 	struct sensor_data *sensor = &scpi_sensors->data[zone->sensor_id];
@@ -121,7 +121,7 @@ scpi_show_label(struct device *dev, struct device_attribute *attr, char *buf)
 	return sprintf(buf, "%s\n", sensor->info.name);
 }
 
-static const struct thermal_zone_of_device_ops scpi_sensor_ops = {
+static const struct thermal_zone_device_ops scpi_sensor_ops = {
 	.get_temp = scpi_read_temp,
 };
 
@@ -275,10 +275,10 @@ static int scpi_hwmon_probe(struct platform_device *pdev)
 
 		zone->sensor_id = i;
 		zone->scpi_sensors = scpi_sensors;
-		z = devm_thermal_zone_of_sensor_register(dev,
-							 sensor->info.sensor_id,
-							 zone,
-							 &scpi_sensor_ops);
+		z = devm_thermal_of_zone_register(dev,
+						  sensor->info.sensor_id,
+						  zone,
+						  &scpi_sensor_ops);
 		/*
 		 * The call to thermal_zone_of_sensor_register returns
 		 * an error for sensors that are not associated with
-- 
cgit v1.2.3


From de15b8403fa9872cb626a52651b257089b34f5d2 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:44 +0200
Subject: iio/drivers/sun4i_gpadc: Switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20220804224349.1926752-29-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/iio/adc/sun4i-gpadc-iio.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/iio/adc/sun4i-gpadc-iio.c b/drivers/iio/adc/sun4i-gpadc-iio.c
index 2d393a4dfff6..a6ade70dedf8 100644
--- a/drivers/iio/adc/sun4i-gpadc-iio.c
+++ b/drivers/iio/adc/sun4i-gpadc-iio.c
@@ -412,9 +412,9 @@ static int sun4i_gpadc_runtime_resume(struct device *dev)
 	return 0;
 }
 
-static int sun4i_gpadc_get_temp(void *data, int *temp)
+static int sun4i_gpadc_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct sun4i_gpadc_iio *info = data;
+	struct sun4i_gpadc_iio *info = tz->devdata;
 	int val, scale, offset;
 
 	if (sun4i_gpadc_temp_read(info->indio_dev, &val))
@@ -428,7 +428,7 @@ static int sun4i_gpadc_get_temp(void *data, int *temp)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops sun4i_ts_tz_ops = {
+static const struct thermal_zone_device_ops sun4i_ts_tz_ops = {
 	.get_temp = &sun4i_gpadc_get_temp,
 };
 
@@ -637,9 +637,9 @@ static int sun4i_gpadc_probe(struct platform_device *pdev)
 	pm_runtime_enable(&pdev->dev);
 
 	if (IS_ENABLED(CONFIG_THERMAL_OF)) {
-		info->tzd = thermal_zone_of_sensor_register(info->sensor_device,
-							    0, info,
-							    &sun4i_ts_tz_ops);
+		info->tzd = devm_thermal_of_zone_register(info->sensor_device,
+							  0, info,
+							  &sun4i_ts_tz_ops);
 		/*
 		 * Do not fail driver probing when failing to register in
 		 * thermal because no thermal DT node is found.
@@ -681,8 +681,6 @@ static int sun4i_gpadc_remove(struct platform_device *pdev)
 	if (!IS_ENABLED(CONFIG_THERMAL_OF))
 		return 0;
 
-	thermal_zone_of_sensor_unregister(info->sensor_device, info->tzd);
-
 	if (!info->no_irq)
 		iio_map_array_unregister(indio_dev);
 
-- 
cgit v1.2.3


From ad662b1d606515a048cef06c025dfb150cebaa8d Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:45 +0200
Subject: Input: sun4i-ts - switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20220804224349.1926752-30-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/input/touchscreen/sun4i-ts.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/input/touchscreen/sun4i-ts.c b/drivers/input/touchscreen/sun4i-ts.c
index 742a7e96c1b5..73eb8f80be6e 100644
--- a/drivers/input/touchscreen/sun4i-ts.c
+++ b/drivers/input/touchscreen/sun4i-ts.c
@@ -192,12 +192,12 @@ static int sun4i_get_temp(const struct sun4i_ts_data *ts, int *temp)
 	return 0;
 }
 
-static int sun4i_get_tz_temp(void *data, int *temp)
+static int sun4i_get_tz_temp(struct thermal_zone_device *tz, int *temp)
 {
-	return sun4i_get_temp(data, temp);
+	return sun4i_get_temp(tz->devdata, temp);
 }
 
-static const struct thermal_zone_of_device_ops sun4i_ts_tz_ops = {
+static const struct thermal_zone_device_ops sun4i_ts_tz_ops = {
 	.get_temp = sun4i_get_tz_temp,
 };
 
@@ -356,8 +356,8 @@ static int sun4i_ts_probe(struct platform_device *pdev)
 	if (IS_ERR(hwmon))
 		return PTR_ERR(hwmon);
 
-	thermal = devm_thermal_zone_of_sensor_register(ts->dev, 0, ts,
-						       &sun4i_ts_tz_ops);
+	thermal = devm_thermal_of_zone_register(ts->dev, 0, ts,
+						&sun4i_ts_tz_ops);
 	if (IS_ERR(thermal))
 		return PTR_ERR(thermal);
 
-- 
cgit v1.2.3


From 826855ff5746d0f98877eaa4a438abc4e7b58fd5 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:46 +0200
Subject: regulator/drivers/max8976: Switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Acked-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-31-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/regulator/max8973-regulator.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/regulator/max8973-regulator.c b/drivers/regulator/max8973-regulator.c
index fdcb0f508984..596cc36aaff6 100644
--- a/drivers/regulator/max8973-regulator.c
+++ b/drivers/regulator/max8973-regulator.c
@@ -434,9 +434,9 @@ static int max8973_init_dcdc(struct max8973_chip *max,
 	return ret;
 }
 
-static int max8973_thermal_read_temp(void *data, int *temp)
+static int max8973_thermal_read_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct max8973_chip *mchip = data;
+	struct max8973_chip *mchip = tz->devdata;
 	unsigned int val;
 	int ret;
 
@@ -465,7 +465,7 @@ static irqreturn_t max8973_thermal_irq(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static const struct thermal_zone_of_device_ops max77621_tz_ops = {
+static const struct thermal_zone_device_ops max77621_tz_ops = {
 	.get_temp = max8973_thermal_read_temp,
 };
 
@@ -479,8 +479,8 @@ static int max8973_thermal_init(struct max8973_chip *mchip)
 	if (mchip->id != MAX77621)
 		return 0;
 
-	tzd = devm_thermal_zone_of_sensor_register(mchip->dev, 0, mchip,
-						   &max77621_tz_ops);
+	tzd = devm_thermal_of_zone_register(mchip->dev, 0, mchip,
+					    &max77621_tz_ops);
 	if (IS_ERR(tzd)) {
 		ret = PTR_ERR(tzd);
 		dev_err(mchip->dev, "Failed to register thermal sensor: %d\n",
-- 
cgit v1.2.3


From 7ea98f70c73ea37d379a76a69fa71653382a1724 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:47 +0200
Subject: thermal/drivers/samsung: Switch to new of thermal API

The thermal OF code has a new API allowing to migrate the OF
initialization to a simpler approach. The ops are no longer device
tree specific and are the generic ones provided by the core code.

Convert the ops to the thermal_zone_device_ops format and use the new
API to register the thermal zone with these generic ops.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-32-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/samsung/exynos_tmu.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index f4ab4c5b4b62..51874d0a284c 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -650,9 +650,9 @@ static void exynos7_tmu_control(struct platform_device *pdev, bool on)
 	writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
 }
 
-static int exynos_get_temp(void *p, int *temp)
+static int exynos_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	struct exynos_tmu_data *data = p;
+	struct exynos_tmu_data *data = tz->devdata;
 	int value, ret = 0;
 
 	if (!data || !data->tmu_read)
@@ -728,9 +728,9 @@ static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
 	writel(val, data->base + emul_con);
 }
 
-static int exynos_tmu_set_emulation(void *drv_data, int temp)
+static int exynos_tmu_set_emulation(struct thermal_zone_device *tz, int temp)
 {
-	struct exynos_tmu_data *data = drv_data;
+	struct exynos_tmu_data *data = tz->devdata;
 	int ret = -EINVAL;
 
 	if (data->soc == SOC_ARCH_EXYNOS4210)
@@ -750,7 +750,7 @@ out:
 }
 #else
 #define exynos4412_tmu_set_emulation NULL
-static int exynos_tmu_set_emulation(void *drv_data, int temp)
+static int exynos_tmu_set_emulation(struct thermal_zone_device *tz, int temp)
 	{ return -EINVAL; }
 #endif /* CONFIG_THERMAL_EMULATION */
 
@@ -997,7 +997,7 @@ static int exynos_map_dt_data(struct platform_device *pdev)
 	return 0;
 }
 
-static const struct thermal_zone_of_device_ops exynos_sensor_ops = {
+static const struct thermal_zone_device_ops exynos_sensor_ops = {
 	.get_temp = exynos_get_temp,
 	.set_emul_temp = exynos_tmu_set_emulation,
 };
@@ -1091,8 +1091,8 @@ static int exynos_tmu_probe(struct platform_device *pdev)
 	 * data->tzd must be registered before calling exynos_tmu_initialize(),
 	 * requesting irq and calling exynos_tmu_control().
 	 */
-	data->tzd = thermal_zone_of_sensor_register(&pdev->dev, 0, data,
-						    &exynos_sensor_ops);
+	data->tzd = devm_thermal_of_zone_register(&pdev->dev, 0, data,
+						  &exynos_sensor_ops);
 	if (IS_ERR(data->tzd)) {
 		ret = PTR_ERR(data->tzd);
 		if (ret != -EPROBE_DEFER)
@@ -1104,21 +1104,19 @@ static int exynos_tmu_probe(struct platform_device *pdev)
 	ret = exynos_tmu_initialize(pdev);
 	if (ret) {
 		dev_err(&pdev->dev, "Failed to initialize TMU\n");
-		goto err_thermal;
+		goto err_sclk;
 	}
 
 	ret = devm_request_irq(&pdev->dev, data->irq, exynos_tmu_irq,
 		IRQF_TRIGGER_RISING | IRQF_SHARED, dev_name(&pdev->dev), data);
 	if (ret) {
 		dev_err(&pdev->dev, "Failed to request irq: %d\n", data->irq);
-		goto err_thermal;
+		goto err_sclk;
 	}
 
 	exynos_tmu_control(pdev, true);
 	return 0;
 
-err_thermal:
-	thermal_zone_of_sensor_unregister(&pdev->dev, data->tzd);
 err_sclk:
 	clk_disable_unprepare(data->sclk);
 err_clk:
@@ -1136,9 +1134,7 @@ err_sensor:
 static int exynos_tmu_remove(struct platform_device *pdev)
 {
 	struct exynos_tmu_data *data = platform_get_drvdata(pdev);
-	struct thermal_zone_device *tzd = data->tzd;
 
-	thermal_zone_of_sensor_unregister(&pdev->dev, tzd);
 	exynos_tmu_control(pdev, false);
 
 	clk_disable_unprepare(data->sclk);
-- 
cgit v1.2.3


From 9326167058e8a5b93179f19fc0368f5324a1f628 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:48 +0200
Subject: thermal/core: Move set_trip_temp ops to the sysfs code

Given the trip points can be set in the thermal zone structure, there
is no need of a specific OF function to do that. Move the code in the
place where it is generic, in the sysfs set_trip_temp storing
function.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-33-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_of.c    | 23 -----------------------
 drivers/thermal/thermal_sysfs.c |  5 ++++-
 2 files changed, 4 insertions(+), 24 deletions(-)

diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index a17087c9295d..1dd6b71bdbdd 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -288,28 +288,6 @@ static int of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip,
 	return 0;
 }
 
-static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip,
-				    int temp)
-{
-	struct __thermal_zone *data = tz->devdata;
-
-	if (trip >= tz->num_trips || trip < 0)
-		return -EDOM;
-
-	if (data->ops && data->ops->set_trip_temp) {
-		int ret;
-
-		ret = data->ops->set_trip_temp(data->sensor_data, trip, temp);
-		if (ret)
-			return ret;
-	}
-
-	/* thermal framework should take care of data->mask & (1 << trip) */
-	tz->trips[trip].temperature = temp;
-
-	return 0;
-}
-
 static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip,
 				    int *hyst)
 {
@@ -350,7 +328,6 @@ static int of_thermal_get_crit_temp(struct thermal_zone_device *tz,
 static struct thermal_zone_device_ops of_thermal_ops = {
 	.get_trip_type = of_thermal_get_trip_type,
 	.get_trip_temp = of_thermal_get_trip_temp,
-	.set_trip_temp = of_thermal_set_trip_temp,
 	.get_trip_hyst = of_thermal_get_trip_hyst,
 	.set_trip_hyst = of_thermal_set_trip_hyst,
 	.get_crit_temp = of_thermal_get_crit_temp,
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 3a8d6e747c25..0f8201060c38 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -115,7 +115,7 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 	int temperature, hyst = 0;
 	enum thermal_trip_type type;
 
-	if (!tz->ops->set_trip_temp)
+	if (!tz->ops->set_trip_temp && !tz->trips)
 		return -EPERM;
 
 	if (sscanf(attr->attr.name, "trip_point_%d_temp", &trip) != 1)
@@ -128,6 +128,9 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
+	if (tz->trips)
+		tz->trips[trip].temperature = temperature;
+
 	if (tz->ops->get_trip_hyst) {
 		ret = tz->ops->get_trip_hyst(tz, trip, &hyst);
 		if (ret)
-- 
cgit v1.2.3


From f59ac19b7f44cab23df84810e2da5f57bdd3a7e7 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linexp.org>
Date: Fri, 5 Aug 2022 00:43:49 +0200
Subject: thermal/of: Remove old OF code

All the drivers are converted to the new OF API, remove the old OF code.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linexp.org>
Link: https://lore.kernel.org/r/20220804224349.1926752-34-daniel.lezcano@linexp.org
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.h |   2 -
 drivers/thermal/thermal_of.c   | 810 +----------------------------------------
 include/linux/thermal.h        |  77 +---
 3 files changed, 16 insertions(+), 873 deletions(-)

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index c991bb290512..2241d2dce017 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -135,13 +135,11 @@ thermal_cooling_device_stats_update(struct thermal_cooling_device *cdev,
 
 /* device tree support */
 #ifdef CONFIG_THERMAL_OF
-int of_parse_thermal_zones(void);
 int of_thermal_get_ntrips(struct thermal_zone_device *);
 bool of_thermal_is_trip_valid(struct thermal_zone_device *, int);
 const struct thermal_trip *
 of_thermal_get_trip_points(struct thermal_zone_device *);
 #else
-static inline int of_parse_thermal_zones(void) { return 0; }
 static inline int of_thermal_get_ntrips(struct thermal_zone_device *tz)
 {
 	return 0;
diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 1dd6b71bdbdd..fd2fb84bf246 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -19,93 +19,6 @@
 
 #include "thermal_core.h"
 
-/***   Private data structures to represent thermal device tree data ***/
-
-/**
- * struct __thermal_cooling_bind_param - a cooling device for a trip point
- * @cooling_device: a pointer to identify the referred cooling device
- * @min: minimum cooling state used at this trip point
- * @max: maximum cooling state used at this trip point
- */
-
-struct __thermal_cooling_bind_param {
-	struct device_node *cooling_device;
-	unsigned long min;
-	unsigned long max;
-};
-
-/**
- * struct __thermal_bind_params - a match between trip and cooling device
- * @tcbp: a pointer to an array of cooling devices
- * @count: number of elements in array
- * @trip_id: the trip point index
- * @usage: the percentage (from 0 to 100) of cooling contribution
- */
-
-struct __thermal_bind_params {
-	struct __thermal_cooling_bind_param *tcbp;
-	unsigned int count;
-	unsigned int trip_id;
-	unsigned int usage;
-};
-
-/**
- * struct __thermal_zone - internal representation of a thermal zone
- * @passive_delay: polling interval while passive cooling is activated
- * @polling_delay: zone polling interval
- * @slope: slope of the temperature adjustment curve
- * @offset: offset of the temperature adjustment curve
- * @ntrips: number of trip points
- * @trips: an array of trip points (0..ntrips - 1)
- * @num_tbps: number of thermal bind params
- * @tbps: an array of thermal bind params (0..num_tbps - 1)
- * @sensor_data: sensor private data used while reading temperature and trend
- * @ops: set of callbacks to handle the thermal zone based on DT
- */
-
-struct __thermal_zone {
-	int passive_delay;
-	int polling_delay;
-	int slope;
-	int offset;
-
-	/* trip data */
-	int ntrips;
-	struct thermal_trip *trips;
-
-	/* cooling binding data */
-	int num_tbps;
-	struct __thermal_bind_params *tbps;
-
-	/* sensor interface */
-	void *sensor_data;
-	const struct thermal_zone_of_device_ops *ops;
-};
-
-/***   DT thermal zone device callbacks   ***/
-
-static int of_thermal_get_temp(struct thermal_zone_device *tz,
-			       int *temp)
-{
-	struct __thermal_zone *data = tz->devdata;
-
-	if (!data->ops || !data->ops->get_temp)
-		return -EINVAL;
-
-	return data->ops->get_temp(data->sensor_data, temp);
-}
-
-static int of_thermal_set_trips(struct thermal_zone_device *tz,
-				int low, int high)
-{
-	struct __thermal_zone *data = tz->devdata;
-
-	if (!data->ops || !data->ops->set_trips)
-		return -EINVAL;
-
-	return data->ops->set_trips(data->sensor_data, low, high);
-}
-
 /**
  * of_thermal_get_ntrips - function to export number of available trip
  *			   points.
@@ -158,114 +71,6 @@ of_thermal_get_trip_points(struct thermal_zone_device *tz)
 }
 EXPORT_SYMBOL_GPL(of_thermal_get_trip_points);
 
-/**
- * of_thermal_set_emul_temp - function to set emulated temperature
- *
- * @tz:	pointer to a thermal zone
- * @temp:	temperature to set
- *
- * This function gives the ability to set emulated value of temperature,
- * which is handy for debugging
- *
- * Return: zero on success, error code otherwise
- */
-static int of_thermal_set_emul_temp(struct thermal_zone_device *tz,
-				    int temp)
-{
-	struct __thermal_zone *data = tz->devdata;
-
-	if (!data->ops || !data->ops->set_emul_temp)
-		return -EINVAL;
-
-	return data->ops->set_emul_temp(data->sensor_data, temp);
-}
-
-static int of_thermal_get_trend(struct thermal_zone_device *tz, int trip,
-				enum thermal_trend *trend)
-{
-	struct __thermal_zone *data = tz->devdata;
-
-	if (!data->ops || !data->ops->get_trend)
-		return -EINVAL;
-
-	return data->ops->get_trend(data->sensor_data, trip, trend);
-}
-
-static int of_thermal_change_mode(struct thermal_zone_device *tz,
-				enum thermal_device_mode mode)
-{
-	struct __thermal_zone *data = tz->devdata;
-
-	return data->ops->change_mode(data->sensor_data, mode);
-}
-
-static int of_thermal_bind(struct thermal_zone_device *thermal,
-			   struct thermal_cooling_device *cdev)
-{
-	struct __thermal_zone *data = thermal->devdata;
-	struct __thermal_bind_params *tbp;
-	struct __thermal_cooling_bind_param *tcbp;
-	int i, j;
-
-	if (!data || IS_ERR(data))
-		return -ENODEV;
-
-	/* find where to bind */
-	for (i = 0; i < data->num_tbps; i++) {
-		tbp = data->tbps + i;
-
-		for (j = 0; j < tbp->count; j++) {
-			tcbp = tbp->tcbp + j;
-
-			if (tcbp->cooling_device == cdev->np) {
-				int ret;
-
-				ret = thermal_zone_bind_cooling_device(thermal,
-						tbp->trip_id, cdev,
-						tcbp->max,
-						tcbp->min,
-						tbp->usage);
-				if (ret)
-					return ret;
-			}
-		}
-	}
-
-	return 0;
-}
-
-static int of_thermal_unbind(struct thermal_zone_device *thermal,
-			     struct thermal_cooling_device *cdev)
-{
-	struct __thermal_zone *data = thermal->devdata;
-	struct __thermal_bind_params *tbp;
-	struct __thermal_cooling_bind_param *tcbp;
-	int i, j;
-
-	if (!data || IS_ERR(data))
-		return -ENODEV;
-
-	/* find where to unbind */
-	for (i = 0; i < data->num_tbps; i++) {
-		tbp = data->tbps + i;
-
-		for (j = 0; j < tbp->count; j++) {
-			tcbp = tbp->tcbp + j;
-
-			if (tcbp->cooling_device == cdev->np) {
-				int ret;
-
-				ret = thermal_zone_unbind_cooling_device(thermal,
-							tbp->trip_id, cdev);
-				if (ret)
-					return ret;
-			}
-		}
-	}
-
-	return 0;
-}
-
 static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip,
 				    enum thermal_trip_type *type)
 {
@@ -325,61 +130,6 @@ static int of_thermal_get_crit_temp(struct thermal_zone_device *tz,
 	return -EINVAL;
 }
 
-static struct thermal_zone_device_ops of_thermal_ops = {
-	.get_trip_type = of_thermal_get_trip_type,
-	.get_trip_temp = of_thermal_get_trip_temp,
-	.get_trip_hyst = of_thermal_get_trip_hyst,
-	.set_trip_hyst = of_thermal_set_trip_hyst,
-	.get_crit_temp = of_thermal_get_crit_temp,
-
-	.bind = of_thermal_bind,
-	.unbind = of_thermal_unbind,
-};
-
-/***   sensor API   ***/
-
-static struct thermal_zone_device *
-thermal_zone_of_add_sensor(struct device_node *zone,
-			   struct device_node *sensor, void *data,
-			   const struct thermal_zone_of_device_ops *ops)
-{
-	struct thermal_zone_device *tzd;
-	struct __thermal_zone *tz;
-
-	tzd = thermal_zone_get_zone_by_name(zone->name);
-	if (IS_ERR(tzd))
-		return ERR_PTR(-EPROBE_DEFER);
-
-	tz = tzd->devdata;
-
-	if (!ops)
-		return ERR_PTR(-EINVAL);
-
-	mutex_lock(&tzd->lock);
-	tz->ops = ops;
-	tz->sensor_data = data;
-
-	tzd->ops->get_temp = of_thermal_get_temp;
-	tzd->ops->get_trend = of_thermal_get_trend;
-
-	/*
-	 * The thermal zone core will calculate the window if they have set the
-	 * optional set_trips pointer.
-	 */
-	if (ops->set_trips)
-		tzd->ops->set_trips = of_thermal_set_trips;
-
-	if (ops->set_emul_temp)
-		tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
-
-	if (ops->change_mode)
-		tzd->ops->change_mode = of_thermal_change_mode;
-
-	mutex_unlock(&tzd->lock);
-
-	return tzd;
-}
-
 /**
  * thermal_zone_of_get_sensor_id - get sensor ID from a DT thermal zone
  * @tz_np: a valid thermal zone device node.
@@ -424,216 +174,6 @@ int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
 }
 EXPORT_SYMBOL_GPL(thermal_zone_of_get_sensor_id);
 
-/**
- * thermal_zone_of_sensor_register - registers a sensor to a DT thermal zone
- * @dev: a valid struct device pointer of a sensor device. Must contain
- *       a valid .of_node, for the sensor node.
- * @sensor_id: a sensor identifier, in case the sensor IP has more
- *             than one sensors
- * @data: a private pointer (owned by the caller) that will be passed
- *        back, when a temperature reading is needed.
- * @ops: struct thermal_zone_of_device_ops *. Must contain at least .get_temp.
- *
- * This function will search the list of thermal zones described in device
- * tree and look for the zone that refer to the sensor device pointed by
- * @dev->of_node as temperature providers. For the zone pointing to the
- * sensor node, the sensor will be added to the DT thermal zone device.
- *
- * The thermal zone temperature is provided by the @get_temp function
- * pointer. When called, it will have the private pointer @data back.
- *
- * The thermal zone temperature trend is provided by the @get_trend function
- * pointer. When called, it will have the private pointer @data back.
- *
- * TODO:
- * 01 - This function must enqueue the new sensor instead of using
- * it as the only source of temperature values.
- *
- * 02 - There must be a way to match the sensor with all thermal zones
- * that refer to it.
- *
- * Return: On success returns a valid struct thermal_zone_device,
- * otherwise, it returns a corresponding ERR_PTR(). Caller must
- * check the return value with help of IS_ERR() helper.
- */
-struct thermal_zone_device *
-thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
-				const struct thermal_zone_of_device_ops *ops)
-{
-	struct device_node *np, *child, *sensor_np;
-	struct thermal_zone_device *tzd = ERR_PTR(-ENODEV);
-	static int old_tz_initialized;
-	int ret;
-
-	if (!old_tz_initialized) {
-		ret = of_parse_thermal_zones();
-		if (ret)
-			return ERR_PTR(ret);
-		old_tz_initialized = 1;
-	}
-
-	np = of_find_node_by_name(NULL, "thermal-zones");
-	if (!np)
-		return ERR_PTR(-ENODEV);
-
-	if (!dev || !dev->of_node) {
-		of_node_put(np);
-		return ERR_PTR(-ENODEV);
-	}
-
-	sensor_np = of_node_get(dev->of_node);
-
-	for_each_available_child_of_node(np, child) {
-		int ret, id;
-
-		/* For now, thermal framework supports only 1 sensor per zone */
-		ret = thermal_zone_of_get_sensor_id(child, sensor_np, &id);
-		if (ret)
-			continue;
-
-		if (id == sensor_id) {
-			tzd = thermal_zone_of_add_sensor(child, sensor_np,
-							 data, ops);
-			if (!IS_ERR(tzd))
-				thermal_zone_device_enable(tzd);
-
-			of_node_put(child);
-			goto exit;
-		}
-	}
-exit:
-	of_node_put(sensor_np);
-	of_node_put(np);
-
-	return tzd;
-}
-EXPORT_SYMBOL_GPL(thermal_zone_of_sensor_register);
-
-/**
- * thermal_zone_of_sensor_unregister - unregisters a sensor from a DT thermal zone
- * @dev: a valid struct device pointer of a sensor device. Must contain
- *       a valid .of_node, for the sensor node.
- * @tzd: a pointer to struct thermal_zone_device where the sensor is registered.
- *
- * This function removes the sensor callbacks and private data from the
- * thermal zone device registered with thermal_zone_of_sensor_register()
- * API. It will also silent the zone by remove the .get_temp() and .get_trend()
- * thermal zone device callbacks.
- *
- * TODO: When the support to several sensors per zone is added, this
- * function must search the sensor list based on @dev parameter.
- *
- */
-void thermal_zone_of_sensor_unregister(struct device *dev,
-				       struct thermal_zone_device *tzd)
-{
-	struct __thermal_zone *tz;
-
-	if (!dev || !tzd || !tzd->devdata)
-		return;
-
-	tz = tzd->devdata;
-
-	/* no __thermal_zone, nothing to be done */
-	if (!tz)
-		return;
-
-	/* stop temperature polling */
-	thermal_zone_device_disable(tzd);
-
-	mutex_lock(&tzd->lock);
-	tzd->ops->get_temp = NULL;
-	tzd->ops->get_trend = NULL;
-	tzd->ops->set_emul_temp = NULL;
-	tzd->ops->change_mode = NULL;
-
-	tz->ops = NULL;
-	tz->sensor_data = NULL;
-	mutex_unlock(&tzd->lock);
-}
-EXPORT_SYMBOL_GPL(thermal_zone_of_sensor_unregister);
-
-static void devm_thermal_zone_of_sensor_release(struct device *dev, void *res)
-{
-	thermal_zone_of_sensor_unregister(dev,
-					  *(struct thermal_zone_device **)res);
-}
-
-static int devm_thermal_zone_of_sensor_match(struct device *dev, void *res,
-					     void *data)
-{
-	struct thermal_zone_device **r = res;
-
-	if (WARN_ON(!r || !*r))
-		return 0;
-
-	return *r == data;
-}
-
-/**
- * devm_thermal_zone_of_sensor_register - Resource managed version of
- *				thermal_zone_of_sensor_register()
- * @dev: a valid struct device pointer of a sensor device. Must contain
- *       a valid .of_node, for the sensor node.
- * @sensor_id: a sensor identifier, in case the sensor IP has more
- *	       than one sensors
- * @data: a private pointer (owned by the caller) that will be passed
- *	  back, when a temperature reading is needed.
- * @ops: struct thermal_zone_of_device_ops *. Must contain at least .get_temp.
- *
- * Refer thermal_zone_of_sensor_register() for more details.
- *
- * Return: On success returns a valid struct thermal_zone_device,
- * otherwise, it returns a corresponding ERR_PTR(). Caller must
- * check the return value with help of IS_ERR() helper.
- * Registered thermal_zone_device device will automatically be
- * released when device is unbounded.
- */
-struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
-	struct device *dev, int sensor_id,
-	void *data, const struct thermal_zone_of_device_ops *ops)
-{
-	struct thermal_zone_device **ptr, *tzd;
-
-	ptr = devres_alloc(devm_thermal_zone_of_sensor_release, sizeof(*ptr),
-			   GFP_KERNEL);
-	if (!ptr)
-		return ERR_PTR(-ENOMEM);
-
-	tzd = thermal_zone_of_sensor_register(dev, sensor_id, data, ops);
-	if (IS_ERR(tzd)) {
-		devres_free(ptr);
-		return tzd;
-	}
-
-	*ptr = tzd;
-	devres_add(dev, ptr);
-
-	return tzd;
-}
-EXPORT_SYMBOL_GPL(devm_thermal_zone_of_sensor_register);
-
-/**
- * devm_thermal_zone_of_sensor_unregister - Resource managed version of
- *				thermal_zone_of_sensor_unregister().
- * @dev: Device for which which resource was allocated.
- * @tzd: a pointer to struct thermal_zone_device where the sensor is registered.
- *
- * This function removes the sensor callbacks and private data from the
- * thermal zone device registered with devm_thermal_zone_of_sensor_register()
- * API. It will also silent the zone by remove the .get_temp() and .get_trend()
- * thermal zone device callbacks.
- * Normally this function will not need to be called and the resource
- * management code will ensure that the resource is freed.
- */
-void devm_thermal_zone_of_sensor_unregister(struct device *dev,
-					    struct thermal_zone_device *tzd)
-{
-	WARN_ON(devres_release(dev, devm_thermal_zone_of_sensor_release,
-			       devm_thermal_zone_of_sensor_match, tzd));
-}
-EXPORT_SYMBOL_GPL(devm_thermal_zone_of_sensor_unregister);
-
 /***   functions parsing device tree nodes   ***/
 
 static int of_find_trip_id(struct device_node *np, struct device_node *trip)
@@ -665,98 +205,6 @@ out:
 	return i;
 }
 
-/**
- * thermal_of_populate_bind_params - parse and fill cooling map data
- * @np: DT node containing a cooling-map node
- * @__tbp: data structure to be filled with cooling map info
- * @trips: array of thermal zone trip points
- * @ntrips: number of trip points inside trips.
- *
- * This function parses a cooling-map type of node represented by
- * @np parameter and fills the read data into @__tbp data structure.
- * It needs the already parsed array of trip points of the thermal zone
- * in consideration.
- *
- * Return: 0 on success, proper error code otherwise
- */
-static int thermal_of_populate_bind_params(struct device_node *tz_np,
-					   struct device_node *np,
-					   struct __thermal_bind_params *__tbp)
-{
-	struct of_phandle_args cooling_spec;
-	struct __thermal_cooling_bind_param *__tcbp;
-	struct device_node *trip;
-	int ret, i, count;
-	int trip_id;
-	u32 prop;
-
-	/* Default weight. Usage is optional */
-	__tbp->usage = THERMAL_WEIGHT_DEFAULT;
-	ret = of_property_read_u32(np, "contribution", &prop);
-	if (ret == 0)
-		__tbp->usage = prop;
-
-	trip = of_parse_phandle(np, "trip", 0);
-	if (!trip) {
-		pr_err("missing trip property\n");
-		return -ENODEV;
-	}
-
-	trip_id = of_find_trip_id(tz_np, trip);
-	if (trip_id < 0) {
-		ret = trip_id;
-		goto end;
-	}
-
-	__tbp->trip_id = trip_id;
-
-	count = of_count_phandle_with_args(np, "cooling-device",
-					   "#cooling-cells");
-	if (count <= 0) {
-		pr_err("Add a cooling_device property with at least one device\n");
-		ret = -ENOENT;
-		goto end;
-	}
-
-	__tcbp = kcalloc(count, sizeof(*__tcbp), GFP_KERNEL);
-	if (!__tcbp) {
-		ret = -ENOMEM;
-		goto end;
-	}
-
-	for (i = 0; i < count; i++) {
-		ret = of_parse_phandle_with_args(np, "cooling-device",
-				"#cooling-cells", i, &cooling_spec);
-		if (ret < 0) {
-			pr_err("Invalid cooling-device entry\n");
-			goto free_tcbp;
-		}
-
-		__tcbp[i].cooling_device = cooling_spec.np;
-
-		if (cooling_spec.args_count >= 2) { /* at least min and max */
-			__tcbp[i].min = cooling_spec.args[0];
-			__tcbp[i].max = cooling_spec.args[1];
-		} else {
-			pr_err("wrong reference to cooling device, missing limits\n");
-		}
-	}
-
-	__tbp->tcbp = __tcbp;
-	__tbp->count = count;
-
-	goto end;
-
-free_tcbp:
-	for (i = i - 1; i >= 0; i--)
-		of_node_put(__tcbp[i].cooling_device);
-	kfree(__tcbp);
-end:
-	of_node_put(trip);
-
-	return ret;
-}
-
 /*
  * It maps 'enum thermal_trip_type' found in include/linux/thermal.h
  * into the device tree binding of 'trip', property type.
@@ -873,174 +321,6 @@ out_of_node_put:
 	return ERR_PTR(ret);
 }
 
-/**
- * thermal_of_build_thermal_zone - parse and fill one thermal zone data
- * @np: DT node containing a thermal zone node
- *
- * This function parses a thermal zone type of node represented by
- * @np parameter and fills the read data into a __thermal_zone data structure
- * and return this pointer.
- *
- * TODO: Missing properties to parse: thermal-sensor-names
- *
- * Return: On success returns a valid struct __thermal_zone,
- * otherwise, it returns a corresponding ERR_PTR(). Caller must
- * check the return value with help of IS_ERR() helper.
- */
-static struct __thermal_zone
-__init *thermal_of_build_thermal_zone(struct device_node *np)
-{
-	struct device_node *child = NULL, *gchild;
-	struct __thermal_zone *tz;
-	int ret, i;
-	u32 prop, coef[2];
-
-	if (!np) {
-		pr_err("no thermal zone np\n");
-		return ERR_PTR(-EINVAL);
-	}
-
-	tz = kzalloc(sizeof(*tz), GFP_KERNEL);
-	if (!tz)
-		return ERR_PTR(-ENOMEM);
-
-	ret = of_property_read_u32(np, "polling-delay-passive", &prop);
-	if (ret < 0) {
-		pr_err("%pOFn: missing polling-delay-passive property\n", np);
-		goto free_tz;
-	}
-	tz->passive_delay = prop;
-
-	ret = of_property_read_u32(np, "polling-delay", &prop);
-	if (ret < 0) {
-		pr_err("%pOFn: missing polling-delay property\n", np);
-		goto free_tz;
-	}
-	tz->polling_delay = prop;
-
-	/*
-	 * REVIST: for now, the thermal framework supports only
-	 * one sensor per thermal zone. Thus, we are considering
-	 * only the first two values as slope and offset.
-	 */
-	ret = of_property_read_u32_array(np, "coefficients", coef, 2);
-	if (ret == 0) {
-		tz->slope = coef[0];
-		tz->offset = coef[1];
-	} else {
-		tz->slope = 1;
-		tz->offset = 0;
-	}
-
-	tz->trips = thermal_of_trips_init(np, &tz->ntrips);
-	if (IS_ERR(tz->trips)) {
-		ret = PTR_ERR(tz->trips);
-		goto finish;
-	}
-
-	/* cooling-maps */
-	child = of_get_child_by_name(np, "cooling-maps");
-
-	/* cooling-maps not provided */
-	if (!child)
-		goto finish;
-
-	tz->num_tbps = of_get_child_count(child);
-	if (tz->num_tbps == 0)
-		goto finish;
-
-	tz->tbps = kcalloc(tz->num_tbps, sizeof(*tz->tbps), GFP_KERNEL);
-	if (!tz->tbps) {
-		ret = -ENOMEM;
-		goto free_trips;
-	}
-
-	i = 0;
-	for_each_child_of_node(child, gchild) {
-		ret = thermal_of_populate_bind_params(np, gchild, &tz->tbps[i++]);
-		if (ret) {
-			of_node_put(gchild);
-			goto free_tbps;
-		}
-	}
-
-finish:
-	of_node_put(child);
-
-	return tz;
-
-free_tbps:
-	for (i = i - 1; i >= 0; i--) {
-		struct __thermal_bind_params *tbp = tz->tbps + i;
-		int j;
-
-		for (j = 0; j < tbp->count; j++)
-			of_node_put(tbp->tcbp[j].cooling_device);
-
-		kfree(tbp->tcbp);
-	}
-
-	kfree(tz->tbps);
-free_trips:
-	kfree(tz->trips);
-free_tz:
-	kfree(tz);
-	of_node_put(child);
-
-	return ERR_PTR(ret);
-}
-
-static void of_thermal_free_zone(struct __thermal_zone *tz)
-{
-	struct __thermal_bind_params *tbp;
-	int i, j;
-
-	for (i = 0; i < tz->num_tbps; i++) {
-		tbp = tz->tbps + i;
-
-		for (j = 0; j < tbp->count; j++)
-			of_node_put(tbp->tcbp[j].cooling_device);
-
-		kfree(tbp->tcbp);
-	}
-
-	kfree(tz->tbps);
-	kfree(tz->trips);
-	kfree(tz);
-}
-
-/**
- * of_thermal_destroy_zones - remove all zones parsed and allocated resources
- *
- * Finds all zones parsed and added to the thermal framework and remove them
- * from the system, together with their resources.
- *
- */
-static __init void of_thermal_destroy_zones(void)
-{
-	struct device_node *np, *child;
-
-	np = of_find_node_by_name(NULL, "thermal-zones");
-	if (!np) {
-		pr_debug("unable to find thermal zones\n");
-		return;
-	}
-
-	for_each_available_child_of_node(np, child) {
-		struct thermal_zone_device *zone;
-
-		zone = thermal_zone_get_zone_by_name(child->name);
-		if (IS_ERR(zone))
-			continue;
-
-		thermal_zone_device_unregister(zone);
-		kfree(zone->tzp);
-		kfree(zone->ops);
-		of_thermal_free_zone(zone->devdata);
-	}
-	of_node_put(np);
-}
-
 static struct device_node *of_thermal_zone_find(struct device_node *sensor, int id)
 {
 	struct device_node *np, *tz;
@@ -1492,95 +772,7 @@ EXPORT_SYMBOL_GPL(devm_thermal_of_zone_register);
  */
 void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz)
 {
-	WARN_ON(devres_release(dev, devm_thermal_zone_of_sensor_release,
+	WARN_ON(devres_release(dev, devm_thermal_of_zone_release,
 			       devm_thermal_of_zone_match, tz));
 }
 EXPORT_SYMBOL_GPL(devm_thermal_of_zone_unregister);
-
-/**
- * of_parse_thermal_zones - parse device tree thermal data
- *
- * Initialization function that can be called by machine initialization
- * code to parse thermal data and populate the thermal framework
- * with hardware thermal zones info. This function only parses thermal zones.
- * Cooling devices and sensor devices nodes are supposed to be parsed
- * by their respective drivers.
- *
- * Return: 0 on success, proper error code otherwise
- *
- */
-int of_parse_thermal_zones(void)
-{
-	struct device_node *np, *child;
-	struct __thermal_zone *tz;
-	struct thermal_zone_device_ops *ops;
-
-	np = of_find_node_by_name(NULL, "thermal-zones");
-	if (!np) {
-		pr_debug("unable to find thermal zones\n");
-		return 0; /* Run successfully on systems without thermal DT */
-	}
-
-	for_each_available_child_of_node(np, child) {
-		struct thermal_zone_device *zone;
-		struct thermal_zone_params *tzp;
-		int i, mask = 0;
-		u32 prop;
-
-		tz = thermal_of_build_thermal_zone(child);
-		if (IS_ERR(tz)) {
-			pr_err("failed to build thermal zone %pOFn: %ld\n",
-			       child,
-			       PTR_ERR(tz));
-			continue;
-		}
-
-		ops = kmemdup(&of_thermal_ops, sizeof(*ops), GFP_KERNEL);
-		if (!ops)
-			goto exit_free;
-
-		tzp = kzalloc(sizeof(*tzp), GFP_KERNEL);
-		if (!tzp) {
-			kfree(ops);
-			goto exit_free;
-		}
-
-		/* No hwmon because there might be hwmon drivers registering */
-		tzp->no_hwmon = true;
-
-		if (!of_property_read_u32(child, "sustainable-power", &prop))
-			tzp->sustainable_power = prop;
-
-		for (i = 0; i < tz->ntrips; i++)
-			mask |= 1 << i;
-
-		/* these two are left for temperature drivers to use */
-		tzp->slope = tz->slope;
-		tzp->offset = tz->offset;
-
-		zone = thermal_zone_device_register_with_trips(child->name, tz->trips, tz->ntrips,
-							       mask, tz, ops, tzp, tz->passive_delay,
-							       tz->polling_delay);
-		if (IS_ERR(zone)) {
-			pr_err("Failed to build %pOFn zone %ld\n", child,
-			       PTR_ERR(zone));
-			kfree(tzp);
-			kfree(ops);
-			of_thermal_free_zone(tz);
-			/* attempting to build remaining zones still */
-		}
-	}
-	of_node_put(np);
-
-	return 0;
-
-exit_free:
-	of_node_put(child);
-	of_node_put(np);
-	of_thermal_free_zone(tz);
-
-	/* no memory available, so free what we have built */
-	of_thermal_destroy_zones();
-
-	return -ENOMEM;
-}
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index e2ac9d473bd6..86c24ddd5985 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -296,33 +296,6 @@ struct thermal_zone_params {
 	int offset;
 };
 
-/**
- * struct thermal_zone_of_device_ops - callbacks for handling DT based zones
- *
- * Mandatory:
- * @get_temp: a pointer to a function that reads the sensor temperature.
- *
- * Optional:
- * @get_trend: a pointer to a function that reads the sensor temperature trend.
- * @set_trips: a pointer to a function that sets a temperature window. When
- *	       this window is left the driver must inform the thermal core via
- *	       thermal_zone_device_update.
- * @set_emul_temp: a pointer to a function that sets sensor emulated
- *		   temperature.
- * @set_trip_temp: a pointer to a function that sets the trip temperature on
- *		   hardware.
- * @change_mode: a pointer to a function that notifies the thermal zone
- *		   mode change.
- */
-struct thermal_zone_of_device_ops {
-	int (*get_temp)(void *, int *);
-	int (*get_trend)(void *, int, enum thermal_trend *);
-	int (*set_trips)(void *, int, int);
-	int (*set_emul_temp)(void *, int);
-	int (*set_trip_temp)(void *, int, int);
-	int (*change_mode) (void *, enum thermal_device_mode);
-};
-
 /* Function declarations */
 #ifdef CONFIG_THERMAL_OF
 struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
@@ -335,61 +308,41 @@ void thermal_of_zone_unregister(struct thermal_zone_device *tz);
 
 void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz);
 
+void thermal_of_zone_unregister(struct thermal_zone_device *tz);
+
 int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
 				  struct device_node *sensor_np,
 				  u32 *id);
-struct thermal_zone_device *
-thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
-				const struct thermal_zone_of_device_ops *ops);
-void thermal_zone_of_sensor_unregister(struct device *dev,
-				       struct thermal_zone_device *tz);
-struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
-		struct device *dev, int id, void *data,
-		const struct thermal_zone_of_device_ops *ops);
-void devm_thermal_zone_of_sensor_unregister(struct device *dev,
-					    struct thermal_zone_device *tz);
 #else
-
-static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
-					 struct device_node *sensor_np,
-					 u32 *id)
-{
-	return -ENOENT;
-}
-static inline struct thermal_zone_device *
-thermal_zone_of_sensor_register(struct device *dev, int id, void *data,
-				const struct thermal_zone_of_device_ops *ops)
-{
-	return ERR_PTR(-ENODEV);
-}
-
 static inline
-void thermal_zone_of_sensor_unregister(struct device *dev,
-				       struct thermal_zone_device *tz)
+struct thermal_zone_device *thermal_of_zone_register(struct device_node *sensor, int id, void *data,
+						     const struct thermal_zone_device_ops *ops)
 {
+	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
-		struct device *dev, int id, void *data,
-		const struct thermal_zone_of_device_ops *ops)
+static inline
+struct thermal_zone_device *devm_thermal_of_zone_register(struct device *dev, int id, void *data,
+							  const struct thermal_zone_device_ops *ops)
 {
-	return ERR_PTR(-ENODEV);
+	return ERR_PTR(-ENOTSUPP);
 }
 
 static inline void thermal_of_zone_unregister(struct thermal_zone_device *tz)
 {
 }
 
-static inline void devm_thermal_of_zone_unregister(struct device *dev, struct thermal_zone_device *tz)
+static inline void devm_thermal_of_zone_unregister(struct device *dev,
+						   struct thermal_zone_device *tz)
 {
 }
 
-static inline
-void devm_thermal_zone_of_sensor_unregister(struct device *dev,
-					    struct thermal_zone_device *tz)
+static inline int thermal_zone_of_get_sensor_id(struct device_node *tz_np,
+						struct device_node *sensor_np,
+						u32 *id)
 {
+	return -ENOENT;
 }
-
 #endif
 
 #ifdef CONFIG_THERMAL
-- 
cgit v1.2.3


From e920209847c396ca243259160b7d10d0dae17b35 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Tue, 9 Aug 2022 11:43:46 +0800
Subject: thermal/drivers/qcom/spmi-adc-tm5: Remove unnecessary print function
 dev_err()

The print function dev_err() is redundant because platform_get_irq()
already prints an error.

./drivers/thermal/qcom/qcom-spmi-adc-tm5.c:1029:2-9: line 1029 is redundant because platform_get_irq() already prints an error.

Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=1846
Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Link: https://lore.kernel.org/r/20220809034346.128607-1-jiapeng.chong@linux.alibaba.com
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/qcom/qcom-spmi-adc-tm5.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
index add6f40e5e2a..af68adf720cc 100644
--- a/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
+++ b/drivers/thermal/qcom/qcom-spmi-adc-tm5.c
@@ -1025,10 +1025,8 @@ static int adc_tm5_probe(struct platform_device *pdev)
 	adc_tm->base = reg;
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq < 0) {
-		dev_err(dev, "get_irq failed: %d\n", irq);
+	if (irq < 0)
 		return irq;
-	}
 
 	ret = adc_tm5_get_dt_data(adc_tm, node);
 	if (ret) {
-- 
cgit v1.2.3


From 9662756a9a1c34b3ee606dcddfda6a457f89b07f Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 5 Aug 2022 17:38:30 +0200
Subject: thermal/core: Rearm the monitoring only one time

The current code calls monitor_thermal_zone() inside the
handle_thermal_trip() function. But this one is called in a loop for
each trip point which means the monitoring is rearmed several times
for nothing (assuming there could be several passive and active trip
points).

Move the monitor_thermal_zone() function out of the
handle_thermal_trip() function and after the thermal trip loop, so the
timer will be disabled or rearmed one time.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20220805153834.2510142-1-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_core.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 69447aba7e65..ea41ea66702a 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -383,11 +383,6 @@ static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
 		handle_critical_trips(tz, trip, trip_temp, type);
 	else
 		handle_non_critical_trips(tz, trip);
-	/*
-	 * Alright, we handled this trip successfully.
-	 * So, start monitoring again.
-	 */
-	monitor_thermal_zone(tz);
 }
 
 static void update_temperature(struct thermal_zone_device *tz)
@@ -503,6 +498,8 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 
 	for (count = 0; count < tz->num_trips; count++)
 		handle_thermal_trip(tz, count);
+
+	monitor_thermal_zone(tz);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_update);
 
-- 
cgit v1.2.3


From 15a73839e3ced8d418e6c34548f5e2789f9da619 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 5 Aug 2022 17:38:31 +0200
Subject: thermal/core: Rework the monitoring a bit

The should_stop_polling() function wraps the function
thermal_zone_device_is_enabled().

The monitor_thermal_zone() function checks if the thermal zone is
enabled via the should_stop_polling() function.

However, the instant after checking the thermal zone is enabled, this
one can be disabled, so even if that reduces the race window, it does
not prevent that and the monitoring can be set again with the thermal
zone disabled.

For this reason, the function should_stop_polling() is replaced by a
direct check of the thermal zone mode with the mutex locks held, that
prevents the situation described above.

As the semantic is clear with the thermal_zone_is_enabled() function,
we can remove the should_stop_polling() function and replace the check
with the former function.

While at it, reorder the checks to improve the readability of the
monitor_thermal_zone() function.

In the future, the thermal_zone_device_disable() and the
thermal_zone_device_enable() functions should unset / set the polling
timer directly instead of relying on the next
thermal_zone_device_update() call to do that. That will make a
synchronous thermal zone mode change but the locking scheme should be
double checked for that which out of the scope of this change.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20220805153834.2510142-2-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_core.c | 19 +++++--------------
 1 file changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index ea41ea66702a..5408e92a1168 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -295,25 +295,16 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
 		cancel_delayed_work(&tz->poll_queue);
 }
 
-static inline bool should_stop_polling(struct thermal_zone_device *tz)
-{
-	return !thermal_zone_device_is_enabled(tz);
-}
-
 static void monitor_thermal_zone(struct thermal_zone_device *tz)
 {
-	bool stop;
-
-	stop = should_stop_polling(tz);
-
 	mutex_lock(&tz->lock);
 
-	if (!stop && tz->passive)
+	if (tz->mode != THERMAL_DEVICE_ENABLED)
+		thermal_zone_device_set_polling(tz, 0);
+	else if (tz->passive)
 		thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies);
-	else if (!stop && tz->polling_delay_jiffies)
+	else if (tz->polling_delay_jiffies)
 		thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
-	else
-		thermal_zone_device_set_polling(tz, 0);
 
 	mutex_unlock(&tz->lock);
 }
@@ -480,7 +471,7 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 {
 	int count;
 
-	if (should_stop_polling(tz))
+	if (!thermal_zone_device_is_enabled(tz))
 		return;
 
 	if (atomic_read(&in_suspend))
-- 
cgit v1.2.3


From 63561fe36b094729d3d4d274bafaa030b39e89f6 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 5 Aug 2022 17:38:32 +0200
Subject: thermal/governors: Group the thermal zone lock inside the throttle
 function

The thermal zone lock is taken in the different places in the
throttling path.

At the first glance it does not hurt to move them at the beginning and
the end of the 'throttle' function. That will allow a consolidation of
the lock in the next following changes.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20220805153834.2510142-3-daniel.lezcano@linaro.org
---
 drivers/thermal/gov_bang_bang.c       |  8 ++------
 drivers/thermal/gov_fair_share.c      |  1 +
 drivers/thermal/gov_power_allocator.c | 34 +++++++++++++++-------------------
 drivers/thermal/gov_step_wise.c       |  8 ++------
 4 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index 991a1c54296d..f0bff2e0475b 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -31,8 +31,6 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 				trip, trip_temp, tz->temperature,
 				trip_hyst);
 
-	mutex_lock(&tz->lock);
-
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
 		if (instance->trip != trip)
 			continue;
@@ -65,8 +63,6 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 		instance->cdev->updated = false; /* cdev needs update */
 		mutex_unlock(&instance->cdev->lock);
 	}
-
-	mutex_unlock(&tz->lock);
 }
 
 /**
@@ -100,10 +96,10 @@ static int bang_bang_control(struct thermal_zone_device *tz, int trip)
 {
 	struct thermal_instance *instance;
 
-	thermal_zone_trip_update(tz, trip);
-
 	mutex_lock(&tz->lock);
 
+	thermal_zone_trip_update(tz, trip);
+
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node)
 		thermal_cdev_update(instance->cdev);
 
diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index 6a2abcfc648f..5d5ddd648cd2 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -113,6 +113,7 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 	}
 
 	mutex_unlock(&tz->lock);
+
 	return 0;
 }
 
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index 1d5052470967..d3aca236e274 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -392,8 +392,6 @@ static int allocate_power(struct thermal_zone_device *tz,
 	int i, num_actors, total_weight, ret = 0;
 	int trip_max_desired_temperature = params->trip_max_desired_temperature;
 
-	mutex_lock(&tz->lock);
-
 	num_actors = 0;
 	total_weight = 0;
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
@@ -404,10 +402,8 @@ static int allocate_power(struct thermal_zone_device *tz,
 		}
 	}
 
-	if (!num_actors) {
-		ret = -ENODEV;
-		goto unlock;
-	}
+	if (!num_actors)
+		return -ENODEV;
 
 	/*
 	 * We need to allocate five arrays of the same size:
@@ -421,10 +417,8 @@ static int allocate_power(struct thermal_zone_device *tz,
 	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*extra_actor_power));
 	BUILD_BUG_ON(sizeof(*req_power) != sizeof(*weighted_req_power));
 	req_power = kcalloc(num_actors * 5, sizeof(*req_power), GFP_KERNEL);
-	if (!req_power) {
-		ret = -ENOMEM;
-		goto unlock;
-	}
+	if (!req_power)
+		return -ENOMEM;
 
 	max_power = &req_power[num_actors];
 	granted_power = &req_power[2 * num_actors];
@@ -496,8 +490,6 @@ static int allocate_power(struct thermal_zone_device *tz,
 				      control_temp - tz->temperature);
 
 	kfree(req_power);
-unlock:
-	mutex_unlock(&tz->lock);
 
 	return ret;
 }
@@ -576,7 +568,6 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update)
 	struct power_allocator_params *params = tz->governor_data;
 	u32 req_power;
 
-	mutex_lock(&tz->lock);
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
 		struct thermal_cooling_device *cdev = instance->cdev;
 
@@ -598,7 +589,6 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update)
 
 		mutex_unlock(&instance->cdev->lock);
 	}
-	mutex_unlock(&tz->lock);
 }
 
 /**
@@ -707,17 +697,19 @@ static void power_allocator_unbind(struct thermal_zone_device *tz)
 
 static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 {
-	int ret;
+	int ret = 0;
 	int switch_on_temp, control_temp;
 	struct power_allocator_params *params = tz->governor_data;
 	bool update;
 
+	mutex_lock(&tz->lock);
+
 	/*
 	 * We get called for every trip point but we only need to do
 	 * our calculations once
 	 */
 	if (trip != params->trip_max_desired_temperature)
-		return 0;
+		goto out;
 
 	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
 				     &switch_on_temp);
@@ -726,7 +718,7 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 		tz->passive = 0;
 		reset_pid_controller(params);
 		allow_maximum_power(tz, update);
-		return 0;
+		goto out;
 	}
 
 	tz->passive = 1;
@@ -737,10 +729,14 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 		dev_warn(&tz->device,
 			 "Failed to get the maximum desired temperature: %d\n",
 			 ret);
-		return ret;
+		goto out;
 	}
 
-	return allocate_power(tz, control_temp);
+	ret = allocate_power(tz, control_temp);
+
+	mutex_unlock(&tz->lock);
+out:
+	return ret;
 }
 
 static struct thermal_governor thermal_gov_power_allocator = {
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index 9729b46d0258..597a0ebec7a4 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -117,8 +117,6 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 	dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
 				trip, trip_type, trip_temp, trend, throttle);
 
-	mutex_lock(&tz->lock);
-
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
 		if (instance->trip != trip)
 			continue;
@@ -145,8 +143,6 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 		instance->cdev->updated = false; /* cdev needs update */
 		mutex_unlock(&instance->cdev->lock);
 	}
-
-	mutex_unlock(&tz->lock);
 }
 
 /**
@@ -164,10 +160,10 @@ static int step_wise_throttle(struct thermal_zone_device *tz, int trip)
 {
 	struct thermal_instance *instance;
 
-	thermal_zone_trip_update(tz, trip);
-
 	mutex_lock(&tz->lock);
 
+	thermal_zone_trip_update(tz, trip);
+
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node)
 		thermal_cdev_update(instance->cdev);
 
-- 
cgit v1.2.3


From 670a5e356cb6dfc61b87b599eba483af6a3a99ad Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 5 Aug 2022 17:38:33 +0200
Subject: thermal/core: Move the thermal zone lock out of the governors

All the governors throttling ops are taking/releasing the lock at the
beginning and the end of the function.

We can move the mutex to the throttling call site instead.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20220805153834.2510142-4-daniel.lezcano@linaro.org
---
 drivers/thermal/gov_bang_bang.c       |  4 +---
 drivers/thermal/gov_fair_share.c      |  4 +---
 drivers/thermal/gov_power_allocator.c | 16 ++++++----------
 drivers/thermal/gov_step_wise.c       |  4 +---
 drivers/thermal/thermal_core.c        |  2 ++
 5 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index f0bff2e0475b..a08bbe33be96 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -96,15 +96,13 @@ static int bang_bang_control(struct thermal_zone_device *tz, int trip)
 {
 	struct thermal_instance *instance;
 
-	mutex_lock(&tz->lock);
+	lockdep_assert_held(&tz->lock);
 
 	thermal_zone_trip_update(tz, trip);
 
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node)
 		thermal_cdev_update(instance->cdev);
 
-	mutex_unlock(&tz->lock);
-
 	return 0;
 }
 
diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index 5d5ddd648cd2..a4ee4661e9cc 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -82,7 +82,7 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 	int total_instance = 0;
 	int cur_trip_level = get_trip_level(tz);
 
-	mutex_lock(&tz->lock);
+	lockdep_assert_held(&tz->lock);
 
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
 		if (instance->trip != trip)
@@ -112,8 +112,6 @@ static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
 		mutex_unlock(&cdev->lock);
 	}
 
-	mutex_unlock(&tz->lock);
-
 	return 0;
 }
 
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index d3aca236e274..2d1aeaba38a8 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -697,19 +697,19 @@ static void power_allocator_unbind(struct thermal_zone_device *tz)
 
 static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 {
-	int ret = 0;
+	int ret;
 	int switch_on_temp, control_temp;
 	struct power_allocator_params *params = tz->governor_data;
 	bool update;
 
-	mutex_lock(&tz->lock);
+	lockdep_assert_held(&tz->lock);
 
 	/*
 	 * We get called for every trip point but we only need to do
 	 * our calculations once
 	 */
 	if (trip != params->trip_max_desired_temperature)
-		goto out;
+		return 0;
 
 	ret = tz->ops->get_trip_temp(tz, params->trip_switch_on,
 				     &switch_on_temp);
@@ -718,7 +718,7 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 		tz->passive = 0;
 		reset_pid_controller(params);
 		allow_maximum_power(tz, update);
-		goto out;
+		return 0;
 	}
 
 	tz->passive = 1;
@@ -729,14 +729,10 @@ static int power_allocator_throttle(struct thermal_zone_device *tz, int trip)
 		dev_warn(&tz->device,
 			 "Failed to get the maximum desired temperature: %d\n",
 			 ret);
-		goto out;
+		return ret;
 	}
 
-	ret = allocate_power(tz, control_temp);
-
-	mutex_unlock(&tz->lock);
-out:
-	return ret;
+	return allocate_power(tz, control_temp);
 }
 
 static struct thermal_governor thermal_gov_power_allocator = {
diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
index 597a0ebec7a4..cdd3354bc27f 100644
--- a/drivers/thermal/gov_step_wise.c
+++ b/drivers/thermal/gov_step_wise.c
@@ -160,15 +160,13 @@ static int step_wise_throttle(struct thermal_zone_device *tz, int trip)
 {
 	struct thermal_instance *instance;
 
-	mutex_lock(&tz->lock);
+	lockdep_assert_held(&tz->lock);
 
 	thermal_zone_trip_update(tz, trip);
 
 	list_for_each_entry(instance, &tz->thermal_instances, tz_node)
 		thermal_cdev_update(instance->cdev);
 
-	mutex_unlock(&tz->lock);
-
 	return 0;
 }
 
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5408e92a1168..fcac28d28739 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -311,8 +311,10 @@ static void monitor_thermal_zone(struct thermal_zone_device *tz)
 
 static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
 {
+	mutex_lock(&tz->lock);
 	tz->governor ? tz->governor->throttle(tz, trip) :
 		       def_governor->throttle(tz, trip);
+	mutex_unlock(&tz->lock);
 }
 
 void thermal_zone_device_critical(struct thermal_zone_device *tz)
-- 
cgit v1.2.3


From a930da9bf583b2add01fb0e086913664dadaffd0 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 5 Aug 2022 17:38:34 +0200
Subject: thermal/core: Move the mutex inside the thermal_zone_device_update()
 function

All the different calls inside the thermal_zone_device_update()
function take the mutex.

The previous changes move the mutex out of the different functions,
like the throttling ops. Now that the mutexes are all at the same
level in the call stack for the thermal_zone_device_update() function,
they can be moved inside this one.

That has the benefit of:

1. Simplify the code by not having a plethora of places where the lock is taken

2. Probably closes more race windows because releasing the lock from
one line to another can give the opportunity to the thermal zone to change
its state in the meantime. For example, the thermal zone can be
enabled right after checking it is disabled.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20220805153834.2510142-5-daniel.lezcano@linaro.org
---
 drivers/thermal/thermal_core.c    | 32 ++++++-----------
 drivers/thermal/thermal_core.h    |  2 ++
 drivers/thermal/thermal_helpers.c | 73 +++++++++++++++++++++++----------------
 drivers/thermal/thermal_sysfs.c   |  6 +++-
 4 files changed, 61 insertions(+), 52 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index fcac28d28739..481217092cdd 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -297,24 +297,18 @@ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz,
 
 static void monitor_thermal_zone(struct thermal_zone_device *tz)
 {
-	mutex_lock(&tz->lock);
-
 	if (tz->mode != THERMAL_DEVICE_ENABLED)
 		thermal_zone_device_set_polling(tz, 0);
 	else if (tz->passive)
 		thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies);
 	else if (tz->polling_delay_jiffies)
 		thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies);
-
-	mutex_unlock(&tz->lock);
 }
 
 static void handle_non_critical_trips(struct thermal_zone_device *tz, int trip)
 {
-	mutex_lock(&tz->lock);
 	tz->governor ? tz->governor->throttle(tz, trip) :
 		       def_governor->throttle(tz, trip);
-	mutex_unlock(&tz->lock);
 }
 
 void thermal_zone_device_critical(struct thermal_zone_device *tz)
@@ -382,7 +376,7 @@ static void update_temperature(struct thermal_zone_device *tz)
 {
 	int temp, ret;
 
-	ret = thermal_zone_get_temp(tz, &temp);
+	ret = __thermal_zone_get_temp(tz, &temp);
 	if (ret) {
 		if (ret != -EAGAIN)
 			dev_warn(&tz->device,
@@ -391,10 +385,8 @@ static void update_temperature(struct thermal_zone_device *tz)
 		return;
 	}
 
-	mutex_lock(&tz->lock);
 	tz->last_temperature = tz->temperature;
 	tz->temperature = temp;
-	mutex_unlock(&tz->lock);
 
 	trace_thermal_temperature(tz);
 
@@ -457,15 +449,9 @@ EXPORT_SYMBOL_GPL(thermal_zone_device_disable);
 
 int thermal_zone_device_is_enabled(struct thermal_zone_device *tz)
 {
-	enum thermal_device_mode mode;
-
-	mutex_lock(&tz->lock);
-
-	mode = tz->mode;
+	lockdep_assert_held(&tz->lock);
 
-	mutex_unlock(&tz->lock);
-
-	return mode == THERMAL_DEVICE_ENABLED;
+	return tz->mode == THERMAL_DEVICE_ENABLED;
 }
 
 void thermal_zone_device_update(struct thermal_zone_device *tz,
@@ -473,9 +459,6 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 {
 	int count;
 
-	if (!thermal_zone_device_is_enabled(tz))
-		return;
-
 	if (atomic_read(&in_suspend))
 		return;
 
@@ -483,9 +466,14 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 		      "'get_temp' ops set\n", __func__))
 		return;
 
+	mutex_lock(&tz->lock);
+
+	if (!thermal_zone_device_is_enabled(tz))
+		goto out;
+
 	update_temperature(tz);
 
-	thermal_zone_set_trips(tz);
+	__thermal_zone_set_trips(tz);
 
 	tz->notify_event = event;
 
@@ -493,6 +481,8 @@ void thermal_zone_device_update(struct thermal_zone_device *tz,
 		handle_thermal_trip(tz, count);
 
 	monitor_thermal_zone(tz);
+out:
+	mutex_unlock(&tz->lock);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_update);
 
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 2241d2dce017..1571917bd3c8 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -112,6 +112,8 @@ int thermal_build_list_of_policies(char *buf);
 
 /* Helpers */
 void thermal_zone_set_trips(struct thermal_zone_device *tz);
+void __thermal_zone_set_trips(struct thermal_zone_device *tz);
+int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
 
 /* sysfs I/F */
 int thermal_zone_create_device_groups(struct thermal_zone_device *, int);
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 690890f054a3..c65cdce8f856 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -64,27 +64,17 @@ get_thermal_instance(struct thermal_zone_device *tz,
 }
 EXPORT_SYMBOL(get_thermal_instance);
 
-/**
- * thermal_zone_get_temp() - returns the temperature of a thermal zone
- * @tz: a valid pointer to a struct thermal_zone_device
- * @temp: a valid pointer to where to store the resulting temperature.
- *
- * When a valid thermal zone reference is passed, it will fetch its
- * temperature and fill @temp.
- *
- * Return: On success returns 0, an error code otherwise
- */
-int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
+int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 {
 	int ret = -EINVAL;
 	int count;
 	int crit_temp = INT_MAX;
 	enum thermal_trip_type type;
 
-	if (!tz || IS_ERR(tz) || !tz->ops->get_temp)
-		goto exit;
+	lockdep_assert_held(&tz->lock);
 
-	mutex_lock(&tz->lock);
+	if (!tz || IS_ERR(tz) || !tz->ops->get_temp)
+		return -EINVAL;
 
 	ret = tz->ops->get_temp(tz, temp);
 
@@ -107,35 +97,42 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 			*temp = tz->emul_temperature;
 	}
 
-	mutex_unlock(&tz->lock);
-exit:
 	return ret;
 }
-EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
 /**
- * thermal_zone_set_trips - Computes the next trip points for the driver
- * @tz: a pointer to a thermal zone device structure
+ * thermal_zone_get_temp() - returns the temperature of a thermal zone
+ * @tz: a valid pointer to a struct thermal_zone_device
+ * @temp: a valid pointer to where to store the resulting temperature.
  *
- * The function computes the next temperature boundaries by browsing
- * the trip points. The result is the closer low and high trip points
- * to the current temperature. These values are passed to the backend
- * driver to let it set its own notification mechanism (usually an
- * interrupt).
+ * When a valid thermal zone reference is passed, it will fetch its
+ * temperature and fill @temp.
  *
- * It does not return a value
+ * Return: On success returns 0, an error code otherwise
  */
-void thermal_zone_set_trips(struct thermal_zone_device *tz)
+int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
+{
+	int ret;
+
+	mutex_lock(&tz->lock);
+	ret = __thermal_zone_get_temp(tz, temp);
+	mutex_unlock(&tz->lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
+
+void __thermal_zone_set_trips(struct thermal_zone_device *tz)
 {
 	int low = -INT_MAX;
 	int high = INT_MAX;
 	int trip_temp, hysteresis;
 	int i, ret;
 
-	mutex_lock(&tz->lock);
+	lockdep_assert_held(&tz->lock);
 
 	if (!tz->ops->set_trips || !tz->ops->get_trip_hyst)
-		goto exit;
+		return;
 
 	for (i = 0; i < tz->num_trips; i++) {
 		int trip_low;
@@ -154,7 +151,7 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)
 
 	/* No need to change trip points */
 	if (tz->prev_low_trip == low && tz->prev_high_trip == high)
-		goto exit;
+		return;
 
 	tz->prev_low_trip = low;
 	tz->prev_high_trip = high;
@@ -169,8 +166,24 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)
 	ret = tz->ops->set_trips(tz, low, high);
 	if (ret)
 		dev_err(&tz->device, "Failed to set trips: %d\n", ret);
+}
 
-exit:
+/**
+ * thermal_zone_set_trips - Computes the next trip points for the driver
+ * @tz: a pointer to a thermal zone device structure
+ *
+ * The function computes the next temperature boundaries by browsing
+ * the trip points. The result is the closer low and high trip points
+ * to the current temperature. These values are passed to the backend
+ * driver to let it set its own notification mechanism (usually an
+ * interrupt).
+ *
+ * It does not return a value
+ */
+void thermal_zone_set_trips(struct thermal_zone_device *tz)
+{
+	mutex_lock(&tz->lock);
+	__thermal_zone_set_trips(tz);
 	mutex_unlock(&tz->lock);
 }
 
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 0f8201060c38..78c5841bdfae 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -49,7 +49,11 @@ static ssize_t
 mode_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
-	int enabled = thermal_zone_device_is_enabled(tz);
+	int enabled;
+
+	mutex_lock(&tz->lock);
+	enabled = thermal_zone_device_is_enabled(tz);
+	mutex_unlock(&tz->lock);
 
 	return sprintf(buf, "%s\n", enabled ? "enabled" : "disabled");
 }
-- 
cgit v1.2.3


From 2f9d142c93c293b2526d63e4e75716945edf0cd2 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 12 Aug 2022 15:07:43 +0200
Subject: thermal/core: Fix lockdep_assert() warning

The function thermal_zone_device_is_enabled() must be called with the
thermal zone lock held. In the resume path, it is called without.

As the thermal_zone_device_is_enabled() is also checked in
thermal_zone_device_update(), do the check in resume() function is
pointless, except for saving an extra initialization which does not
hurt if it is done in all the cases.

Fixes: ca48ad71717dd ("thermal/core: Move the mutex inside the thermal_zone_device_update() function")
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 drivers/thermal/thermal_core.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 481217092cdd..99c0c19fa5a6 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1438,9 +1438,6 @@ static int thermal_pm_notify(struct notifier_block *nb,
 	case PM_POST_SUSPEND:
 		atomic_set(&in_suspend, 0);
 		list_for_each_entry(tz, &thermal_tz_list, node) {
-			if (!thermal_zone_device_is_enabled(tz))
-				continue;
-
 			thermal_zone_device_init(tz);
 			thermal_zone_device_update(tz,
 						   THERMAL_EVENT_UNSPECIFIED);
-- 
cgit v1.2.3


From 06f36055121769b9eb9b7d28c7499d1cc8269dc3 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 17 Aug 2022 17:30:40 +0200
Subject: Revert "mlxsw: core: Add the hottest thermal zone detection"

This reverts commit 2dc2f760052da4925482ecdcdc5c94d4a599153c and
commit 6f73862fabd93213de157d9cc6ef76084311c628.

As discussed in the thread:

https://lore.kernel.org/all/f3c62ebe-7d59-c537-a010-bff366c8aeba@linaro.org/

the feature provided by commits 2dc2f760052da and 6f73862fabd93 is
actually already handled by the thermal framework via the cooling
device state aggregation, thus all this code is pointless.

The revert conflicts with the following changes:
 - 7f4957be0d5b8: thermal: Use mode helpers in drivers
 - 6a79507cfe94c: mlxsw: core: Extend thermal module with per QSFP module thermal zones

These conflicts were fixed and the resulting changes are in this patch.

Both reverts are in the same change as requested by Ido Schimmel:

https://lore.kernel.org/all/Yvz7+RUsmVco3Xpj@shredder/

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Tested-by: Vadim Pasternak <vadimp@nvidia.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Link: https://lore.kernel.org/r/20220817153040.2464245-1-daniel.lezcano@linaro.org
---
 drivers/net/ethernet/mellanox/mlxsw/core_thermal.c | 77 +---------------------
 1 file changed, 2 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
index 3548fe1df7c8..987fe5c9d5a3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core_thermal.c
@@ -21,7 +21,6 @@
 #define MLXSW_THERMAL_ASIC_TEMP_HOT	105000	/* 105C */
 #define MLXSW_THERMAL_HYSTERESIS_TEMP	5000	/* 5C */
 #define MLXSW_THERMAL_MODULE_TEMP_SHIFT	(MLXSW_THERMAL_HYSTERESIS_TEMP * 2)
-#define MLXSW_THERMAL_TEMP_SCORE_MAX	GENMASK(31, 0)
 #define MLXSW_THERMAL_MAX_STATE	10
 #define MLXSW_THERMAL_MIN_STATE	2
 #define MLXSW_THERMAL_MAX_DUTY	255
@@ -101,8 +100,6 @@ struct mlxsw_thermal {
 	struct thermal_cooling_device *cdevs[MLXSW_MFCR_PWMS_MAX];
 	u8 cooling_levels[MLXSW_THERMAL_MAX_STATE + 1];
 	struct mlxsw_thermal_trip trips[MLXSW_THERMAL_NUM_TRIPS];
-	unsigned int tz_highest_score;
-	struct thermal_zone_device *tz_highest_dev;
 	struct mlxsw_thermal_area line_cards[];
 };
 
@@ -193,34 +190,6 @@ mlxsw_thermal_module_trips_update(struct device *dev, struct mlxsw_core *core,
 	return 0;
 }
 
-static void mlxsw_thermal_tz_score_update(struct mlxsw_thermal *thermal,
-					  struct thermal_zone_device *tzdev,
-					  struct mlxsw_thermal_trip *trips,
-					  int temp)
-{
-	struct mlxsw_thermal_trip *trip = trips;
-	unsigned int score, delta, i, shift = 1;
-
-	/* Calculate thermal zone score, if temperature is above the hot
-	 * threshold score is set to MLXSW_THERMAL_TEMP_SCORE_MAX.
-	 */
-	score = MLXSW_THERMAL_TEMP_SCORE_MAX;
-	for (i = MLXSW_THERMAL_TEMP_TRIP_NORM; i < MLXSW_THERMAL_NUM_TRIPS;
-	     i++, trip++) {
-		if (temp < trip->temp) {
-			delta = DIV_ROUND_CLOSEST(temp, trip->temp - temp);
-			score = delta * shift;
-			break;
-		}
-		shift *= 256;
-	}
-
-	if (score > thermal->tz_highest_score) {
-		thermal->tz_highest_score = score;
-		thermal->tz_highest_dev = tzdev;
-	}
-}
-
 static int mlxsw_thermal_bind(struct thermal_zone_device *tzdev,
 			      struct thermal_cooling_device *cdev)
 {
@@ -286,9 +255,6 @@ static int mlxsw_thermal_get_temp(struct thermal_zone_device *tzdev,
 		return err;
 	}
 	mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL, NULL, NULL);
-	if (temp > 0)
-		mlxsw_thermal_tz_score_update(thermal, tzdev, thermal->trips,
-					      temp);
 
 	*p_temp = temp;
 	return 0;
@@ -349,21 +315,6 @@ static int mlxsw_thermal_set_trip_hyst(struct thermal_zone_device *tzdev,
 	return 0;
 }
 
-static int mlxsw_thermal_trend_get(struct thermal_zone_device *tzdev,
-				   int trip, enum thermal_trend *trend)
-{
-	struct mlxsw_thermal *thermal = tzdev->devdata;
-
-	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS)
-		return -EINVAL;
-
-	if (tzdev == thermal->tz_highest_dev)
-		return 1;
-
-	*trend = THERMAL_TREND_STABLE;
-	return 0;
-}
-
 static struct thermal_zone_params mlxsw_thermal_params = {
 	.no_hwmon = true,
 };
@@ -377,7 +328,6 @@ static struct thermal_zone_device_ops mlxsw_thermal_ops = {
 	.set_trip_temp	= mlxsw_thermal_set_trip_temp,
 	.get_trip_hyst	= mlxsw_thermal_get_trip_hyst,
 	.set_trip_hyst	= mlxsw_thermal_set_trip_hyst,
-	.get_trend	= mlxsw_thermal_trend_get,
 };
 
 static int mlxsw_thermal_module_bind(struct thermal_zone_device *tzdev,
@@ -463,7 +413,6 @@ static int mlxsw_thermal_module_temp_get(struct thermal_zone_device *tzdev,
 	int temp, crit_temp, emerg_temp;
 	struct device *dev;
 	u16 sensor_index;
-	int err;
 
 	dev = thermal->bus_info->dev;
 	sensor_index = MLXSW_REG_MTMP_MODULE_INDEX_MIN + tz->module;
@@ -479,10 +428,8 @@ static int mlxsw_thermal_module_temp_get(struct thermal_zone_device *tzdev,
 		return 0;
 
 	/* Update trip points. */
-	err = mlxsw_thermal_module_trips_update(dev, thermal->core, tz,
-						crit_temp, emerg_temp);
-	if (!err && temp > 0)
-		mlxsw_thermal_tz_score_update(thermal, tzdev, tz->trips, temp);
+	mlxsw_thermal_module_trips_update(dev, thermal->core, tz,
+					  crit_temp, emerg_temp);
 
 	return 0;
 }
@@ -546,22 +493,6 @@ mlxsw_thermal_module_trip_hyst_set(struct thermal_zone_device *tzdev, int trip,
 	return 0;
 }
 
-static int mlxsw_thermal_module_trend_get(struct thermal_zone_device *tzdev,
-					  int trip, enum thermal_trend *trend)
-{
-	struct mlxsw_thermal_module *tz = tzdev->devdata;
-	struct mlxsw_thermal *thermal = tz->parent;
-
-	if (trip < 0 || trip >= MLXSW_THERMAL_NUM_TRIPS)
-		return -EINVAL;
-
-	if (tzdev == thermal->tz_highest_dev)
-		return 1;
-
-	*trend = THERMAL_TREND_STABLE;
-	return 0;
-}
-
 static struct thermal_zone_device_ops mlxsw_thermal_module_ops = {
 	.bind		= mlxsw_thermal_module_bind,
 	.unbind		= mlxsw_thermal_module_unbind,
@@ -571,7 +502,6 @@ static struct thermal_zone_device_ops mlxsw_thermal_module_ops = {
 	.set_trip_temp	= mlxsw_thermal_module_trip_temp_set,
 	.get_trip_hyst	= mlxsw_thermal_module_trip_hyst_get,
 	.set_trip_hyst	= mlxsw_thermal_module_trip_hyst_set,
-	.get_trend	= mlxsw_thermal_module_trend_get,
 };
 
 static int mlxsw_thermal_gearbox_temp_get(struct thermal_zone_device *tzdev,
@@ -592,8 +522,6 @@ static int mlxsw_thermal_gearbox_temp_get(struct thermal_zone_device *tzdev,
 		return err;
 
 	mlxsw_reg_mtmp_unpack(mtmp_pl, &temp, NULL, NULL, NULL, NULL);
-	if (temp > 0)
-		mlxsw_thermal_tz_score_update(thermal, tzdev, tz->trips, temp);
 
 	*p_temp = temp;
 	return 0;
@@ -608,7 +536,6 @@ static struct thermal_zone_device_ops mlxsw_thermal_gearbox_ops = {
 	.set_trip_temp	= mlxsw_thermal_module_trip_temp_set,
 	.get_trip_hyst	= mlxsw_thermal_module_trip_hyst_get,
 	.set_trip_hyst	= mlxsw_thermal_module_trip_hyst_set,
-	.get_trend	= mlxsw_thermal_module_trend_get,
 };
 
 static int mlxsw_thermal_get_max_state(struct thermal_cooling_device *cdev,
-- 
cgit v1.2.3


From 0db7058e8e23e6bbab1b4747ecabd1784c34f50b Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Tue, 24 May 2022 11:01:18 +0200
Subject: x86/clear_user: Make it faster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Based on a patch by Mark Hemment <markhemm@googlemail.com> and
incorporating very sane suggestions from Linus.

The point here is to have the default case with FSRM - which is supposed
to be the majority of x86 hw out there - if not now then soon - be
directly inlined into the instruction stream so that no function call
overhead is taking place.

Drop the early clobbers from the @size and @addr operands as those are
not needed anymore since we have single instruction alternatives.

The benchmarks I ran would show very small improvements and a PF
benchmark would even show weird things like slowdowns with higher core
counts.

So for a ~6m running the git test suite, the function gets called under
700K times, all from padzero():

  <...>-2536    [006] .....   261.208801: padzero: to: 0x55b0663ed214, size: 3564, cycles: 21900
  <...>-2536    [006] .....   261.208819: padzero: to: 0x7f061adca078, size: 3976, cycles: 17160
  <...>-2537    [008] .....   261.211027: padzero: to: 0x5572d019e240, size: 3520, cycles: 23850
  <...>-2537    [008] .....   261.211049: padzero: to: 0x7f1288dc9078, size: 3976, cycles: 15900
   ...

which is around 1%-ish of the total time and which is consistent with
the benchmark numbers.

So Mel gave me the idea to simply measure how fast the function becomes.
I.e.:

  start = rdtsc_ordered();
  ret = __clear_user(to, n);
  end = rdtsc_ordered();

Computing the mean average of all the samples collected during the test
suite run then shows some improvement:

  clear_user_original:
  Amean: 9219.71 (Sum: 6340154910, samples: 687674)

  fsrm:
  Amean: 8030.63 (Sum: 5522277720, samples: 687652)

That's on Zen3.

The situation looks a lot more confusing on Intel:

Icelake:

  clear_user_original:
  Amean: 19679.4 (Sum: 13652560764, samples: 693750)
  Amean: 19743.7 (Sum: 13693470604, samples: 693562)

(I ran it twice just to be sure.)

  ERMS:
  Amean: 20374.3 (Sum: 13910601024, samples: 682752)
  Amean: 20453.7 (Sum: 14186223606, samples: 693576)

  FSRM:
  Amean: 20458.2 (Sum: 13918381386, sample s: 680331)

The original microbenchmark which people were complaining about:

  for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=65536; done 2>&1 | grep copied
  32207011840 bytes (32 GB, 30 GiB) copied, 1 s, 32.2 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.93069 s, 35.6 GB/s
  37597741056 bytes (38 GB, 35 GiB) copied, 1 s, 37.6 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.78017 s, 38.6 GB/s
  62020124672 bytes (62 GB, 58 GiB) copied, 2 s, 31.0 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 2.13716 s, 32.2 GB/s
  60010004480 bytes (60 GB, 56 GiB) copied, 1 s, 60.0 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.14129 s, 60.2 GB/s
  53212086272 bytes (53 GB, 50 GiB) copied, 1 s, 53.2 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.28398 s, 53.5 GB/s
  55698259968 bytes (56 GB, 52 GiB) copied, 1 s, 55.7 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.22507 s, 56.1 GB/s
  55306092544 bytes (55 GB, 52 GiB) copied, 1 s, 55.3 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.23647 s, 55.6 GB/s
  54387539968 bytes (54 GB, 51 GiB) copied, 1 s, 54.4 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.25693 s, 54.7 GB/s
  50566529024 bytes (51 GB, 47 GiB) copied, 1 s, 50.6 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.35096 s, 50.9 GB/s
  58308165632 bytes (58 GB, 54 GiB) copied, 1 s, 58.3 GB/s
  68719476736 bytes (69 GB, 64 GiB) copied, 1.17394 s, 58.5 GB/s

Now the same thing with smaller buffers:

  for i in $(seq 1 10); do dd if=/dev/zero of=/dev/null bs=1M status=progress count=8192; done 2>&1 | grep copied
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28485 s, 30.2 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276112 s, 31.1 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.29136 s, 29.5 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.283803 s, 30.3 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.306503 s, 28.0 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.349169 s, 24.6 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.276912 s, 31.0 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.265356 s, 32.4 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.28464 s, 30.2 GB/s
  8589934592 bytes (8.6 GB, 8.0 GiB) copied, 0.242998 s, 35.3 GB/s

is also not conclusive because it all depends on the buffer sizes,
their alignments and when the microcode detects that cachelines can be
aggregated properly and copied in bigger sizes.

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/CAHk-=wh=Mu_EYhtOmPn6AxoQZyEh-4fo2Zx3G7rBv1g7vwoKiw@mail.gmail.com
---
 arch/x86/include/asm/uaccess.h    |   5 +-
 arch/x86/include/asm/uaccess_64.h |  45 +++++++++++++
 arch/x86/lib/clear_page_64.S      | 138 ++++++++++++++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c        |  40 -----------
 tools/objtool/check.c             |   3 +
 5 files changed, 188 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 913e593a3b45..c46207946e05 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -502,9 +502,6 @@ strncpy_from_user(char *dst, const char __user *src, long count);
 
 extern __must_check long strnlen_user(const char __user *str, long n);
 
-unsigned long __must_check clear_user(void __user *mem, unsigned long len);
-unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
-
 #ifdef CONFIG_ARCH_HAS_COPY_MC
 unsigned long __must_check
 copy_mc_to_kernel(void *to, const void *from, unsigned len);
@@ -526,6 +523,8 @@ extern struct movsl_mask {
 #define ARCH_HAS_NOCACHE_UACCESS 1
 
 #ifdef CONFIG_X86_32
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
 # include <asm/uaccess_32.h>
 #else
 # include <asm/uaccess_64.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 45697e04d771..d13d71af5cf6 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -79,4 +79,49 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
 	kasan_check_write(dst, size);
 	return __copy_user_flushcache(dst, src, size);
 }
+
+/*
+ * Zero Userspace.
+ */
+
+__must_check unsigned long
+clear_user_original(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_rep_good(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_erms(void __user *addr, unsigned long len);
+
+static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+	might_fault();
+	stac();
+
+	/*
+	 * No memory constraint because it doesn't change any memory gcc
+	 * knows about.
+	 */
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE_3("rep stosb",
+			      "call clear_user_erms",	  ALT_NOT(X86_FEATURE_FSRM),
+			      "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
+			      "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+		"2:\n"
+	       _ASM_EXTABLE_UA(1b, 2b)
+	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
+	       : "a" (0)
+		/* rep_good clobbers %rdx */
+	       : "rdx");
+
+	clac();
+
+	return size;
+}
+
+static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
+{
+	if (access_ok(to, n))
+		return __clear_user(to, n);
+	return n;
+}
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index fe59b8ac4fcc..ecbfb4dd3b01 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 #include <linux/linkage.h>
+#include <asm/asm.h>
 #include <asm/export.h>
 
 /*
@@ -50,3 +51,140 @@ SYM_FUNC_START(clear_page_erms)
 	RET
 SYM_FUNC_END(clear_page_erms)
 EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Default clear user-space.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(clear_user_original)
+	/*
+	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
+	 * i.e., no need for a 'q' suffix and thus a REX prefix.
+	 */
+	mov %ecx,%eax
+	shr $3,%rcx
+	jz .Lrest_bytes
+
+	# do the qwords first
+	.p2align 4
+.Lqwords:
+	movq $0,(%rdi)
+	lea 8(%rdi),%rdi
+	dec %rcx
+	jnz .Lqwords
+
+.Lrest_bytes:
+	and $7,  %eax
+	jz .Lexit
+
+	# now do the rest bytes
+.Lbytes:
+	movb $0,(%rdi)
+	inc %rdi
+	dec %eax
+	jnz .Lbytes
+
+.Lexit:
+	/*
+	 * %rax still needs to be cleared in the exception case because this function is called
+	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
+	 * in case it might reuse it somewhere.
+	 */
+        xor %eax,%eax
+        RET
+
+.Lqwords_exception:
+        # convert remaining qwords back into bytes to return to caller
+        shl $3, %rcx
+        and $7, %eax
+        add %rax,%rcx
+        jmp .Lexit
+
+.Lbytes_exception:
+        mov %eax,%ecx
+        jmp .Lexit
+
+        _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
+        _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
+SYM_FUNC_END(clear_user_original)
+EXPORT_SYMBOL(clear_user_original)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
+ * present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(clear_user_rep_good)
+	# call the original thing for less than a cacheline
+	cmp $64, %rcx
+	jb clear_user_original
+
+.Lprep:
+	# copy lower 32-bits for rest bytes
+	mov %ecx, %edx
+	shr $3, %rcx
+	jz .Lrep_good_rest_bytes
+
+.Lrep_good_qwords:
+	rep stosq
+
+.Lrep_good_rest_bytes:
+	and $7, %edx
+	jz .Lrep_good_exit
+
+.Lrep_good_bytes:
+	mov %edx, %ecx
+	rep stosb
+
+.Lrep_good_exit:
+	# see .Lexit comment above
+	xor %eax, %eax
+	RET
+
+.Lrep_good_qwords_exception:
+	# convert remaining qwords back into bytes to return to caller
+	shl $3, %rcx
+	and $7, %edx
+	add %rdx, %rcx
+	jmp .Lrep_good_exit
+
+	_ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
+	_ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
+SYM_FUNC_END(clear_user_rep_good)
+EXPORT_SYMBOL(clear_user_rep_good)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ *
+ */
+SYM_FUNC_START(clear_user_erms)
+	# call the original thing for less than a cacheline
+	cmp $64, %rcx
+	jb clear_user_original
+
+.Lerms_bytes:
+	rep stosb
+
+.Lerms_exit:
+	xorl %eax,%eax
+	RET
+
+	_ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
+SYM_FUNC_END(clear_user_erms)
+EXPORT_SYMBOL(clear_user_erms)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0ae6cf804197..6c1f8ac5e721 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -14,46 +14,6 @@
  * Zero Userspace
  */
 
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
-
-		_ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
-		_ASM_EXTABLE_UA(1b, 2b)
-
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
-unsigned long clear_user(void __user *to, unsigned long n)
-{
-	if (access_ok(to, n))
-		return __clear_user(to, n);
-	return n;
-}
-EXPORT_SYMBOL(clear_user);
-
 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
 /**
  * clean_cache_range - write back a cache range with CLWB
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 0cec74da7ffe..4b2e11726f4e 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1071,6 +1071,9 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_fragile_handle_tail",
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
+	"clear_user_erms",
+	"clear_user_rep_good",
+	"clear_user_original",
 	NULL
 };
 
-- 
cgit v1.2.3


From 3ecf671f1d354f40228e407ab350abd41034410b Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sat, 13 Aug 2022 22:38:21 +0000
Subject: x86/microcode: Document the whole late loading problem

Commit

  d23d33ea0fcd ("x86/microcode: Taint and warn on late loading")

started tainting the kernel after microcode late loading.

There is some history behind why x86 microcode started doing the late
loading stop_machine() rendezvous. Document the whole situation.

No functional changes.

  [ bp: Fix typos, heavily massage. ]

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220813223825.3164861-2-ashok.raj@intel.com
---
 Documentation/admin-guide/tainted-kernels.rst |   6 ++
 Documentation/x86/microcode.rst               | 116 ++++++++++++++++++++++++--
 2 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/tainted-kernels.rst b/Documentation/admin-guide/tainted-kernels.rst
index 7d80e8c307d1..92a8a07f5c43 100644
--- a/Documentation/admin-guide/tainted-kernels.rst
+++ b/Documentation/admin-guide/tainted-kernels.rst
@@ -134,6 +134,12 @@ More detailed explanation for tainting
        scsi/snic on something else than x86_64, scsi/ips on non
        x86/x86_64/itanium, have broken firmware settings for the
        irqchip/irq-gic on arm64 ...).
+     - x86/x86_64: Microcode late loading is dangerous and will result in
+       tainting the kernel. It requires that all CPUs rendezvous to make sure
+       the update happens when the system is as quiescent as possible. However,
+       a higher priority MCE/SMI/NMI can move control flow away from that
+       rendezvous and interrupt the update, which can be detrimental to the
+       machine.
 
  3)  ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
      modules were unloaded normally.
diff --git a/Documentation/x86/microcode.rst b/Documentation/x86/microcode.rst
index a320d37982ed..b627c6f36bcf 100644
--- a/Documentation/x86/microcode.rst
+++ b/Documentation/x86/microcode.rst
@@ -6,6 +6,7 @@ The Linux Microcode Loader
 
 :Authors: - Fenghua Yu <fenghua.yu@intel.com>
           - Borislav Petkov <bp@suse.de>
+	  - Ashok Raj <ashok.raj@intel.com>
 
 The kernel has a x86 microcode loading facility which is supposed to
 provide microcode loading methods in the OS. Potential use cases are
@@ -92,15 +93,8 @@ vendor's site.
 Late loading
 ============
 
-There are two legacy user space interfaces to load microcode, either through
-/dev/cpu/microcode or through /sys/devices/system/cpu/microcode/reload file
-in sysfs.
-
-The /dev/cpu/microcode method is deprecated because it needs a special
-userspace tool for that.
-
-The easier method is simply installing the microcode packages your distro
-supplies and running::
+You simply install the microcode packages your distro supplies and
+run::
 
   # echo 1 > /sys/devices/system/cpu/microcode/reload
 
@@ -110,6 +104,110 @@ The loading mechanism looks for microcode blobs in
 /lib/firmware/{intel-ucode,amd-ucode}. The default distro installation
 packages already put them there.
 
+Since kernel 5.19, late loading is not enabled by default.
+
+The /dev/cpu/microcode method has been removed in 5.19.
+
+Why is late loading dangerous?
+==============================
+
+Synchronizing all CPUs
+----------------------
+
+The microcode engine which receives the microcode update is shared
+between the two logical threads in a SMT system. Therefore, when
+the update is executed on one SMT thread of the core, the sibling
+"automatically" gets the update.
+
+Since the microcode can "simulate" MSRs too, while the microcode update
+is in progress, those simulated MSRs transiently cease to exist. This
+can result in unpredictable results if the SMT sibling thread happens to
+be in the middle of an access to such an MSR. The usual observation is
+that such MSR accesses cause #GPs to be raised to signal that former are
+not present.
+
+The disappearing MSRs are just one common issue which is being observed.
+Any other instruction that's being patched and gets concurrently
+executed by the other SMT sibling, can also result in similar,
+unpredictable behavior.
+
+To eliminate this case, a stop_machine()-based CPU synchronization was
+introduced as a way to guarantee that all logical CPUs will not execute
+any code but just wait in a spin loop, polling an atomic variable.
+
+While this took care of device or external interrupts, IPIs including
+LVT ones, such as CMCI etc, it cannot address other special interrupts
+that can't be shut off. Those are Machine Check (#MC), System Management
+(#SMI) and Non-Maskable interrupts (#NMI).
+
+Machine Checks
+--------------
+
+Machine Checks (#MC) are non-maskable. There are two kinds of MCEs.
+Fatal un-recoverable MCEs and recoverable MCEs. While un-recoverable
+errors are fatal, recoverable errors can also happen in kernel context
+are also treated as fatal by the kernel.
+
+On certain Intel machines, MCEs are also broadcast to all threads in a
+system. If one thread is in the middle of executing WRMSR, a MCE will be
+taken at the end of the flow. Either way, they will wait for the thread
+performing the wrmsr(0x79) to rendezvous in the MCE handler and shutdown
+eventually if any of the threads in the system fail to check in to the
+MCE rendezvous.
+
+To be paranoid and get predictable behavior, the OS can choose to set
+MCG_STATUS.MCIP. Since MCEs can be at most one in a system, if an
+MCE was signaled, the above condition will promote to a system reset
+automatically. OS can turn off MCIP at the end of the update for that
+core.
+
+System Management Interrupt
+---------------------------
+
+SMIs are also broadcast to all CPUs in the platform. Microcode update
+requests exclusive access to the core before writing to MSR 0x79. So if
+it does happen such that, one thread is in WRMSR flow, and the 2nd got
+an SMI, that thread will be stopped in the first instruction in the SMI
+handler.
+
+Since the secondary thread is stopped in the first instruction in SMI,
+there is very little chance that it would be in the middle of executing
+an instruction being patched. Plus OS has no way to stop SMIs from
+happening.
+
+Non-Maskable Interrupts
+-----------------------
+
+When thread0 of a core is doing the microcode update, if thread1 is
+pulled into NMI, that can cause unpredictable behavior due to the
+reasons above.
+
+OS can choose a variety of methods to avoid running into this situation.
+
+
+Is the microcode suitable for late loading?
+-------------------------------------------
+
+Late loading is done when the system is fully operational and running
+real workloads. Late loading behavior depends on what the base patch on
+the CPU is before upgrading to the new patch.
+
+This is true for Intel CPUs.
+
+Consider, for example, a CPU has patch level 1 and the update is to
+patch level 3.
+
+Between patch1 and patch3, patch2 might have deprecated a software-visible
+feature.
+
+This is unacceptable if software is even potentially using that feature.
+For instance, say MSR_X is no longer available after an update,
+accessing that MSR will cause a #GP fault.
+
+Basically there is no way to declare a new microcode update suitable
+for late-loading. This is another one of the problems that caused late
+loading to be not enabled by default.
+
 Builtin microcode
 =================
 
-- 
cgit v1.2.3


From 23c48a124b469cee2eb0c75e6d22d366d1caa118 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Wed, 17 Aug 2022 10:08:30 +0800
Subject: docs/zh_CN: Update the translation of gpio to 6.0-rc1

Update to commit 5513b411ea5b ("Documentation: rename pinctl to
pin-control")
Move .../zh_CN/gpio.txt to .../zh_CN/driver-api/gpio/legacy.rst
Translate .../driver-api/index.rst into Chinese.
Translate .../driver-api/gpio/index.rst into Chinese.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Link: https://lore.kernel.org/r/20220817020830.799468-1-siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/driver-api/gpio/index.rst   |  69 ++
 .../translations/zh_CN/driver-api/gpio/legacy.rst  | 694 +++++++++++++++++++++
 .../translations/zh_CN/driver-api/index.rst        | 132 ++++
 Documentation/translations/zh_CN/gpio.txt          | 650 -------------------
 Documentation/translations/zh_CN/index.rst         |   2 +-
 5 files changed, 896 insertions(+), 651 deletions(-)
 create mode 100644 Documentation/translations/zh_CN/driver-api/gpio/index.rst
 create mode 100644 Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
 create mode 100644 Documentation/translations/zh_CN/driver-api/index.rst
 delete mode 100644 Documentation/translations/zh_CN/gpio.txt

diff --git a/Documentation/translations/zh_CN/driver-api/gpio/index.rst b/Documentation/translations/zh_CN/driver-api/gpio/index.rst
new file mode 100644
index 000000000000..9ab64e94aced
--- /dev/null
+++ b/Documentation/translations/zh_CN/driver-api/gpio/index.rst
@@ -0,0 +1,69 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/driver-api/gpio/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+=======================
+通用型输入/输出（GPIO）
+=======================
+
+目录:
+
+.. toctree::
+   :maxdepth: 2
+
+   legacy
+
+Todolist:
+
+*   intro
+*   using-gpio
+*   driver
+*   consumer
+*   board
+*   drivers-on-gpio
+*   bt8xxgpio
+
+核心
+====
+
+该API在以下内核代码中:
+
+include/linux/gpio/driver.h
+
+drivers/gpio/gpiolib.c
+
+ACPI支持
+========
+
+该API在以下内核代码中:
+
+drivers/gpio/gpiolib-acpi.c
+
+设备树支持
+==========
+
+该API在以下内核代码中:
+
+drivers/gpio/gpiolib-of.c
+
+设备管理支持
+============
+
+该API在以下内核代码中:
+
+drivers/gpio/gpiolib-devres.c
+
+sysfs帮助（函数）
+=================
+
+该API在以下内核代码中:
+
+drivers/gpio/gpiolib-sysfs.c
diff --git a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
new file mode 100644
index 000000000000..6399521d0548
--- /dev/null
+++ b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
@@ -0,0 +1,694 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: ../../disclaimer-zh_CN.rst
+
+:Original: Documentation/driver-api/gpio/legacy.rst
+
+:翻译:
+
+ 傅炜 Fu Wei <tekkamanninja@gmail.com>
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+传统GPIO接口
+============
+
+本文档概述了Linux下的GPIO访问公约。
+
+这些函数以 gpio_* 作为前缀。其他的函数不允许使用这样的前缀或相关的
+__gpio_* 前缀。
+
+
+什么是GPIO？
+============
+"通用输入/输出口"(GPIO)是一个灵活的由软件控制的数字信号。他们可
+由多种芯片提供,且对于从事嵌入式和定制硬件的 Linux 开发者来说是
+比较熟悉。每个GPIO 都代表一个连接到特定引脚或球栅阵列(BGA)封装中
+“球珠”的一个位。电路板原理图显示了 GPIO 与外部硬件的连接关系。
+驱动可以编写成通用代码，以使板级启动代码可传递引脚配置数据给驱动。
+
+片上系统 (SOC) 处理器对 GPIO 有很大的依赖。在某些情况下,每个
+非专用引脚都可配置为 GPIO,且大多数芯片都最少有一些 GPIO。
+可编程逻辑器件(类似 FPGA) 可以方便地提供 GPIO。像电源管理和
+音频编解码器这样的多功能芯片经常留有一些这样的引脚来帮助那些引脚
+匮乏的 SOC。同时还有通过 I2C 或 SPI 串行总线连接的“GPIO扩展器”
+芯片。大多数 PC 的南桥有一些拥有 GPIO 能力的引脚 (只有BIOS
+固件才知道如何使用他们)。
+
+GPIO 的实际功能因系统而异。通常用法有:
+
+  - 输出值可写 (高电平=1，低电平=0)。一些芯片也有如何驱动这些值的选项,
+    例如只允许输出一个值、支持“线与”及其他取值类似的模式(值得注意的是
+    “开漏”信号)
+
+  - 输入值可读(1、0)。一些芯片支持引脚在配置为“输出”时回读，这对于类似
+    “线与”的情况(以支持双向信号)是非常有用的。GPIO 控制器可能有输入
+    去毛刺/消抖逻辑,这有时需要软件控制。
+
+  - 输入通常可作为 IRQ 信号,一般是沿触发,但有时是电平触发。这样的 IRQ
+    可能配置为系统唤醒事件,以将系统从低功耗状态下唤醒。
+
+  - 通常一个 GPIO 根据不同产品电路板的需求,可以配置为输入或输出,也有仅
+    支持单向的。
+
+  - 大部分 GPIO 可以在持有自旋锁时访问,但是通常由串行总线扩展的 GPIO
+    不允许持有自旋锁。但某些系统也支持这种类型。
+
+对于给定的电路板,每个 GPIO 都用于某个特定的目的,如监控 MMC/SD 卡的
+插入/移除、检测卡的写保护状态、驱动 LED、配置收发器、模拟串行总线、
+复位硬件看门狗、感知开关状态等等。
+
+
+GPIO 公约
+=========
+注意,这个叫做“公约”，因为这不是强制性的，不遵循这个公约是无伤大雅的，
+因为此时可移植性并不重要。GPIO 常用于板级特定的电路逻辑,甚至可能
+随着电路板的版本而改变，且不可能在不同走线的电路板上使用。仅有在少数
+功能上才具有可移植性，其他功能是平台特定。这也是由于“胶合”的逻辑造成的。
+
+此外，这不需要任何的执行框架，只是一个接口。某个平台可能通过一个简单地
+访问芯片寄存器的内联函数来实现它，其他平台可能通过委托一系列不同的GPIO
+控制器的抽象函数来实现它。(有一些可选的代码能支持这种策略的实现,本文档
+后面会介绍，但作为 GPIO 接口的客户端驱动程序必须与它的实现无关。)
+
+也就是说,如果在他们的平台上支持这个公约，驱动应尽可能的使用它。同时，平台
+必须在 Kconfig 中选择 ARCH_REQUIRE_GPIOLIB 或者 ARCH_WANT_OPTIONAL_GPIOLIB
+选项。那些调用标准 GPIO 函数的驱动应该在 Kconfig 入口中声明依赖GENERIC_GPIO。
+当驱动包含文件:
+
+	#include <linux/gpio.h>
+
+则 GPIO 函数是可用,无论是“真实代码”还是经优化过的语句。如果你遵守
+这个公约，当你的代码完成后，对其他的开发者来说会更容易看懂和维护。
+
+注意，这些操作包含所用平台的 I/O 屏障代码，驱动无须显式地调用他们。
+
+
+标识 GPIO
+---------
+
+GPIO 是通过无符号整型来标识的,范围是 0 到 MAX_INT。保留“负”数
+用于其他目的,例如标识信号“在这个板子上不可用”或指示错误。未接触底层
+硬件的代码会忽略这些整数。
+
+平台会定义这些整数的用法,且通常使用 #define 来定义 GPIO，这样
+板级特定的启动代码可以直接关联相应的原理图。相对来说，驱动应该仅使用
+启动代码传递过来的 GPIO 编号，使用 platform_data 保存板级特定
+引脚配置数据 (同时还有其他须要的板级特定数据)，避免可能出现的问题。
+
+例如一个平台使用编号 32-159 来标识 GPIO,而在另一个平台使用编号0-63
+标识一组 GPIO 控制器,64-79标识另一类 GPIO 控制器,且在一个含有
+FPGA 的特定板子上使用 80-95。编号不一定要连续,那些平台中，也可以
+使用编号2000-2063来标识一个 I2C 接口的 GPIO 扩展器中的 GPIO。
+
+如果你要初始化一个带有无效 GPIO 编号的结构体,可以使用一些负编码
+(如"-EINVAL")，那将使其永远不会是有效。来测试这样一个结构体中的编号
+是否关联一个 GPIO，你可使用以下断言::
+
+	int gpio_is_valid(int number);
+
+如果编号不存在，则请求和释放 GPIO 的函数将拒绝执行相关操作(见下文)。
+其他编号也可能被拒绝,比如一个编号可能存在，但暂时在给定的电路上不可用。
+
+一个平台是否支持多个 GPIO 控制器为平台特定的实现问题，就像是否可以
+在 GPIO 编号空间中有“空洞”和是否可以在运行时添加新的控制器一样。
+这些问题会影响其他事情，包括相邻的 GPIO 编号是否存在等。
+
+使用 GPIO
+---------
+
+对于一个 GPIO，系统应该做的第一件事情就是通过 gpio_request()
+函数分配它，见下文。
+
+接下来是设置I/O方向，这通常是在板级启动代码中为所使用的 GPIO 设置
+platform_device 时完成::
+
+	/* 设置为输入或输出, 返回 0 或负的错误代码 */
+	int gpio_direction_input(unsigned gpio);
+	int gpio_direction_output(unsigned gpio, int value);
+
+返回值为零代表成功，否则返回一个负的错误代码。这个返回值需要检查，因为
+get/set(获取/设置)函数调用没法返回错误,且有可能是配置错误。通常，
+你应该在进程上下文中调用这些函数。然而,对于自旋锁安全的 GPIO，在板子
+启动的早期、进程启动前使用他们也是可以的。
+
+对于作为输出的 GPIO，为其提供初始输出值，对于避免在系统启动期间出现
+信号毛刺是很有帮助的。
+
+为了与传统的 GPIO 接口兼容, 在设置一个 GPIO 方向时，如果它还未被申请，
+则隐含了申请那个 GPIO 的操作(见下文)。这种兼容性正在从可选的 gpiolib
+框架中移除。
+
+如果这个 GPIO 编码不存在，或者特定的 GPIO 不能用于那种模式，则方向
+设置可能失败。依赖启动固件来正确地设置方向通常是一个坏主意，因为它可能
+除了启动Linux，并没有做更多的验证工作。(同理, 板子的启动代码可能需要
+将这个复用的引脚设置为 GPIO，并正确地配置上拉/下拉电阻。)
+
+
+访问自旋锁安全的 GPIO
+---------------------
+
+大多数 GPIO 控制器可以通过内存读/写指令来访问。这些指令不会休眠,可以
+安全地在硬(非线程)中断例程和类似的上下文中完成。
+
+对于那些用 gpio_cansleep()测试总是返回失败的 GPIO(见下文)，使用
+以下的函数访问::
+
+	/* GPIO 输入:返回零或非零 */
+	int gpio_get_value(unsigned gpio);
+
+	/* GPIO 输出 */
+	void gpio_set_value(unsigned gpio, int value);
+
+GPIO值是布尔值，零表示低电平，非零表示高电平。当读取一个输出引脚的值时，
+返回值应该是引脚上的值。这个值不总是和输出值相符，因为存在开漏输出信号和
+输出延迟问题。
+
+以上的 get/set 函数无错误返回值，因为之前 gpio_direction_*()应已检查过
+其是否为“无效GPIO”。此外，还需要注意的是并不是所有平台都可以从输出引脚
+中读取数据，对于不能读取的引脚应总返回零。另外，对那些在原子上下文中无法
+安全访问的 GPIO (译者注：因为访问可能导致休眠)使用这些函数是不合适的
+(见下文)。
+
+在 GPIO 编号(还有输出、值)为常数的情况下,鼓励通过平台特定的实现来优化
+这两个函数来访问 GPIO 值。这种情况(读写一个硬件寄存器)下只需要几条指令
+是很正常的,且无须自旋锁。这种优化函数比起那些在子程序上花费许多指令的
+函数可以使得模拟接口(译者注:例如 GPIO 模拟 I2C、1-wire 或 SPI)的
+应用(在空间和时间上都)更具效率。
+
+
+访问可能休眠的 GPIO
+-------------------
+
+某些 GPIO 控制器必须通过基于总线(如 I2C 或 SPI)的消息访问。读或写这些
+GPIO 值的命令需要等待其信息排到队首才发送命令，再获得其反馈。期间需要
+休眠，这不能在 IRQ 例程(中断上下文)中执行。
+
+支持此类 GPIO 的平台通过以下函数返回非零值来区分出这种 GPIO。(此函数需要
+一个之前通过 gpio_request 分配到的有效 GPIO 编号)::
+
+	int gpio_cansleep(unsigned gpio);
+
+为了访问这种 GPIO,内核定义了一套不同的函数::
+
+	/* GPIO 输入:返回零或非零 ,可能会休眠 */
+	int gpio_get_value_cansleep(unsigned gpio);
+
+	/* GPIO 输出,可能会休眠 */
+	void gpio_set_value_cansleep(unsigned gpio, int value);
+
+
+访问这样的 GPIO 需要一个允许休眠的上下文，例如线程 IRQ 处理例程，并用以上的
+访问函数替换那些没有 cansleep()后缀的自旋锁安全访问函数。
+
+除了这些访问函数可能休眠，且它们操作的 GPIO 不能在硬件 IRQ 处理例程中访问的
+事实，这些处理例程实际上和自旋锁安全的函数是一样的。
+
+** 除此之外 ** 调用设置和配置此类 GPIO 的函数也必须在允许休眠的上下文中，
+因为它们可能也需要访问 GPIO 控制器芯片 (这些设置函数通常在板级启动代码或者
+驱动探测/断开代码中，所以这是一个容易满足的约束条件。) ::
+
+                gpio_direction_input()
+                gpio_direction_output()
+                gpio_request()
+
+        ## 	gpio_request_one()
+        ##	gpio_request_array()
+        ## 	gpio_free_array()
+
+                gpio_free()
+                gpio_set_debounce()
+
+
+
+声明和释放 GPIO
+----------------
+
+为了有助于捕获系统配置错误,定义了两个函数::
+
+	/* 申请 GPIO, 返回 0 或负的错误代码.
+	 * 非空标签可能有助于诊断.
+	 */
+	int gpio_request(unsigned gpio, const char *label);
+
+	/* 释放之前声明的 GPIO */
+	void gpio_free(unsigned gpio);
+
+将无效的 GPIO 编码传递给 gpio_request()会导致失败，申请一个已使用这个
+函数声明过的 GPIO 也会失败。gpio_request()的返回值必须检查。你应该在
+进程上下文中调用这些函数。然而,对于自旋锁安全的 GPIO,在板子启动的早期、
+进入进程之前是可以申请的。
+
+这个函数完成两个基本的目标。一是标识那些实际上已作为 GPIO 使用的信号线，
+这样便于更好地诊断;系统可能需要服务几百个可用的 GPIO，但是对于任何一个
+给定的电路板通常只有一些被使用。另一个目的是捕获冲突，查明错误:如两个或
+更多驱动错误地认为他们已经独占了某个信号线,或是错误地认为移除一个管理着
+某个已激活信号的驱动是安全的。也就是说，申请 GPIO 的作用类似一种锁机制。
+
+某些平台可能也使用 GPIO 作为电源管理激活信号(例如通过关闭未使用芯片区和
+简单地关闭未使用时钟)。
+
+对于 GPIO 使用引脚控制子系统已知的引脚，子系统应该被告知其使用情况；
+一个 gpiolib 驱动的 .request()操作应调用 pinctrl_gpio_request()，
+而 gpiolib 驱动的 .free()操作应调用 pinctrl_gpio_free()。引脚控制
+子系统允许 pinctrl_gpio_request()在某个引脚或引脚组以复用形式“属于”
+一个设备时都成功返回。
+
+任何须将 GPIO 信号导向适当引脚的引脚复用硬件的编程应该发生在 GPIO
+驱动的 .direction_input()或 .direction_output()函数中，以及
+任何输出 GPIO 值的设置之后。这样可使从引脚特殊功能到 GPIO 的转换
+不会在引脚产生毛刺波形。有时当用一个 GPIO 实现其信号驱动一个非 GPIO
+硬件模块的解决方案时，就需要这种机制。
+
+某些平台允许部分或所有 GPIO 信号使用不同的引脚。类似的，GPIO 或引脚的
+其他方面也需要配置，如上拉/下拉。平台软件应该在对这些 GPIO 调用
+gpio_request()前将这类细节配置好，例如使用引脚控制子系统的映射表，
+使得 GPIO 的用户无须关注这些细节。
+
+还有一个值得注意的是在释放 GPIO 前，你必须停止使用它。
+
+
+注意:申请一个 GPIO 并没有以任何方式配置它，只不过标识那个 GPIO 处于使用
+状态。必须有另外的代码来处理引脚配置(如控制 GPIO 使用的引脚、上拉/下拉)。
+考虑到大多数情况下声明 GPIO 之后就会立即配置它们,所以定义了以下三个辅助函数::
+
+	/* 申请一个 GPIO 信号, 同时通过特定的'flags'初始化配置,
+	 * 其他和 gpio_request()的参数和返回值相同
+	 *
+	 */
+	int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
+
+	/* 在单个函数中申请多个 GPIO
+	 */
+	int gpio_request_array(struct gpio *array, size_t num);
+
+	/* 在单个函数中释放多个 GPIO
+	 */
+	void gpio_free_array(struct gpio *array, size_t num);
+
+这里 'flags' 当前定义可指定以下属性:
+
+	* GPIOF_DIR_IN		- 配置方向为输入
+	* GPIOF_DIR_OUT		- 配置方向为输出
+
+	* GPIOF_INIT_LOW	- 在作为输出时,初始值为低电平
+	* GPIOF_INIT_HIGH	- 在作为输出时,初始值为高电平
+	* GPIOF_OPEN_DRAIN	- gpio引脚为开漏信号
+	* GPIOF_OPEN_SOURCE	- gpio引脚为源极开路信号
+
+	* GPIOF_EXPORT_DIR_FIXED	- 将 gpio 导出到 sysfs，并保持方向
+	* GPIOF_EXPORT_DIR_CHANGEABLE	- 同样是导出, 但允许改变方向
+
+因为 GPIOF_INIT_* 仅有在配置为输出的时候才存在,所以有效的组合为:
+
+	* GPIOF_IN		- 配置为输入
+	* GPIOF_OUT_INIT_LOW	- 配置为输出,并初始化为低电平
+	* GPIOF_OUT_INIT_HIGH	- 配置为输出,并初始化为高电平
+
+当设置 flag 为 GPIOF_OPEN_DRAIN 时，则假设引脚是开漏信号。这样的引脚
+将不会在输出模式下置1。这样的引脚需要连接上拉电阻。通过使能这个标志，gpio库
+将会在被要求输出模式下置1时将引脚变为输入状态来使引脚置高。引脚在输出模式下
+通过置0使其输出低电平。
+
+当设置 flag 为 GPIOF_OPEN_SOURCE 时，则假设引脚为源极开路信号。这样的引脚
+将不会在输出模式下置0。这样的引脚需要连接下拉电阻。通过使能这个标志，gpio库
+将会在被要求输出模式下置0时将引脚变为输入状态来使引脚置低。引脚在输出模式下
+通过置1使其输出高电平。
+
+将来这些标志可能扩展到支持更多的属性。
+
+更进一步,为了更简单地声明/释放多个 GPIO,'struct gpio'被引进来封装所有
+这三个领域::
+
+	struct gpio {
+		unsigned	gpio;
+		unsigned long	flags;
+		const char	*label;
+	};
+
+一个典型的用例::
+
+	static struct gpio leds_gpios[] = {
+		{ 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* 默认开启 */
+		{ 33, GPIOF_OUT_INIT_LOW,  "Green LED" }, /* 默认关闭 */
+		{ 34, GPIOF_OUT_INIT_LOW,  "Red LED"   }, /* 默认关闭 */
+		{ 35, GPIOF_OUT_INIT_LOW,  "Blue LED"  }, /* 默认关闭 */
+		{ ... },
+	};
+
+	err = gpio_request_one(31, GPIOF_IN, "Reset Button");
+	if (err)
+		...
+
+	err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
+	if (err)
+		...
+
+	gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
+
+
+GPIO 映射到 IRQ
+----------------
+
+GPIO 编号是无符号整数;IRQ 编号也是。这些构成了两个逻辑上不同的命名空间
+(GPIO 0 不一定使用 IRQ 0)。你可以通过以下函数在它们之间实现映射::
+
+	/* 映射 GPIO 编号到 IRQ 编号 */
+	int gpio_to_irq(unsigned gpio);
+
+	/* 映射 IRQ 编号到 GPIO 编号 (尽量避免使用) */
+	int irq_to_gpio(unsigned irq);
+
+它们的返回值为对应命名空间的相关编号，或是负的错误代码(如果无法映射)。
+(例如,某些 GPIO 无法做为 IRQ 使用。)以下的编号错误是未经检测的:使用一个
+未通过 gpio_direction_input()配置为输入的 GPIO 编号，或者使用一个
+并非来源于gpio_to_irq()的 IRQ 编号。
+
+这两个映射函数可能会在信号编号的加减计算过程上花些时间。它们不可休眠。
+
+gpio_to_irq()返回的非错误值可以传递给 request_irq()或者 free_irq()。
+它们通常通过板级特定的初始化代码存放到平台设备的 IRQ 资源中。注意:IRQ
+触发选项是 IRQ 接口的一部分，如 IRQF_TRIGGER_FALLING，系统唤醒能力
+也是如此。
+
+irq_to_gpio()返回的非错误值大多数通常可以被 gpio_get_value()所使用，
+比如在 IRQ 是沿触发时初始化或更新驱动状态。注意某些平台不支持反映射,所以
+你应该尽量避免使用它。
+
+
+模拟开漏信号
+------------
+
+有时在只有低电平信号作为实际驱动结果(译者注:多个输出连接于一点，逻辑电平
+结果为所有输出的逻辑与)的时候,共享的信号线需要使用“开漏”信号。(该术语
+适用于 CMOS 管；而 TTL 用“集电极开路”。)一个上拉电阻使信号为高电平。这
+有时被称为“线与”。实际上，从负逻辑(低电平为真)的角度来看，这是一个“线或”。
+
+一个开漏信号的常见例子是共享的低电平使能 IRQ 信号线。此外,有时双向数据总线
+信号也使用漏极开路信号。
+
+某些 GPIO 控制器直接支持开漏输出，还有许多不支持。当你需要开漏信号，但
+硬件又不直接支持的时候，一个常用的方法是用任何即可作输入也可作输出的 GPIO
+引脚来模拟:
+
+ LOW:	gpio_direction_output(gpio, 0) ... 这代码驱动信号并覆盖
+	上拉配置。
+
+ HIGH:	gpio_direction_input(gpio) ... 这代码关闭输出,所以上拉电阻
+	(或其他的一些器件)控制了信号。
+
+如果你将信号线“驱动”为高电平，但是 gpio_get_value(gpio)报告了一个
+低电平(在适当的上升时间后)，你就可以知道是其他的一些组件将共享信号线拉低了。
+这不一定是错误的。一个常见的例子就是 I2C 时钟的延长：一个需要较慢时钟的
+从设备延迟 SCK 的上升沿，而 I2C 主设备相应地调整其信号传输速率。
+
+GPIO控制器和引脚控制子系统
+--------------------------
+
+SOC上的GPIO控制器可能与引脚控制子系统紧密结合，即引脚可以与可选的gpio功
+能一起被其他功能使用。我们已经涵盖了这样的情况，例如一个GPIO控制器需要保
+留一个引脚或通过调用以下任何一个引脚来设置其方向::
+
+  pinctrl_gpio_request()
+  pinctrl_gpio_free()
+  pinctrl_gpio_direction_input()
+  pinctrl_gpio_direction_output()
+
+但是，引脚控制子系统是如何将GPIO号码（这是一个全局事项）与某个引脚控制器
+上的某个引脚交叉关联的？
+
+这是通过注册引脚的“范围”来实现的，这基本上是交叉参考表。这些描述是在
+Documentation/driver-api/pin-control.rst
+
+虽然引脚分配完全由引脚控制子系统管理，但gpio（在gpiolib下）仍由gpio驱动
+维护。可能发生的情况是，SoC中的不同引脚范围由不同的gpio驱动器管理。
+
+这使得在调用 "pinctrl_gpio_request" 之前，让gpio驱动向pin ctrl子系
+统宣布它们的引脚范围是合理的，以便在使用任何gpio之前要求引脚控制子系统准
+备相应的引脚。
+
+为此，gpio控制器可以用引脚控制子系统注册其引脚范围。目前有两种方法：有或
+无DT。
+
+关于对DT的支持，请参考 Documentation/devicetree/bindings/gpio/gpio.txt.
+
+对于非DT支持，用户可以用适当的参数调用gpiochip_add_pin_range()，将一
+系列的gpio引脚注册到引脚控制驱动上。为此，必须将引脚控制设备的名称字符串
+作为参数之一传给这个程序。
+
+
+这些公约忽略了什么？
+====================
+
+这些公约忽略的最大一件事就是引脚复用，因为这属于高度芯片特定的属性且
+没有可移植性。某个平台可能不需要明确的复用信息；有的对于任意给定的引脚
+可能只有两个功能选项；有的可能每个引脚有八个功能选项；有的可能可以将
+几个引脚中的任何一个作为给定的 GPIO。(是的，这些例子都来自于当前运行
+Linux 的系统。)
+
+在某些系统中,与引脚复用相关的是配置和使能集成的上、下拉模式。并不是所有
+平台都支持这种模式,或者不会以相同的方式来支持这种模式；且任何给定的电路板
+可能使用外置的上拉(或下拉)电阻,这时芯片上的就不应该使用。(当一个电路需要
+5kOhm 的拉动电阻,芯片上的 100 kOhm 电阻就不能做到。)同样的，驱动能力
+(2 mA vs 20 mA)和电压(1.8V vs 3.3V)是平台特定问题,就像模型一样在
+可配置引脚和 GPIO 之间(没)有一一对应的关系。
+
+还有其他一些系统特定的机制没有在这里指出，例如上述的输入去毛刺和线与输出
+选项。硬件可能支持批量读或写 GPIO，但是那一般是配置相关的：对于处于同一
+块区(bank)的GPIO。(GPIO 通常以 16 或 32 个组成一个区块，一个给定的
+片上系统一般有几个这样的区块。)某些系统可以通过输出 GPIO 触发 IRQ，
+或者从并非以 GPIO 管理的引脚取值。这些机制的相关代码没有必要具有可移植性。
+
+当前，动态定义 GPIO 并不是标准的，例如作为配置一个带有某些 GPIO 扩展器的
+附加电路板的副作用。
+
+GPIO 实现者的框架（可选）
+=========================
+
+前面提到了，有一个可选的实现框架，让平台使用相同的编程接口，更加简单地支持
+不同种类的 GPIO 控制器。这个框架称为"gpiolib"。
+
+作为一个辅助调试功能，如果 debugfs 可用，就会有一个 /sys/kernel/debug/gpio
+文件。通过这个框架，它可以列出所有注册的控制器,以及当前正在使用中的 GPIO
+的状态。
+
+
+控制器驱动: gpio_chip
+---------------------
+
+在框架中每个 GPIO 控制器都包装为一个 "struct gpio_chip"，他包含了
+该类型的每个控制器的常用信息:
+
+	- 设置 GPIO 方向的方法
+	- 用于访问 GPIO 值的方法
+	- 告知调用其方法是否可能休眠的标志
+	- 可选的 debugfs 信息导出方法 (显示类似上拉配置一样的额外状态)
+	- 诊断标签
+
+也包含了来自 device.platform_data 的每个实例的数据：它第一个 GPIO 的
+编号和它可用的 GPIO 的数量。
+
+实现 gpio_chip 的代码应支持多控制器实例，这可能使用驱动模型。那些代码要
+配置每个 gpio_chip，并发起gpiochip_add()。卸载一个 GPIO 控制器很少见，
+但在必要的时候可以使用 gpiochip_remove()。
+
+大部分 gpio_chip 是一个实例特定结构体的一部分，而并不将 GPIO 接口单独
+暴露出来,比如编址、电源管理等。类似编解码器这样的芯片会有复杂的非 GPIO
+状态。
+
+任何一个 debugfs 信息导出方法通常应该忽略还未申请作为 GPIO 的信号线。
+他们可以使用 gpiochip_is_requested()测试，当这个 GPIO 已经申请过了
+就返回相关的标签，否则返回 NULL。
+
+
+平台支持
+--------
+
+为了支持这个框架，一个平台的 Kconfig 文件将会 "select"(选择)
+ARCH_REQUIRE_GPIOLIB 或 ARCH_WANT_OPTIONAL_GPIOLIB，并让它的
+<asm/gpio.h> 包含 <asm-generic/gpio.h>，同时定义三个方法:
+gpio_get_value()、gpio_set_value()和 gpio_cansleep()。
+
+它也应提供一个 ARCH_NR_GPIOS 的定义值，这样可以更好地反映该平台 GPIO
+的实际数量,节省静态表的空间。(这个定义值应该包含片上系统内建 GPIO 和
+GPIO 扩展器中的数据。)
+
+ARCH_REQUIRE_GPIOLIB 意味着 gpiolib 核心在这个构架中将总是编译进内核。
+
+ARCH_WANT_OPTIONAL_GPIOLIB 意味着 gpiolib 核心默认关闭,且用户可以
+使能它,并将其编译进内核(可选)。
+
+如果这些选项都没被选择,该平台就不通过 GPIO-lib 支持 GPIO,且代码不可以
+被用户使能。
+
+以下这些方法的实现可以直接使用框架代码,并总是通过 gpio_chip 调度::
+
+  #define gpio_get_value	__gpio_get_value
+  #define gpio_set_value	__gpio_set_value
+  #define gpio_cansleep		__gpio_cansleep
+
+这些定义可以用更理想的实现方法替代，那就是使用经过逻辑优化的内联函数来访问
+基于特定片上系统的 GPIO。例如,若引用的 GPIO (寄存器位偏移)是常量“12”，
+读取或设置它可能只需少则两或三个指令，且不会休眠。当这样的优化无法实现时，
+那些函数必须使用框架提供的代码，那就至少要几十条指令才可以实现。对于用 GPIO
+模拟的 I/O 接口, 如此精简指令是很有意义的。
+
+对于片上系统，平台特定代码为片上 GPIO 每个区(bank)定义并注册 gpio_chip
+实例。那些 GPIO 应该根据芯片厂商的文档进行编码/标签,并直接和电路板原理图
+对应。他们应该开始于零并终止于平台特定的限制。这些 GPIO(代码)通常从
+arch_initcall()或者更早的地方集成进平台初始化代码，使这些 GPIO 总是可用，
+且他们通常可以作为 IRQ 使用。
+
+板级支持
+--------
+
+对于外部 GPIO 控制器(例如 I2C 或 SPI 扩展器、专用芯片、多功能器件、FPGA
+或 CPLD)，大多数常用板级特定代码都可以注册控制器设备，并保证他们的驱动知道
+gpiochip_add()所使用的 GPIO 编号。他们的起始编号通常跟在平台特定的 GPIO
+编号之后。
+
+例如板级启动代码应该创建结构体指明芯片公开的 GPIO 范围，并使用 platform_data
+将其传递给每个 GPIO 扩展器芯片。然后芯片驱动中的 probe()例程可以将这个
+数据传递给 gpiochip_add()。
+
+初始化顺序很重要。例如，如果一个设备依赖基于 I2C 的(扩展)GPIO，那么它的
+probe()例程就应该在那个 GPIO 有效以后才可以被调用。这意味着设备应该在
+GPIO 可以工作之后才可被注册。解决这类依赖的的一种方法是让这种 gpio_chip
+控制器向板级特定代码提供 setup()和 teardown()回调函数。一旦所有必须的
+资源可用之后，这些板级特定的回调函数将会注册设备，并可以在这些 GPIO 控制器
+设备变成无效时移除它们。
+
+
+用户空间的 Sysfs 接口（可选）
+=============================
+
+使用“gpiolib”实现框架的平台可以选择配置一个 GPIO 的 sysfs 用户接口。
+这不同于 debugfs 接口，因为它提供的是对 GPIO方向和值的控制，而不只显示
+一个GPIO 的状态摘要。此外,它可以出现在没有调试支持的产品级系统中。
+
+例如，通过适当的系统硬件文档，用户空间可以知道 GIOP #23 控制 Flash
+存储器的写保护(用于保护其中 Bootloader 分区)。产品的系统升级可能需要
+临时解除这个保护：首先导入一个 GPIO，改变其输出状态，然后在重新使能写保护
+前升级代码。通常情况下,GPIO #23 是不会被触及的，并且内核也不需要知道他。
+
+根据适当的硬件文档，某些系统的用户空间 GPIO 可以用于确定系统配置数据，
+这些数据是标准内核不知道的。在某些任务中，简单的用户空间 GPIO 驱动可能是
+系统真正需要的。
+
+注意：标准内核驱动中已经存在通用的“LED 和按键”GPIO 任务，分别是:
+"leds-gpio" 和 "gpio_keys"。请使用这些来替代直接访问 GPIO，因为集成在
+内核框架中的这类驱动比你在用户空间的代码更好。
+
+
+Sysfs 中的路径
+--------------
+
+在/sys/class/gpio 中有 3 类入口:
+
+   -	用于在用户空间控制 GPIO 的控制接口;
+
+   -	GPIOs 本身;以及
+
+   -	GPIO 控制器 ("gpio_chip" 实例)。
+
+除了这些标准的文件,还包含“device”符号链接。
+
+控制接口是只写的:
+
+    /sys/class/gpio/
+
+    	"export" ... 用户空间可以通过写其编号到这个文件，要求内核导出
+		一个 GPIO 的控制到用户空间。
+
+		例如: 如果内核代码没有申请 GPIO #19,"echo 19 > export"
+		将会为 GPIO #19 创建一个 "gpio19" 节点。
+
+    	"unexport" ... 导出到用户空间的逆操作。
+
+		例如: "echo 19 > unexport" 将会移除使用"export"文件导出的
+		"gpio19" 节点。
+
+GPIO 信号的路径类似 /sys/class/gpio/gpio42/ (对于 GPIO #42 来说)，
+并有如下的读/写属性:
+
+    /sys/class/gpio/gpioN/
+
+	"direction" ... 读取得到 "in" 或 "out"。这个值通常运行写入。
+		写入"out" 时,其引脚的默认输出为低电平。为了确保无故障运行，
+		"low" 或 "high" 的电平值应该写入 GPIO 的配置，作为初始输出值。
+
+		注意:如果内核不支持改变 GPIO 的方向，或者在导出时内核代码没有
+		明确允许用户空间可以重新配置 GPIO 方向，那么这个属性将不存在。
+
+	"value" ... 读取得到 0 (低电平) 或 1 (高电平)。如果 GPIO 配置为
+		输出,这个值允许写操作。任何非零值都以高电平看待。
+
+		如果引脚可以配置为中断信号，且如果已经配置了产生中断的模式
+		（见"edge"的描述），你可以对这个文件使用轮询操作(poll(2))，
+		且轮询操作会在任何中断触发时返回。如果你使用轮询操作(poll(2))，
+		请在 events 中设置 POLLPRI 和 POLLERR。如果你使用轮询操作
+		(select(2))，请在 exceptfds 设置你期望的文件描述符。在
+		轮询操作(poll(2))返回之后，既可以通过 lseek(2)操作读取
+		sysfs 文件的开始部分，也可以关闭这个文件并重新打开它来读取数据。
+
+	"edge" ... 读取得到“none”、“rising”、“falling”或者“both”。
+		将这些字符串写入这个文件可以选择沿触发模式，会使得轮询操作
+		(select(2))在"value"文件中返回。
+
+		这个文件仅有在这个引脚可以配置为可产生中断输入引脚时，才存在。
+
+	"active_low" ... 读取得到 0 (假) 或 1 (真)。写入任何非零值可以
+		翻转这个属性的(读写)值。已存在或之后通过"edge"属性设置了"rising"
+		和 "falling" 沿触发模式的轮询操作(poll(2))将会遵循这个设置。
+
+GPIO 控制器的路径类似 /sys/class/gpio/gpiochip42/ (对于从#42 GPIO
+开始实现控制的控制器),并有着以下只读属性:
+
+    /sys/class/gpio/gpiochipN/
+
+    	"base" ... 与以上的 N 相同,代表此芯片管理的第一个 GPIO 的编号
+
+    	"label" ... 用于诊断 (并不总是只有唯一值)
+
+    	"ngpio" ... 此控制器所管理的 GPIO 数量(而 GPIO 编号从 N 到
+    		N + ngpio - 1)
+
+大多数情况下,电路板的文档应当标明每个 GPIO 的使用目的。但是那些编号并不总是
+固定的,例如在扩展卡上的 GPIO会根据所使用的主板或所在堆叠架构中其他的板子而
+有所不同。在这种情况下,你可能需要使用 gpiochip 节点(尽可能地结合电路图)来
+确定给定信号所用的 GPIO 编号。
+
+
+从内核代码中导出
+----------------
+
+内核代码可以明确地管理那些已通过 gpio_request()申请的 GPIO 的导出::
+
+	/* 导出 GPIO 到用户空间 */
+	int gpio_export(unsigned gpio, bool direction_may_change);
+
+	/* gpio_export()的逆操作 */
+	void gpio_unexport();
+
+	/* 创建一个 sysfs 连接到已导出的 GPIO 节点 */
+	int gpio_export_link(struct device *dev, const char *name,
+		unsigned gpio)
+
+在一个内核驱动申请一个 GPIO 之后，它可以通过 gpio_export()使其在 sysfs
+接口中可见。该驱动可以控制信号方向是否可修改。这有助于防止用户空间代码无意间
+破坏重要的系统状态。
+
+这个明确的导出有助于(通过使某些实验更容易来)调试，也可以提供一个始终存在的接口，
+与文档配合作为板级支持包的一部分。
+
+在 GPIO 被导出之后，gpio_export_link()允许在 sysfs 文件系统的任何地方
+创建一个到这个 GPIO sysfs 节点的符号链接。这样驱动就可以通过一个描述性的
+名字，在 sysfs 中他们所拥有的设备下提供一个(到这个 GPIO sysfs 节点的)接口。
+
+
+API参考
+=======
+
+本节中列出的函数已被废弃。在新的代码中应该使用基于GPIO描述符的API。
diff --git a/Documentation/translations/zh_CN/driver-api/index.rst b/Documentation/translations/zh_CN/driver-api/index.rst
new file mode 100644
index 000000000000..24eb2198e5f1
--- /dev/null
+++ b/Documentation/translations/zh_CN/driver-api/index.rst
@@ -0,0 +1,132 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/driver-api/index.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+========================
+Linux驱动实现者的API指南
+========================
+
+内核提供了各种各样的接口来支持设备驱动的开发。这份文档只是对其中一些接口进行了
+一定程度的整理——希望随着时间的推移，它能变得更好！可用的小节可以在下面看到。
+
+.. class:: toc-title
+
+	   目录列表：
+
+.. toctree::
+   :maxdepth: 2
+
+   gpio/index
+
+Todolist:
+
+*   driver-model/index
+*   basics
+*   infrastructure
+*   ioctl
+*   early-userspace/index
+*   pm/index
+*   clk
+*   device-io
+*   dma-buf
+*   device_link
+*   component
+*   message-based
+*   infiniband
+*   aperture
+*   frame-buffer
+*   regulator
+*   reset
+*   iio/index
+*   input
+*   usb/index
+*   firewire
+*   pci/index
+*   cxl/index
+*   spi
+*   i2c
+*   ipmb
+*   ipmi
+*   i3c/index
+*   interconnect
+*   devfreq
+*   hsi
+*   edac
+*   scsi
+*   libata
+*   target
+*   mailbox
+*   mtdnand
+*   miscellaneous
+*   mei/index
+*   mtd/index
+*   mmc/index
+*   nvdimm/index
+*   w1
+*   rapidio/index
+*   s390-drivers
+*   vme
+*   80211/index
+*   uio-howto
+*   firmware/index
+*   pin-control
+*   md/index
+*   media/index
+*   misc_devices
+*   nfc/index
+*   dmaengine/index
+*   slimbus
+*   soundwire/index
+*   thermal/index
+*   fpga/index
+*   acpi/index
+*   auxiliary_bus
+*   backlight/lp855x-driver.rst
+*   connector
+*   console
+*   dcdbas
+*   eisa
+*   isa
+*   isapnp
+*   io-mapping
+*   io_ordering
+*   generic-counter
+*   memory-devices/index
+*   men-chameleon-bus
+*   ntb
+*   nvmem
+*   parport-lowlevel
+*   pps
+*   ptp
+*   phy/index
+*   pwm
+*   pldmfw/index
+*   rfkill
+*   serial/index
+*   sm501
+*   surface_aggregator/index
+*   switchtec
+*   sync_file
+*   tty/index
+*   vfio-mediated-device
+*   vfio
+*   vfio-pci-device-specific-driver-acceptance
+*   xilinx/index
+*   xillybus
+*   zorro
+*   hte/index
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/translations/zh_CN/gpio.txt b/Documentation/translations/zh_CN/gpio.txt
deleted file mode 100644
index a23ee14fc927..000000000000
--- a/Documentation/translations/zh_CN/gpio.txt
+++ /dev/null
@@ -1,650 +0,0 @@
-Chinese translated version of Documentation/admin-guide/gpio
-
-If you have any comment or update to the content, please contact the
-original document maintainer directly.  However, if you have a problem
-communicating in English you can also ask the Chinese maintainer for
-help.  Contact the Chinese maintainer if this translation is outdated
-or if there is a problem with the translation.
-
-Maintainer: Grant Likely <grant.likely@secretlab.ca>
-		Linus Walleij <linus.walleij@linaro.org>
-Chinese maintainer: Fu Wei <tekkamanninja@gmail.com>
----------------------------------------------------------------------
-Documentation/admin-guide/gpio 的中文翻译
-
-如果想评论或更新本文的内容，请直接联系原文档的维护者。如果你使用英文
-交流有困难的话，也可以向中文版维护者求助。如果本翻译更新不及时或者翻
-译存在问题，请联系中文版维护者。
-英文版维护者： Grant Likely <grant.likely@secretlab.ca>
-		Linus Walleij <linus.walleij@linaro.org>
-中文版维护者： 傅炜 Fu Wei <tekkamanninja@gmail.com>
-中文版翻译者： 傅炜 Fu Wei <tekkamanninja@gmail.com>
-中文版校译者： 傅炜 Fu Wei <tekkamanninja@gmail.com>
-
-
-以下为正文
----------------------------------------------------------------------
-GPIO 接口
-
-本文档提供了一个在Linux下访问GPIO的公约概述。
-
-这些函数以 gpio_* 作为前缀。其他的函数不允许使用这样的前缀或相关的
-__gpio_* 前缀。
-
-
-什么是GPIO?
-==========
-"通用输入/输出口"(GPIO)是一个灵活的由软件控制的数字信号。他们可
-由多种芯片提供,且对于从事嵌入式和定制硬件的 Linux 开发者来说是
-比较熟悉。每个GPIO 都代表一个连接到特定引脚或球栅阵列(BGA)封装中
-“球珠”的一个位。电路板原理图显示了 GPIO 与外部硬件的连接关系。
-驱动可以编写成通用代码，以使板级启动代码可传递引脚配置数据给驱动。
-
-片上系统 (SOC) 处理器对 GPIO 有很大的依赖。在某些情况下,每个
-非专用引脚都可配置为 GPIO,且大多数芯片都最少有一些 GPIO。
-可编程逻辑器件(类似 FPGA) 可以方便地提供 GPIO。像电源管理和
-音频编解码器这样的多功能芯片经常留有一些这样的引脚来帮助那些引脚
-匮乏的 SOC。同时还有通过 I2C 或 SPI 串行总线连接的“GPIO扩展器”
-芯片。大多数 PC 的南桥有一些拥有 GPIO 能力的引脚 (只有BIOS
-固件才知道如何使用他们)。
-
-GPIO 的实际功能因系统而异。通常用法有:
-
-  - 输出值可写 (高电平=1，低电平=0)。一些芯片也有如何驱动这些值的选项,
-    例如只允许输出一个值、支持“线与”及其他取值类似的模式(值得注意的是
-    “开漏”信号)
-
-  - 输入值可读(1、0)。一些芯片支持引脚在配置为“输出”时回读，这对于类似
-    “线与”的情况(以支持双向信号)是非常有用的。GPIO 控制器可能有输入
-    去毛刺/消抖逻辑,这有时需要软件控制。
-
-  - 输入通常可作为 IRQ 信号,一般是沿触发,但有时是电平触发。这样的 IRQ
-    可能配置为系统唤醒事件,以将系统从低功耗状态下唤醒。
-
-  - 通常一个 GPIO 根据不同产品电路板的需求,可以配置为输入或输出,也有仅
-    支持单向的。
-
-  - 大部分 GPIO 可以在持有自旋锁时访问,但是通常由串行总线扩展的 GPIO
-    不允许持有自旋锁。但某些系统也支持这种类型。
-
-对于给定的电路板,每个 GPIO 都用于某个特定的目的,如监控 MMC/SD 卡的
-插入/移除、检测卡的写保护状态、驱动 LED、配置收发器、模拟串行总线、
-复位硬件看门狗、感知开关状态等等。
-
-
-GPIO 公约
-=========
-注意,这个叫做“公约”，因为这不是强制性的，不遵循这个公约是无伤大雅的，
-因为此时可移植性并不重要。GPIO 常用于板级特定的电路逻辑,甚至可能
-随着电路板的版本而改变，且不可能在不同走线的电路板上使用。仅有在少数
-功能上才具有可移植性，其他功能是平台特定。这也是由于“胶合”的逻辑造成的。
-
-此外，这不需要任何的执行框架，只是一个接口。某个平台可能通过一个简单地
-访问芯片寄存器的内联函数来实现它，其他平台可能通过委托一系列不同的GPIO
-控制器的抽象函数来实现它。(有一些可选的代码能支持这种策略的实现,本文档
-后面会介绍，但作为 GPIO 接口的客户端驱动程序必须与它的实现无关。)
-
-也就是说,如果在他们的平台上支持这个公约，驱动应尽可能的使用它。同时，平台
-必须在 Kconfig 中选择 ARCH_REQUIRE_GPIOLIB 或者 ARCH_WANT_OPTIONAL_GPIOLIB
-选项。那些调用标准 GPIO 函数的驱动应该在 Kconfig 入口中声明依赖GENERIC_GPIO。
-当驱动包含文件:
-
-	#include <linux/gpio.h>
-
-则 GPIO 函数是可用,无论是“真实代码”还是经优化过的语句。如果你遵守
-这个公约，当你的代码完成后，对其他的开发者来说会更容易看懂和维护。
-
-注意，这些操作包含所用平台的 I/O 屏障代码，驱动无须显式地调用他们。
-
-
-标识 GPIO
----------
-GPIO 是通过无符号整型来标识的,范围是 0 到 MAX_INT。保留“负”数
-用于其他目的,例如标识信号“在这个板子上不可用”或指示错误。未接触底层
-硬件的代码会忽略这些整数。
-
-平台会定义这些整数的用法,且通常使用 #define 来定义 GPIO，这样
-板级特定的启动代码可以直接关联相应的原理图。相对来说，驱动应该仅使用
-启动代码传递过来的 GPIO 编号，使用 platform_data 保存板级特定
-引脚配置数据 (同时还有其他须要的板级特定数据)，避免可能出现的问题。
-
-例如一个平台使用编号 32-159 来标识 GPIO,而在另一个平台使用编号0-63
-标识一组 GPIO 控制器,64-79标识另一类 GPIO 控制器,且在一个含有
-FPGA 的特定板子上使用 80-95。编号不一定要连续,那些平台中，也可以
-使用编号2000-2063来标识一个 I2C 接口的 GPIO 扩展器中的 GPIO。
-
-如果你要初始化一个带有无效 GPIO 编号的结构体,可以使用一些负编码
-(如"-EINVAL")，那将使其永远不会是有效。来测试这样一个结构体中的编号
-是否关联一个 GPIO，你可使用以下断言:
-
-	int gpio_is_valid(int number);
-
-如果编号不存在，则请求和释放 GPIO 的函数将拒绝执行相关操作(见下文)。
-其他编号也可能被拒绝,比如一个编号可能存在，但暂时在给定的电路上不可用。
-
-一个平台是否支持多个 GPIO 控制器为平台特定的实现问题，就像是否可以
-在 GPIO 编号空间中有“空洞”和是否可以在运行时添加新的控制器一样。
-这些问题会影响其他事情，包括相邻的 GPIO 编号是否存在等。
-
-使用 GPIO
----------
-对于一个 GPIO，系统应该做的第一件事情就是通过 gpio_request()
-函数分配它，见下文。
-
-接下来是设置I/O方向，这通常是在板级启动代码中为所使用的 GPIO 设置
-platform_device 时完成。
-
-	/* 设置为输入或输出, 返回 0 或负的错误代码 */
-	int gpio_direction_input(unsigned gpio);
-	int gpio_direction_output(unsigned gpio, int value);
-
-返回值为零代表成功，否则返回一个负的错误代码。这个返回值需要检查，因为
-get/set(获取/设置)函数调用没法返回错误,且有可能是配置错误。通常，
-你应该在进程上下文中调用这些函数。然而,对于自旋锁安全的 GPIO，在板子
-启动的早期、进程启动前使用他们也是可以的。
-
-对于作为输出的 GPIO，为其提供初始输出值，对于避免在系统启动期间出现
-信号毛刺是很有帮助的。
-
-为了与传统的 GPIO 接口兼容, 在设置一个 GPIO 方向时，如果它还未被申请，
-则隐含了申请那个 GPIO 的操作(见下文)。这种兼容性正在从可选的 gpiolib
-框架中移除。
-
-如果这个 GPIO 编码不存在，或者特定的 GPIO 不能用于那种模式，则方向
-设置可能失败。依赖启动固件来正确地设置方向通常是一个坏主意，因为它可能
-除了启动Linux，并没有做更多的验证工作。(同理, 板子的启动代码可能需要
-将这个复用的引脚设置为 GPIO，并正确地配置上拉/下拉电阻。)
-
-
-访问自旋锁安全的 GPIO
--------------------
-大多数 GPIO 控制器可以通过内存读/写指令来访问。这些指令不会休眠,可以
-安全地在硬(非线程)中断例程和类似的上下文中完成。
-
-对于那些用 gpio_cansleep()测试总是返回失败的 GPIO(见下文)，使用
-以下的函数访问:
-
-	/* GPIO 输入:返回零或非零 */
-	int gpio_get_value(unsigned gpio);
-
-	/* GPIO 输出 */
-	void gpio_set_value(unsigned gpio, int value);
-
-GPIO值是布尔值，零表示低电平，非零表示高电平。当读取一个输出引脚的值时，
-返回值应该是引脚上的值。这个值不总是和输出值相符，因为存在开漏输出信号和
-输出延迟问题。
-
-以上的 get/set 函数无错误返回值，因为之前 gpio_direction_*()应已检查过
-其是否为“无效GPIO”。此外，还需要注意的是并不是所有平台都可以从输出引脚
-中读取数据，对于不能读取的引脚应总返回零。另外，对那些在原子上下文中无法
-安全访问的 GPIO (译者注：因为访问可能导致休眠)使用这些函数是不合适的
-(见下文)。
-
-在 GPIO 编号(还有输出、值)为常数的情况下,鼓励通过平台特定的实现来优化
-这两个函数来访问 GPIO 值。这种情况(读写一个硬件寄存器)下只需要几条指令
-是很正常的,且无须自旋锁。这种优化函数比起那些在子程序上花费许多指令的
-函数可以使得模拟接口(译者注:例如 GPIO 模拟 I2C、1-wire 或 SPI)的
-应用(在空间和时间上都)更具效率。
-
-
-访问可能休眠的 GPIO
------------------
-某些 GPIO 控制器必须通过基于总线(如 I2C 或 SPI)的消息访问。读或写这些
-GPIO 值的命令需要等待其信息排到队首才发送命令，再获得其反馈。期间需要
-休眠，这不能在 IRQ 例程(中断上下文)中执行。
-
-支持此类 GPIO 的平台通过以下函数返回非零值来区分出这种 GPIO。(此函数需要
-一个之前通过 gpio_request 分配到的有效 GPIO 编号):
-
-	int gpio_cansleep(unsigned gpio);
-
-为了访问这种 GPIO,内核定义了一套不同的函数:
-
-	/* GPIO 输入:返回零或非零 ,可能会休眠 */
-	int gpio_get_value_cansleep(unsigned gpio);
-
-	/* GPIO 输出,可能会休眠 */
-	void gpio_set_value_cansleep(unsigned gpio, int value);
-
-
-访问这样的 GPIO 需要一个允许休眠的上下文，例如线程 IRQ 处理例程，并用以上的
-访问函数替换那些没有 cansleep()后缀的自旋锁安全访问函数。
-
-除了这些访问函数可能休眠，且它们操作的 GPIO 不能在硬件 IRQ 处理例程中访问的
-事实，这些处理例程实际上和自旋锁安全的函数是一样的。
-
-** 除此之外 ** 调用设置和配置此类 GPIO 的函数也必须在允许休眠的上下文中，
-因为它们可能也需要访问 GPIO 控制器芯片: (这些设置函数通常在板级启动代码或者
-驱动探测/断开代码中，所以这是一个容易满足的约束条件。)
-
-	gpio_direction_input()
-	gpio_direction_output()
-	gpio_request()
-
-## 	gpio_request_one()
-##	gpio_request_array()
-## 	gpio_free_array()
-
-	gpio_free()
-	gpio_set_debounce()
-
-
-
-声明和释放 GPIO
-----------------------------
-为了有助于捕获系统配置错误,定义了两个函数。
-
-	/* 申请 GPIO, 返回 0 或负的错误代码.
-	 * 非空标签可能有助于诊断.
-	 */
-	int gpio_request(unsigned gpio, const char *label);
-
-	/* 释放之前声明的 GPIO */
-	void gpio_free(unsigned gpio);
-
-将无效的 GPIO 编码传递给 gpio_request()会导致失败，申请一个已使用这个
-函数声明过的 GPIO 也会失败。gpio_request()的返回值必须检查。你应该在
-进程上下文中调用这些函数。然而,对于自旋锁安全的 GPIO,在板子启动的早期、
-进入进程之前是可以申请的。
-
-这个函数完成两个基本的目标。一是标识那些实际上已作为 GPIO 使用的信号线，
-这样便于更好地诊断;系统可能需要服务几百个可用的 GPIO，但是对于任何一个
-给定的电路板通常只有一些被使用。另一个目的是捕获冲突，查明错误:如两个或
-更多驱动错误地认为他们已经独占了某个信号线,或是错误地认为移除一个管理着
-某个已激活信号的驱动是安全的。也就是说，申请 GPIO 的作用类似一种锁机制。
-
-某些平台可能也使用 GPIO 作为电源管理激活信号(例如通过关闭未使用芯片区和
-简单地关闭未使用时钟)。
-
-对于 GPIO 使用 pinctrl 子系统已知的引脚，子系统应该被告知其使用情况；
-一个 gpiolib 驱动的 .request()操作应调用 pinctrl_gpio_request()，
-而 gpiolib 驱动的 .free()操作应调用 pinctrl_gpio_free()。pinctrl
-子系统允许 pinctrl_gpio_request()在某个引脚或引脚组以复用形式“属于”
-一个设备时都成功返回。
-
-任何须将 GPIO 信号导向适当引脚的引脚复用硬件的编程应该发生在 GPIO
-驱动的 .direction_input()或 .direction_output()函数中，以及
-任何输出 GPIO 值的设置之后。这样可使从引脚特殊功能到 GPIO 的转换
-不会在引脚产生毛刺波形。有时当用一个 GPIO 实现其信号驱动一个非 GPIO
-硬件模块的解决方案时，就需要这种机制。
-
-某些平台允许部分或所有 GPIO 信号使用不同的引脚。类似的，GPIO 或引脚的
-其他方面也需要配置，如上拉/下拉。平台软件应该在对这些 GPIO 调用
-gpio_request()前将这类细节配置好，例如使用 pinctrl 子系统的映射表，
-使得 GPIO 的用户无须关注这些细节。
-
-还有一个值得注意的是在释放 GPIO 前，你必须停止使用它。
-
-
-注意:申请一个 GPIO 并没有以任何方式配置它，只不过标识那个 GPIO 处于使用
-状态。必须有另外的代码来处理引脚配置(如控制 GPIO 使用的引脚、上拉/下拉)。
-考虑到大多数情况下声明 GPIO 之后就会立即配置它们,所以定义了以下三个辅助函数:
-
-	/* 申请一个 GPIO 信号, 同时通过特定的'flags'初始化配置,
-	 * 其他和 gpio_request()的参数和返回值相同
-	 *
-	 */
-	int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
-
-	/* 在单个函数中申请多个 GPIO
-	 */
-	int gpio_request_array(struct gpio *array, size_t num);
-
-	/* 在单个函数中释放多个 GPIO
-	 */
-	void gpio_free_array(struct gpio *array, size_t num);
-
-这里 'flags' 当前定义可指定以下属性:
-
-	* GPIOF_DIR_IN		- 配置方向为输入
-	* GPIOF_DIR_OUT		- 配置方向为输出
-
-	* GPIOF_INIT_LOW	- 在作为输出时,初始值为低电平
-	* GPIOF_INIT_HIGH	- 在作为输出时,初始值为高电平
-	* GPIOF_OPEN_DRAIN	- gpio引脚为开漏信号
-	* GPIOF_OPEN_SOURCE	- gpio引脚为源极开路信号
-
-	* GPIOF_EXPORT_DIR_FIXED	- 将 gpio 导出到 sysfs，并保持方向
-	* GPIOF_EXPORT_DIR_CHANGEABLE	- 同样是导出, 但允许改变方向
-
-因为 GPIOF_INIT_* 仅有在配置为输出的时候才存在,所以有效的组合为:
-
-	* GPIOF_IN		- 配置为输入
-	* GPIOF_OUT_INIT_LOW	- 配置为输出,并初始化为低电平
-	* GPIOF_OUT_INIT_HIGH	- 配置为输出,并初始化为高电平
-
-当设置 flag 为 GPIOF_OPEN_DRAIN 时，则假设引脚是开漏信号。这样的引脚
-将不会在输出模式下置1。这样的引脚需要连接上拉电阻。通过使能这个标志，gpio库
-将会在被要求输出模式下置1时将引脚变为输入状态来使引脚置高。引脚在输出模式下
-通过置0使其输出低电平。
-
-当设置 flag 为 GPIOF_OPEN_SOURCE 时，则假设引脚为源极开路信号。这样的引脚
-将不会在输出模式下置0。这样的引脚需要连接下拉电阻。通过使能这个标志，gpio库
-将会在被要求输出模式下置0时将引脚变为输入状态来使引脚置低。引脚在输出模式下
-通过置1使其输出高电平。
-
-将来这些标志可能扩展到支持更多的属性。
-
-更进一步,为了更简单地声明/释放多个 GPIO,'struct gpio'被引进来封装所有
-这三个领域:
-
-	struct gpio {
-		unsigned	gpio;
-		unsigned long	flags;
-		const char	*label;
-	};
-
-一个典型的用例:
-
-	static struct gpio leds_gpios[] = {
-		{ 32, GPIOF_OUT_INIT_HIGH, "Power LED" }, /* 默认开启 */
-		{ 33, GPIOF_OUT_INIT_LOW,  "Green LED" }, /* 默认关闭 */
-		{ 34, GPIOF_OUT_INIT_LOW,  "Red LED"   }, /* 默认关闭 */
-		{ 35, GPIOF_OUT_INIT_LOW,  "Blue LED"  }, /* 默认关闭 */
-		{ ... },
-	};
-
-	err = gpio_request_one(31, GPIOF_IN, "Reset Button");
-	if (err)
-		...
-
-	err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-	if (err)
-		...
-
-	gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-
-
-GPIO 映射到 IRQ
---------------------
-GPIO 编号是无符号整数;IRQ 编号也是。这些构成了两个逻辑上不同的命名空间
-(GPIO 0 不一定使用 IRQ 0)。你可以通过以下函数在它们之间实现映射:
-
-	/* 映射 GPIO 编号到 IRQ 编号 */
-	int gpio_to_irq(unsigned gpio);
-
-	/* 映射 IRQ 编号到 GPIO 编号 (尽量避免使用) */
-	int irq_to_gpio(unsigned irq);
-
-它们的返回值为对应命名空间的相关编号，或是负的错误代码(如果无法映射)。
-(例如,某些 GPIO 无法做为 IRQ 使用。)以下的编号错误是未经检测的:使用一个
-未通过 gpio_direction_input()配置为输入的 GPIO 编号，或者使用一个
-并非来源于gpio_to_irq()的 IRQ 编号。
-
-这两个映射函数可能会在信号编号的加减计算过程上花些时间。它们不可休眠。
-
-gpio_to_irq()返回的非错误值可以传递给 request_irq()或者 free_irq()。
-它们通常通过板级特定的初始化代码存放到平台设备的 IRQ 资源中。注意:IRQ
-触发选项是 IRQ 接口的一部分，如 IRQF_TRIGGER_FALLING，系统唤醒能力
-也是如此。
-
-irq_to_gpio()返回的非错误值大多数通常可以被 gpio_get_value()所使用，
-比如在 IRQ 是沿触发时初始化或更新驱动状态。注意某些平台不支持反映射,所以
-你应该尽量避免使用它。
-
-
-模拟开漏信号
-----------------------------
-有时在只有低电平信号作为实际驱动结果(译者注:多个输出连接于一点，逻辑电平
-结果为所有输出的逻辑与)的时候,共享的信号线需要使用“开漏”信号。(该术语
-适用于 CMOS 管；而 TTL 用“集电极开路”。)一个上拉电阻使信号为高电平。这
-有时被称为“线与”。实际上，从负逻辑(低电平为真)的角度来看，这是一个“线或”。
-
-一个开漏信号的常见例子是共享的低电平使能 IRQ 信号线。此外,有时双向数据总线
-信号也使用漏极开路信号。
-
-某些 GPIO 控制器直接支持开漏输出，还有许多不支持。当你需要开漏信号，但
-硬件又不直接支持的时候，一个常用的方法是用任何即可作输入也可作输出的 GPIO
-引脚来模拟:
-
- LOW:	gpio_direction_output(gpio, 0) ... 这代码驱动信号并覆盖
-	上拉配置。
-
- HIGH:	gpio_direction_input(gpio) ... 这代码关闭输出,所以上拉电阻
-	(或其他的一些器件)控制了信号。
-
-如果你将信号线“驱动”为高电平，但是 gpio_get_value(gpio)报告了一个
-低电平(在适当的上升时间后)，你就可以知道是其他的一些组件将共享信号线拉低了。
-这不一定是错误的。一个常见的例子就是 I2C 时钟的延长：一个需要较慢时钟的
-从设备延迟 SCK 的上升沿，而 I2C 主设备相应地调整其信号传输速率。
-
-
-这些公约忽略了什么?
-================
-这些公约忽略的最大一件事就是引脚复用，因为这属于高度芯片特定的属性且
-没有可移植性。某个平台可能不需要明确的复用信息；有的对于任意给定的引脚
-可能只有两个功能选项；有的可能每个引脚有八个功能选项；有的可能可以将
-几个引脚中的任何一个作为给定的 GPIO。(是的，这些例子都来自于当前运行
-Linux 的系统。)
-
-在某些系统中,与引脚复用相关的是配置和使能集成的上、下拉模式。并不是所有
-平台都支持这种模式,或者不会以相同的方式来支持这种模式；且任何给定的电路板
-可能使用外置的上拉(或下拉)电阻,这时芯片上的就不应该使用。(当一个电路需要
-5kOhm 的拉动电阻,芯片上的 100 kOhm 电阻就不能做到。)同样的，驱动能力
-(2 mA vs 20 mA)和电压(1.8V vs 3.3V)是平台特定问题,就像模型一样在
-可配置引脚和 GPIO 之间(没)有一一对应的关系。
-
-还有其他一些系统特定的机制没有在这里指出，例如上述的输入去毛刺和线与输出
-选项。硬件可能支持批量读或写 GPIO，但是那一般是配置相关的：对于处于同一
-块区(bank)的GPIO。(GPIO 通常以 16 或 32 个组成一个区块，一个给定的
-片上系统一般有几个这样的区块。)某些系统可以通过输出 GPIO 触发 IRQ，
-或者从并非以 GPIO 管理的引脚取值。这些机制的相关代码没有必要具有可移植性。
-
-当前，动态定义 GPIO 并不是标准的，例如作为配置一个带有某些 GPIO 扩展器的
-附加电路板的副作用。
-
-GPIO 实现者的框架 (可选)
-=====================
-前面提到了，有一个可选的实现框架，让平台使用相同的编程接口，更加简单地支持
-不同种类的 GPIO 控制器。这个框架称为"gpiolib"。
-
-作为一个辅助调试功能，如果 debugfs 可用，就会有一个 /sys/kernel/debug/gpio
-文件。通过这个框架，它可以列出所有注册的控制器,以及当前正在使用中的 GPIO
-的状态。
-
-
-控制器驱动: gpio_chip
--------------------
-在框架中每个 GPIO 控制器都包装为一个 "struct gpio_chip"，他包含了
-该类型的每个控制器的常用信息:
-
- - 设置 GPIO 方向的方法
- - 用于访问 GPIO 值的方法
- - 告知调用其方法是否可能休眠的标志
- - 可选的 debugfs 信息导出方法 (显示类似上拉配置一样的额外状态)
- - 诊断标签
-
-也包含了来自 device.platform_data 的每个实例的数据：它第一个 GPIO 的
-编号和它可用的 GPIO 的数量。
-
-实现 gpio_chip 的代码应支持多控制器实例，这可能使用驱动模型。那些代码要
-配置每个 gpio_chip，并发起gpiochip_add()。卸载一个 GPIO 控制器很少见，
-但在必要的时候可以使用 gpiochip_remove()。
-
-大部分 gpio_chip 是一个实例特定结构体的一部分，而并不将 GPIO 接口单独
-暴露出来,比如编址、电源管理等。类似编解码器这样的芯片会有复杂的非 GPIO
-状态。
-
-任何一个 debugfs 信息导出方法通常应该忽略还未申请作为 GPIO 的信号线。
-他们可以使用 gpiochip_is_requested()测试，当这个 GPIO 已经申请过了
-就返回相关的标签，否则返回 NULL。
-
-
-平台支持
--------
-为了支持这个框架，一个平台的 Kconfig 文件将会 "select"(选择)
-ARCH_REQUIRE_GPIOLIB 或 ARCH_WANT_OPTIONAL_GPIOLIB，并让它的
-<asm/gpio.h> 包含 <asm-generic/gpio.h>，同时定义三个方法:
-gpio_get_value()、gpio_set_value()和 gpio_cansleep()。
-
-它也应提供一个 ARCH_NR_GPIOS 的定义值，这样可以更好地反映该平台 GPIO
-的实际数量,节省静态表的空间。(这个定义值应该包含片上系统内建 GPIO 和
-GPIO 扩展器中的数据。)
-
-ARCH_REQUIRE_GPIOLIB 意味着 gpiolib 核心在这个构架中将总是编译进内核。
-
-ARCH_WANT_OPTIONAL_GPIOLIB 意味着 gpiolib 核心默认关闭,且用户可以
-使能它,并将其编译进内核(可选)。
-
-如果这些选项都没被选择,该平台就不通过 GPIO-lib 支持 GPIO,且代码不可以
-被用户使能。
-
-以下这些方法的实现可以直接使用框架代码,并总是通过 gpio_chip 调度:
-
-  #define gpio_get_value	__gpio_get_value
-  #define gpio_set_value	__gpio_set_value
-  #define gpio_cansleep		__gpio_cansleep
-
-这些定义可以用更理想的实现方法替代，那就是使用经过逻辑优化的内联函数来访问
-基于特定片上系统的 GPIO。例如,若引用的 GPIO (寄存器位偏移)是常量“12”，
-读取或设置它可能只需少则两或三个指令，且不会休眠。当这样的优化无法实现时，
-那些函数必须使用框架提供的代码，那就至少要几十条指令才可以实现。对于用 GPIO
-模拟的 I/O 接口, 如此精简指令是很有意义的。
-
-对于片上系统，平台特定代码为片上 GPIO 每个区(bank)定义并注册 gpio_chip
-实例。那些 GPIO 应该根据芯片厂商的文档进行编码/标签,并直接和电路板原理图
-对应。他们应该开始于零并终止于平台特定的限制。这些 GPIO(代码)通常从
-arch_initcall()或者更早的地方集成进平台初始化代码，使这些 GPIO 总是可用，
-且他们通常可以作为 IRQ 使用。
-
-板级支持
--------
-对于外部 GPIO 控制器(例如 I2C 或 SPI 扩展器、专用芯片、多功能器件、FPGA
-或 CPLD)，大多数常用板级特定代码都可以注册控制器设备，并保证他们的驱动知道
-gpiochip_add()所使用的 GPIO 编号。他们的起始编号通常跟在平台特定的 GPIO
-编号之后。
-
-例如板级启动代码应该创建结构体指明芯片公开的 GPIO 范围，并使用 platform_data
-将其传递给每个 GPIO 扩展器芯片。然后芯片驱动中的 probe()例程可以将这个
-数据传递给 gpiochip_add()。
-
-初始化顺序很重要。例如，如果一个设备依赖基于 I2C 的(扩展)GPIO，那么它的
-probe()例程就应该在那个 GPIO 有效以后才可以被调用。这意味着设备应该在
-GPIO 可以工作之后才可被注册。解决这类依赖的的一种方法是让这种 gpio_chip
-控制器向板级特定代码提供 setup()和 teardown()回调函数。一旦所有必须的
-资源可用之后，这些板级特定的回调函数将会注册设备，并可以在这些 GPIO 控制器
-设备变成无效时移除它们。
-
-
-用户空间的 Sysfs 接口(可选)
-========================
-使用“gpiolib”实现框架的平台可以选择配置一个 GPIO 的 sysfs 用户接口。
-这不同于 debugfs 接口，因为它提供的是对 GPIO方向和值的控制，而不只显示
-一个GPIO 的状态摘要。此外,它可以出现在没有调试支持的产品级系统中。
-
-例如，通过适当的系统硬件文档，用户空间可以知道 GIOP #23 控制 Flash
-存储器的写保护(用于保护其中 Bootloader 分区)。产品的系统升级可能需要
-临时解除这个保护：首先导入一个 GPIO，改变其输出状态，然后在重新使能写保护
-前升级代码。通常情况下,GPIO #23 是不会被触及的，并且内核也不需要知道他。
-
-根据适当的硬件文档，某些系统的用户空间 GPIO 可以用于确定系统配置数据，
-这些数据是标准内核不知道的。在某些任务中，简单的用户空间 GPIO 驱动可能是
-系统真正需要的。
-
-注意：标准内核驱动中已经存在通用的“LED 和按键”GPIO 任务，分别是:
-"leds-gpio" 和 "gpio_keys"。请使用这些来替代直接访问 GPIO，因为集成在
-内核框架中的这类驱动比你在用户空间的代码更好。
-
-
-Sysfs 中的路径
---------------
-在/sys/class/gpio 中有 3 类入口:
-
-   -	用于在用户空间控制 GPIO 的控制接口;
-
-   -	GPIOs 本身;以及
-
-   -	GPIO 控制器 ("gpio_chip" 实例)。
-
-除了这些标准的文件,还包含“device”符号链接。
-
-控制接口是只写的:
-
-    /sys/class/gpio/
-
-    	"export" ... 用户空间可以通过写其编号到这个文件，要求内核导出
-		一个 GPIO 的控制到用户空间。
-
-		例如: 如果内核代码没有申请 GPIO #19,"echo 19 > export"
-		将会为 GPIO #19 创建一个 "gpio19" 节点。
-
-    	"unexport" ... 导出到用户空间的逆操作。
-
-		例如: "echo 19 > unexport" 将会移除使用"export"文件导出的
-		"gpio19" 节点。
-
-GPIO 信号的路径类似 /sys/class/gpio/gpio42/ (对于 GPIO #42 来说)，
-并有如下的读/写属性:
-
-    /sys/class/gpio/gpioN/
-
-	"direction" ... 读取得到 "in" 或 "out"。这个值通常运行写入。
-		写入"out" 时,其引脚的默认输出为低电平。为了确保无故障运行，
-		"low" 或 "high" 的电平值应该写入 GPIO 的配置，作为初始输出值。
-
-		注意:如果内核不支持改变 GPIO 的方向，或者在导出时内核代码没有
-		明确允许用户空间可以重新配置 GPIO 方向，那么这个属性将不存在。
-
-	"value" ... 读取得到 0 (低电平) 或 1 (高电平)。如果 GPIO 配置为
-		输出,这个值允许写操作。任何非零值都以高电平看待。
-
-		如果引脚可以配置为中断信号，且如果已经配置了产生中断的模式
-		（见"edge"的描述），你可以对这个文件使用轮询操作(poll(2))，
-		且轮询操作会在任何中断触发时返回。如果你使用轮询操作(poll(2))，
-		请在 events 中设置 POLLPRI 和 POLLERR。如果你使用轮询操作
-		(select(2))，请在 exceptfds 设置你期望的文件描述符。在
-		轮询操作(poll(2))返回之后，既可以通过 lseek(2)操作读取
-		sysfs 文件的开始部分，也可以关闭这个文件并重新打开它来读取数据。
-
-	"edge" ... 读取得到“none”、“rising”、“falling”或者“both”。
-		将这些字符串写入这个文件可以选择沿触发模式，会使得轮询操作
-		(select(2))在"value"文件中返回。
-
-		这个文件仅有在这个引脚可以配置为可产生中断输入引脚时，才存在。
-
-	"active_low" ... 读取得到 0 (假) 或 1 (真)。写入任何非零值可以
-		翻转这个属性的(读写)值。已存在或之后通过"edge"属性设置了"rising"
-		和 "falling" 沿触发模式的轮询操作(poll(2))将会遵循这个设置。
-
-GPIO 控制器的路径类似 /sys/class/gpio/gpiochip42/ (对于从#42 GPIO
-开始实现控制的控制器),并有着以下只读属性:
-
-    /sys/class/gpio/gpiochipN/
-
-    	"base" ... 与以上的 N 相同,代表此芯片管理的第一个 GPIO 的编号
-
-    	"label" ... 用于诊断 (并不总是只有唯一值)
-
-    	"ngpio" ... 此控制器所管理的 GPIO 数量(而 GPIO 编号从 N 到
-    		N + ngpio - 1)
-
-大多数情况下,电路板的文档应当标明每个 GPIO 的使用目的。但是那些编号并不总是
-固定的,例如在扩展卡上的 GPIO会根据所使用的主板或所在堆叠架构中其他的板子而
-有所不同。在这种情况下,你可能需要使用 gpiochip 节点(尽可能地结合电路图)来
-确定给定信号所用的 GPIO 编号。
-
-
-从内核代码中导出
--------------
-内核代码可以明确地管理那些已通过 gpio_request()申请的 GPIO 的导出:
-
-	/* 导出 GPIO 到用户空间 */
-	int gpio_export(unsigned gpio, bool direction_may_change);
-
-	/* gpio_export()的逆操作 */
-	void gpio_unexport();
-
-	/* 创建一个 sysfs 连接到已导出的 GPIO 节点 */
-	int gpio_export_link(struct device *dev, const char *name,
-		unsigned gpio)
-
-在一个内核驱动申请一个 GPIO 之后，它可以通过 gpio_export()使其在 sysfs
-接口中可见。该驱动可以控制信号方向是否可修改。这有助于防止用户空间代码无意间
-破坏重要的系统状态。
-
-这个明确的导出有助于(通过使某些实验更容易来)调试，也可以提供一个始终存在的接口，
-与文档配合作为板级支持包的一部分。
-
-在 GPIO 被导出之后，gpio_export_link()允许在 sysfs 文件系统的任何地方
-创建一个到这个 GPIO sysfs 节点的符号链接。这样驱动就可以通过一个描述性的
-名字，在 sysfs 中他们所拥有的设备下提供一个(到这个 GPIO sysfs 节点的)接口。
diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst
index bf85baca8b3e..4f04367a4c5e 100644
--- a/Documentation/translations/zh_CN/index.rst
+++ b/Documentation/translations/zh_CN/index.rst
@@ -108,6 +108,7 @@ TODOList:
    :maxdepth: 2
 
    core-api/index
+   driver-api/index
    locking/index
    accounting/index
    cpu-freq/index
@@ -123,7 +124,6 @@ TODOList:
 
 TODOList:
 
-* driver-api/index
 * block/index
 * cdrom/index
 * ide/index
-- 
cgit v1.2.3


From 74a3c2aefe04f381ca14d0e589c62edd4af94b4c Mon Sep 17 00:00:00 2001
From: Eric Lin <dslin1010@gmail.com>
Date: Thu, 11 Aug 2022 17:15:16 +0800
Subject: Documentation: irqdomain: Fix typo of "at least once"

Signed-off-by: Eric Lin <dslin1010@gmail.com>
Link: https://lore.kernel.org/r/20220811091516.2107908-1-dslin1010@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/core-api/irq/irq-domain.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst
index d30b4d0a9769..f88a6ee67a35 100644
--- a/Documentation/core-api/irq/irq-domain.rst
+++ b/Documentation/core-api/irq/irq-domain.rst
@@ -71,7 +71,7 @@ variety of methods:
 Note that irq domain lookups must happen in contexts that are
 compatible with a RCU read-side critical section.
 
-The irq_create_mapping() function must be called *atleast once*
+The irq_create_mapping() function must be called *at least once*
 before any call to irq_find_mapping(), lest the descriptor will not
 be allocated.
 
-- 
cgit v1.2.3


From 97024e159ff5ab9a1d1a769d351184fadffebfe7 Mon Sep 17 00:00:00 2001
From: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Date: Mon, 8 Aug 2022 17:31:49 -0400
Subject: maintainer-pgp-guide: use key terminology consistent with upstream

GnuPG does not use the word "master key" when referring to the subkey
marked with the "certification" capability. Our use of this term was not
only inconsistent, but also misleading, because in real life "master
keys" are able to open multiple locks made for different keys, while PGP
Certify key has no such capability.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220727-docs-pgp-guide-v2-1-e3e6954affb6@linuxfoundation.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/maintainer-pgp-guide.rst | 118 +++++++++++++------------
 1 file changed, 63 insertions(+), 55 deletions(-)

diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index 29e7d7b1cd44..7dada4eaedca 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -133,45 +133,56 @@ daily cronjob::
 Check the full path to your ``gpg`` or ``gpg2`` command and use the
 ``gpg2`` command if regular ``gpg`` for you is the legacy GnuPG v.1.
 
-.. _master_key:
+.. _protect_your_key:
 
-Protect your master PGP key
-===========================
+Protect your PGP key
+====================
 
 This guide assumes that you already have a PGP key that you use for Linux
 kernel development purposes. If you do not yet have one, please see the
 "`Protecting Code Integrity`_" document mentioned earlier for guidance
 on how to create a new one.
 
-You should also make a new key if your current one is weaker than 2048 bits
-(RSA).
-
-Master key vs. Subkeys
-----------------------
-
-Subkeys are fully independent PGP keypairs that are tied to the "master"
-key using certifying key signatures (certificates). It is important to
-understand the following:
-
-1. There are no technical differences between the "master key" and "subkeys."
-2. At creation time, we assign functional limitations to each key by
-   giving it specific capabilities.
-3. A PGP key can have 4 capabilities:
-
-   - **[S]** key can be used for signing
-   - **[E]** key can be used for encryption
-   - **[A]** key can be used for authentication
-   - **[C]** key can be used for certifying other keys
-
-4. A single key may have multiple capabilities.
-5. A subkey is fully independent from the master key. A message
-   encrypted to a subkey cannot be decrypted with the master key. If you
-   lose your private subkey, it cannot be recreated from the master key
-   in any way.
-
-The key carrying the **[C]** (certify) capability is considered the
-"master" key because it is the only key that can be used to indicate
-relationship with other keys. Only the **[C]** key can be used to:
+You should also make a new key if your current one is weaker than 2048
+bits (RSA).
+
+Understanding PGP Subkeys
+-------------------------
+
+A PGP key rarely consists of a single keypair -- usually it is a
+collection of independent subkeys that can be used for different
+purposes based on their capabilities, assigned at their creation time.
+PGP defines four capabilities that a key can have:
+
+- **[S]** keys can be used for signing
+- **[E]** keys can be used for encryption
+- **[A]** keys can be used for authentication
+- **[C]** keys can be used for certifying other keys
+
+The key with the **[C]** capability is often called the "master" key,
+but this terminology is misleading because it implies that the Certify
+key can be used in place of any of other subkey on the same chain (like
+a physical "master key" can be used to open the locks made for other
+keys). Since this is not the case, this guide will refer to it as "the
+Certify key" to avoid any ambiguity.
+
+It is critical to fully understand the following:
+
+1. All subkeys are fully independent from each other. If you lose a
+   private subkey, it cannot be restored or recreated from any other
+   private key on your chain.
+2. With the exception of the Certify key, there can be multiple subkeys
+   with identical capabilities (e.g. you can have 2 valid encryption
+   subkeys, 3 valid signing subkeys, but only one valid certification
+   subkey). All subkeys are fully independent -- a message encrypted to
+   one **[E]** subkey cannot be decrypted with any other **[E]** subkey
+   you may also have.
+3. A single subkey may have multiple capabilities (e.g. your **[C]** key
+   can also be your **[S]** key).
+
+The key carrying the **[C]** (certify) capability is the only key that
+can be used to indicate relationship with other keys. Only the **[C]**
+key can be used to:
 
 - add or revoke other keys (subkeys) with S/E/A capabilities
 - add, change or revoke identities (uids) associated with the key
@@ -180,7 +191,7 @@ relationship with other keys. Only the **[C]** key can be used to:
 
 By default, GnuPG creates the following when generating new keys:
 
-- A master key carrying both Certify and Sign capabilities (**[SC]**)
+- One subkey carrying both Certify and Sign capabilities (**[SC]**)
 - A separate subkey with the Encryption capability (**[E]**)
 
 If you used the default parameters when generating your key, then that
@@ -192,9 +203,6 @@ for example::
     uid           [ultimate] Alice Dev <adev@kernel.org>
     ssb   rsa2048 2018-01-23 [E] [expires: 2020-01-23]
 
-Any key carrying the **[C]** capability is your master key, regardless
-of any other capabilities it may have assigned to it.
-
 The long line under the ``sec`` entry is your key fingerprint --
 whenever you see ``[fpr]`` in the examples below, that 40-character
 string is what it refers to.
@@ -215,9 +223,9 @@ strong passphrase. To set it or change it, use::
 Create a separate Signing subkey
 --------------------------------
 
-Our goal is to protect your master key by moving it to offline media, so
-if you only have a combined **[SC]** key, then you should create a separate
-signing subkey::
+Our goal is to protect your Certify key by moving it to offline media,
+so if you only have a combined **[SC]** key, then you should create a
+separate signing subkey::
 
     $ gpg --quick-addkey [fpr] ed25519 sign
 
@@ -230,8 +238,8 @@ your new subkey::
 
     GnuPG 2.1 and later has full support for Elliptic Curve
     Cryptography, with ability to combine ECC subkeys with traditional
-    RSA master keys. The main upside of ECC cryptography is that it is
-    much faster computationally and creates much smaller signatures when
+    RSA keys. The main upside of ECC cryptography is that it is much
+    faster computationally and creates much smaller signatures when
     compared byte for byte with 2048+ bit RSA keys. Unless you plan on
     using a smartcard device that does not support ECC operations, we
     recommend that you create an ECC signing subkey for your kernel
@@ -244,8 +252,8 @@ your new subkey::
     "nistp256" instead or "ed25519."
 
 
-Back up your master key for disaster recovery
----------------------------------------------
+Back up your Certify key for disaster recovery
+----------------------------------------------
 
 The more signatures you have on your PGP key from other developers, the
 more reasons you have to create a backup version that lives on something
@@ -300,7 +308,7 @@ will use for backup purposes. You will need to encrypt them using LUKS
 -- refer to your distro's documentation on how to accomplish this.
 
 For the encryption passphrase, you can use the same one as on your
-master key.
+PGP key.
 
 Once the encryption process is over, re-insert the USB drive and make
 sure it gets properly mounted. Copy your entire ``.gnupg`` directory
@@ -319,7 +327,7 @@ far away, because you'll need to use it every now and again for things
 like editing identities, adding or revoking subkeys, or signing other
 people's keys.
 
-Remove the master key from  your homedir
+Remove the Certify key from your homedir
 ----------------------------------------
 
 The files in our home directory are not as well protected as we like to
@@ -334,7 +342,7 @@ think.  They can be leaked or stolen via many different means:
 Protecting your key with a good passphrase greatly helps reduce the risk
 of any of the above, but passphrases can be discovered via keyloggers,
 shoulder-surfing, or any number of other means. For this reason, the
-recommended setup is to remove your master key from your home directory
+recommended setup is to remove your Certify key from your home directory
 and store it on offline storage.
 
 .. warning::
@@ -343,7 +351,7 @@ and store it on offline storage.
     your GnuPG directory in its entirety. What we are about to do will
     render your key useless if you do not have a usable backup!
 
-First, identify the keygrip of your master key::
+First, identify the keygrip of your Certify key::
 
     $ gpg --with-keygrip --list-key [fpr]
 
@@ -359,7 +367,7 @@ The output will be something like this::
           Keygrip = 3333000000000000000000000000000000000000
 
 Find the keygrip entry that is beneath the ``pub`` line (right under the
-master key fingerprint). This will correspond directly to a file in your
+Certify key fingerprint). This will correspond directly to a file in your
 ``~/.gnupg`` directory::
 
     $ cd ~/.gnupg/private-keys-v1.d
@@ -369,13 +377,13 @@ master key fingerprint). This will correspond directly to a file in your
     3333000000000000000000000000000000000000.key
 
 All you have to do is simply remove the .key file that corresponds to
-the master keygrip::
+the Certify key keygrip::
 
     $ cd ~/.gnupg/private-keys-v1.d
     $ rm 1111000000000000000000000000000000000000.key
 
 Now, if you issue the ``--list-secret-keys`` command, it will show that
-the master key is missing (the ``#`` indicates it is not available)::
+the Certify key is missing (the ``#`` indicates it is not available)::
 
     $ gpg --list-secret-keys
     sec#  rsa2048 2018-01-24 [SC] [expires: 2020-01-24]
@@ -404,7 +412,7 @@ file, which still contains your private keys.
 Move the subkeys to a dedicated crypto device
 =============================================
 
-Even though the master key is now safe from being leaked or stolen, the
+Even though the Certify key is now safe from being leaked or stolen, the
 subkeys are still in your home directory. Anyone who manages to get
 their hands on those will be able to decrypt your communication or fake
 your signatures (if they know the passphrase). Furthermore, each time a
@@ -627,10 +635,10 @@ Other common GnuPG operations
 Here is a quick reference for some common operations you'll need to do
 with your PGP key.
 
-Mounting your master key offline storage
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Mounting your safe offline storage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-You will need your master key for any of the operations below, so you
+You will need your Certify key for any of the operations below, so you
 will first need to mount your backup offline storage and tell GnuPG to
 use it::
 
@@ -644,7 +652,7 @@ your regular home directory location).
 Extending key expiration date
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The master key has the default expiration date of 2 years from the date
+The Certify key has the default expiration date of 2 years from the date
 of creation. This is done both for security reasons and to make obsolete
 keys eventually disappear from keyservers.
 
-- 
cgit v1.2.3


From 7d61aa2cbd62d57a6a3ac5a7635686320f5ded9b Mon Sep 17 00:00:00 2001
From: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Date: Mon, 8 Aug 2022 17:31:50 -0400
Subject: maintainer-pgp-guide: remove keyserver instructions

Keyservers are largely a thing of the past with the replacement systems
like keys.openpgp.net specifically designed to offer no support for the
web of trust. Remove all sections that talk about keyservers and add a
small section with the link to kernel.org documentation that talks about
using the kernel.org public key repository.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220727-docs-pgp-guide-v2-2-e3e6954affb6@linuxfoundation.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/maintainer-pgp-guide.rst | 93 ++++----------------------
 1 file changed, 14 insertions(+), 79 deletions(-)

diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index 7dada4eaedca..ead5bc815017 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -121,18 +121,6 @@ edit your ``~/.gnupg/gpg-agent.conf`` file to set your own values::
     to remove anything you had in place for older versions of GnuPG, as
     it may not be doing the right thing any more.
 
-Set up a refresh cronjob
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-You will need to regularly refresh your keyring in order to get the
-latest changes on other people's public keys, which is best done with a
-daily cronjob::
-
-    @daily /usr/bin/gpg2 --refresh >/dev/null 2>&1
-
-Check the full path to your ``gpg`` or ``gpg2`` command and use the
-``gpg2`` command if regular ``gpg`` for you is the legacy GnuPG v.1.
-
 .. _protect_your_key:
 
 Protect your PGP key
@@ -229,11 +217,6 @@ separate signing subkey::
 
     $ gpg --quick-addkey [fpr] ed25519 sign
 
-Remember to tell the keyservers about this change, so others can pull down
-your new subkey::
-
-    $ gpg --send-key [fpr]
-
 .. note:: ECC support in GnuPG
 
     GnuPG 2.1 and later has full support for Elliptic Curve
@@ -907,65 +890,17 @@ the new default in GnuPG v2). To set it, add (or modify) the
 
     trust-model tofu+pgp
 
-How to use keyservers (more) safely
------------------------------------
-
-If you get a "No public key" error when trying to validate someone's
-tag, then you should attempt to lookup that key using a keyserver. It is
-important to keep in mind that there is absolutely no guarantee that the
-key you retrieve from PGP keyservers belongs to the actual person --
-that much is by design. You are supposed to use the Web of Trust to
-establish key validity.
-
-How to properly maintain the Web of Trust is beyond the scope of this
-document, simply because doing it properly requires both effort and
-dedication that tends to be beyond the caring threshold of most human
-beings. Here are some shortcuts that will help you reduce the risk of
-importing a malicious key.
-
-First, let's say you've tried to run ``git verify-tag`` but it returned
-an error saying the key is not found::
-
-    $ git verify-tag sunxi-fixes-for-4.15-2
-    gpg: Signature made Sun 07 Jan 2018 10:51:55 PM EST
-    gpg:                using RSA key DA73759BF8619E484E5A3B47389A54219C0F2430
-    gpg:                issuer "wens@...org"
-    gpg: Can't check signature: No public key
-
-Let's query the keyserver for more info about that key fingerprint (the
-fingerprint probably belongs to a subkey, so we can't use it directly
-without finding out the ID of the master key it is associated with)::
-
-    $ gpg --search DA73759BF8619E484E5A3B47389A54219C0F2430
-    gpg: data source: hkp://keys.gnupg.net
-    (1) Chen-Yu Tsai <wens@...org>
-          4096 bit RSA key C94035C21B4F2AEB, created: 2017-03-14, expires: 2019-03-15
-    Keys 1-1 of 1 for "DA73759BF8619E484E5A3B47389A54219C0F2430".  Enter number(s), N)ext, or Q)uit > q
-
-Locate the ID of the master key in the output, in our example
-``C94035C21B4F2AEB``. Now display the key of Linus Torvalds that you
-have on your keyring::
-
-    $ gpg --list-key torvalds@kernel.org
-    pub   rsa2048 2011-09-20 [SC]
-          ABAF11C65A2970B130ABE3C479BE3E4300411886
-    uid           [ unknown] Linus Torvalds <torvalds@kernel.org>
-    sub   rsa2048 2011-09-20 [E]
-
-Next, find a trust path from Linus Torvalds to the key-id you found via ``gpg
---search`` of the unknown key.  For this, you can use several tools including
-https://github.com/mricon/wotmate,
-https://git.kernel.org/pub/scm/docs/kernel/pgpkeys.git/tree/graphs, and
-https://the.earth.li/~noodles/pathfind.html.
-
-If you get a few decent trust paths, then it's a pretty good indication
-that it is a valid key. You can add it to your keyring from the
-keyserver now::
-
-    $ gpg --recv-key C94035C21B4F2AEB
-
-This process is not perfect, and you are obviously trusting the
-administrators of the PGP Pathfinder service to not be malicious (in
-fact, this goes against :ref:`devs_not_infra`). However, if you
-do not carefully maintain your own web of trust, then it is a marked
-improvement over blindly trusting keyservers.
+Using the kernel.org web of trust repository
+--------------------------------------------
+
+Kernel.org maintains a git repository with developers' public keys as a
+replacement for replicating keyserver networks that have gone mostly
+dark in the past few years. The full documentation for how to set up
+that repository as your source of public keys can be found here:
+
+- `Kernel developer PGP Keyring`_
+
+If you are a kernel developer, please consider submitting your key for
+inclusion into that keyring.
+
+.. _`Kernel developer PGP Keyring`: https://korg.docs.kernel.org/pgpkeys.html
-- 
cgit v1.2.3


From 0a1a279bdade0230f5021fa8d9e8a71d3de2e859 Mon Sep 17 00:00:00 2001
From: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Date: Mon, 8 Aug 2022 17:31:51 -0400
Subject: maintainer-pgp-guide: update ECC support information

Update ECC sections with the latest details, now that Yubikeys are able
to support ED25519 curves. Tweak a few links to smartcard devices to
reflect the latest URL changes.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220727-docs-pgp-guide-v2-3-e3e6954affb6@linuxfoundation.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/maintainer-pgp-guide.rst | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index ead5bc815017..bf288925973e 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -228,11 +228,9 @@ separate signing subkey::
     recommend that you create an ECC signing subkey for your kernel
     work.
 
-    If for some reason you prefer to stay with RSA subkeys, just replace
-    "ed25519" with "rsa2048" in the above command. Additionally, if you
-    plan to use a hardware device that does not support ED25519 ECC
-    keys, like Nitrokey Pro or a Yubikey, then you should use
-    "nistp256" instead or "ed25519."
+    Note, that if you plan to use a hardware device that does not
+    support ED25519 ECC keys, you should choose "nistp256" instead or
+    "ed25519."
 
 
 Back up your Certify key for disaster recovery
@@ -438,7 +436,8 @@ functionality. There are several options available:
 - `Yubikey 5`_: proprietary hardware and software, but cheaper than
   Nitrokey Pro and comes available in the USB-C form that is more useful
   with newer laptops. Offers additional security features such as FIDO
-  U2F, among others, and now finally supports ECC keys (NISTP).
+  U2F, among others, and now finally supports NISTP and ED25519 ECC
+  keys.
 
 `LWN has a good review`_ of some of the above models, as well as several
 others. Your choice will depend on cost, shipping availability in your
@@ -451,7 +450,7 @@ geographical region, and open/proprietary hardware considerations.
     Foundation.
 
 .. _`Nitrokey Start`: https://shop.nitrokey.com/shop/product/nitrokey-start-6
-.. _`Nitrokey Pro 2`: https://shop.nitrokey.com/shop/product/nitrokey-pro-2-3
+.. _`Nitrokey Pro 2`: https://shop.nitrokey.com/shop/product/nkpr2-nitrokey-pro-2-3
 .. _`Yubikey 5`: https://www.yubico.com/products/yubikey-5-overview/
 .. _Gnuk: https://www.fsij.org/doc-gnuk/
 .. _`LWN has a good review`: https://lwn.net/Articles/736231/
-- 
cgit v1.2.3


From 6043134dcedb4fb74e57858bca913a8117478ac0 Mon Sep 17 00:00:00 2001
From: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Date: Mon, 8 Aug 2022 17:31:52 -0400
Subject: maintainer-pgp-guide: add a section on PGP-signed patches

With more developers beginning to use b4 and patatt, add a section to
the guide that talks about setting up and using patatt for PGP-signing
patch submissions.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220727-docs-pgp-guide-v2-4-e3e6954affb6@linuxfoundation.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/maintainer-pgp-guide.rst | 58 ++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index bf288925973e..27c42762edd7 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -675,6 +675,7 @@ remote end.
 
 .. _`Agent Forwarding over SSH`: https://wiki.gnupg.org/AgentForwarding
 
+.. _pgp_with_git:
 
 Using PGP with Git
 ==================
@@ -818,6 +819,63 @@ You can tell git to always sign commits::
 
 .. _verify_identities:
 
+
+How to work with signed patches
+-------------------------------
+
+It is possible to use your PGP key to sign patches sent to kernel
+developer mailing lists. Since existing email signature mechanisms
+(PGP-Mime or PGP-inline) tend to cause problems with regular code
+review tasks, you should use the tool kernel.org created for this
+purpose that puts cryptographic attestation signatures into message
+headers (a-la DKIM):
+
+- `Patatt Patch Attestation`_
+
+.. _`Patatt Patch Attestation`: https://pypi.org/project/patatt/
+
+Installing and configuring patatt
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Patatt is packaged for many distributions already, so please check there
+first. You can also install it from pypi using "``pip install patatt``".
+
+If you already have your PGP key configured with git (via the
+``user.signingKey`` configuration parameter), then patatt requires no
+further configuration. You can start signing your patches by installing
+the git-send-email hook in the repository you want::
+
+    patatt install-hook
+
+Now any patches you send with ``git send-email`` will be automatically
+signed with your cryptographic signature.
+
+Checking patatt signatures
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you are using ``b4`` to retrieve and apply patches, then it will
+automatically attempt to verify all DKIM and patatt signatures it
+encounters, for example::
+
+    $ b4 am 20220720205013.890942-1-broonie@kernel.org
+    [...]
+    Checking attestation on all messages, may take a moment...
+    ---
+      ✓ [PATCH v1 1/3] kselftest/arm64: Correct buffer allocation for SVE Z registers
+      ✓ [PATCH v1 2/3] arm64/sve: Document our actual ABI for clearing registers on syscall
+      ✓ [PATCH v1 3/3] kselftest/arm64: Enforce actual ABI for SVE syscalls
+      ---
+      ✓ Signed: openpgp/broonie@kernel.org
+      ✓ Signed: DKIM/kernel.org
+
+.. note::
+
+    Patatt and b4 are still in active development and you should check
+    the latest documentation for these projects for any new or updated
+    features.
+
+.. _kernel_identities:
+
 How to verify kernel developer identities
 =========================================
 
-- 
cgit v1.2.3


From e72b3b9810dd2cb6d057dce8666e12f4f8f2ccdf Mon Sep 17 00:00:00 2001
From: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Date: Mon, 8 Aug 2022 17:31:53 -0400
Subject: maintainer-pgp-guide: minor wording tweaks

Tweak some wording to remove redundant information.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220727-docs-pgp-guide-v2-5-e3e6954affb6@linuxfoundation.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/maintainer-pgp-guide.rst | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Documentation/process/maintainer-pgp-guide.rst b/Documentation/process/maintainer-pgp-guide.rst
index 27c42762edd7..40bfbd3b7648 100644
--- a/Documentation/process/maintainer-pgp-guide.rst
+++ b/Documentation/process/maintainer-pgp-guide.rst
@@ -266,9 +266,7 @@ home, such as your bank vault.
     Your printer is probably no longer a simple dumb device connected to
     your parallel port, but since the output is still encrypted with
     your passphrase, printing out even to "cloud-integrated" modern
-    printers should remain a relatively safe operation. One option is to
-    change the passphrase on your master key immediately after you are
-    done with paperkey.
+    printers should remain a relatively safe operation.
 
 Back up your whole GnuPG directory
 ----------------------------------
-- 
cgit v1.2.3


From c987918a3fdf2d361c48839ec324f2547418f9d6 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 18 Aug 2022 15:39:03 -0700
Subject: fs-verity: use memcpy_from_page()

Replace extract_hash() with the memcpy_from_page() helper function.

This is simpler, and it has the side effect of replacing the use of
kmap_atomic() with its recommended replacement kmap_local_page().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Link: https://lore.kernel.org/r/20220818223903.43710-1-ebiggers@kernel.org
---
 fs/verity/verify.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 14e2fb49cff5..bde8c9b7d25f 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -39,16 +39,6 @@ static void hash_at_level(const struct merkle_tree_params *params,
 		   (params->log_blocksize - params->log_arity);
 }
 
-/* Extract a hash from a hash page */
-static void extract_hash(struct page *hpage, unsigned int hoffset,
-			 unsigned int hsize, u8 *out)
-{
-	void *virt = kmap_atomic(hpage);
-
-	memcpy(out, virt + hoffset, hsize);
-	kunmap_atomic(virt);
-}
-
 static inline int cmp_hashes(const struct fsverity_info *vi,
 			     const u8 *want_hash, const u8 *real_hash,
 			     pgoff_t index, int level)
@@ -129,7 +119,7 @@ static bool verify_page(struct inode *inode, const struct fsverity_info *vi,
 		}
 
 		if (PageChecked(hpage)) {
-			extract_hash(hpage, hoffset, hsize, _want_hash);
+			memcpy_from_page(_want_hash, hpage, hoffset, hsize);
 			want_hash = _want_hash;
 			put_page(hpage);
 			pr_debug_ratelimited("Hash page already checked, want %s:%*phN\n",
@@ -158,7 +148,7 @@ descend:
 		if (err)
 			goto out;
 		SetPageChecked(hpage);
-		extract_hash(hpage, hoffset, hsize, _want_hash);
+		memcpy_from_page(_want_hash, hpage, hoffset, hsize);
 		want_hash = _want_hash;
 		put_page(hpage);
 		pr_debug("Verified hash page at level %d, now want %s:%*phN\n",
-- 
cgit v1.2.3


From 8377e8a24bba1ae73b3869bc71ee9df16b6bef61 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 18 Aug 2022 15:40:10 -0700
Subject: fs-verity: use kmap_local_page() instead of kmap()

Convert the use of kmap() to its recommended replacement
kmap_local_page().  This avoids the overhead of doing a non-local
mapping, which is unnecessary in this case.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Fabio M. De Francesco <fmdefrancesco@gmail.com>
Link: https://lore.kernel.org/r/20220818224010.43778-1-ebiggers@kernel.org
---
 fs/verity/read_metadata.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index 6ee849dc7bc1..2aefc5565152 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -53,14 +53,14 @@ static int fsverity_read_merkle_tree(struct inode *inode,
 			break;
 		}
 
-		virt = kmap(page);
+		virt = kmap_local_page(page);
 		if (copy_to_user(buf, virt + offs_in_page, bytes_to_copy)) {
-			kunmap(page);
+			kunmap_local(virt);
 			put_page(page);
 			err = -EFAULT;
 			break;
 		}
-		kunmap(page);
+		kunmap_local(virt);
 		put_page(page);
 
 		retval += bytes_to_copy;
-- 
cgit v1.2.3


From d25c6948a6aad787d9fd64de6b5362c3f23cc8d0 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Tue, 2 Aug 2022 09:18:47 -0700
Subject: RAS/CEC: Reduce offline page threshold for Intel systems

A large scale study of memory errors on Intel systems in data centers
showed that aggressively taking pages with corrected errors offline is
the best strategy of using corrected errors as a predictor of future
uncorrected errors.

Set the threshold to "2" on Intel systems. AMD guidance is that this is
not necessary for their systems.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Link: https://lore.kernel.org/r/20220607212015.175591-1-tony.luck@intel.com
Link: https://lore.kernel.org/r/YulOZ/Eso0bwUcC4@agluck-desk3.sc.intel.com
---
 drivers/ras/cec.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
index 42f2fc0bc8a9..321af498ee11 100644
--- a/drivers/ras/cec.c
+++ b/drivers/ras/cec.c
@@ -556,6 +556,14 @@ static int __init cec_init(void)
 	if (ce_arr.disabled)
 		return -ENODEV;
 
+	/*
+	 * Intel systems may avoid uncorrectable errors
+	 * if pages with corrected errors are aggressively
+	 * taken offline.
+	 */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		action_threshold = 2;
+
 	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
 	if (!ce_arr.array) {
 		pr_err("Error allocating CE array page!\n");
-- 
cgit v1.2.3


From 272ac1500372183ffd54b0c9f43f52afc482e610 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 15 Aug 2022 11:45:01 -0700
Subject: fscrypt: remove fscrypt_set_test_dummy_encryption()

Now that all its callers have been converted to
fscrypt_parse_test_dummy_encryption() and fscrypt_add_test_dummy_key()
instead, fscrypt_set_test_dummy_encryption() can be removed.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220513231605.175121-6-ebiggers@kernel.org
---
 fs/crypto/policy.c      | 13 -------------
 include/linux/fscrypt.h |  2 --
 2 files changed, 15 deletions(-)

diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 80b8ca0f340b..55d57181cd9e 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -833,19 +833,6 @@ bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
 }
 EXPORT_SYMBOL_GPL(fscrypt_dummy_policies_equal);
 
-/* Deprecated, do not use */
-int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
-				      struct fscrypt_dummy_policy *dummy_policy)
-{
-	struct fs_parameter param = {
-		.type = fs_value_is_string,
-		.string = arg ? (char *)arg : "",
-	};
-	return fscrypt_parse_test_dummy_encryption(&param, dummy_policy) ?:
-		fscrypt_add_test_dummy_key(sb, dummy_policy);
-}
-EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
-
 /**
  * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption'
  * @seq: the seq_file to print the option to
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 7d2f1e0f23b1..b95b8601b9c1 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -295,8 +295,6 @@ int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
 				    struct fscrypt_dummy_policy *dummy_policy);
 bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
 				  const struct fscrypt_dummy_policy *p2);
-int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
-				struct fscrypt_dummy_policy *dummy_policy);
 void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
 					struct super_block *sb);
 static inline bool
-- 
cgit v1.2.3


From 0de1978852dff8643a80f20a8f44cef3f16c2db0 Mon Sep 17 00:00:00 2001
From: Hongbin Wang <wh_bin@126.com>
Date: Sun, 21 Aug 2022 23:41:47 -0400
Subject: xfrm: Drop unused argument

Drop unused argument from xfrm_policy_match,
__xfrm_policy_eval_candidates and xfrm_policy_eval_candidates.
No functional changes intended.

Signed-off-by: Hongbin Wang <wh_bin@126.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f1a0bab920a5..6264680b1f08 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1889,7 +1889,7 @@ EXPORT_SYMBOL(xfrm_policy_walk_done);
  */
 static int xfrm_policy_match(const struct xfrm_policy *pol,
 			     const struct flowi *fl,
-			     u8 type, u16 family, int dir, u32 if_id)
+			     u8 type, u16 family, u32 if_id)
 {
 	const struct xfrm_selector *sel = &pol->selector;
 	int ret = -ESRCH;
@@ -2014,7 +2014,7 @@ static struct xfrm_policy *
 __xfrm_policy_eval_candidates(struct hlist_head *chain,
 			      struct xfrm_policy *prefer,
 			      const struct flowi *fl,
-			      u8 type, u16 family, int dir, u32 if_id)
+			      u8 type, u16 family, u32 if_id)
 {
 	u32 priority = prefer ? prefer->priority : ~0u;
 	struct xfrm_policy *pol;
@@ -2028,7 +2028,7 @@ __xfrm_policy_eval_candidates(struct hlist_head *chain,
 		if (pol->priority > priority)
 			break;
 
-		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+		err = xfrm_policy_match(pol, fl, type, family, if_id);
 		if (err) {
 			if (err != -ESRCH)
 				return ERR_PTR(err);
@@ -2053,7 +2053,7 @@ static struct xfrm_policy *
 xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
 			    struct xfrm_policy *prefer,
 			    const struct flowi *fl,
-			    u8 type, u16 family, int dir, u32 if_id)
+			    u8 type, u16 family, u32 if_id)
 {
 	struct xfrm_policy *tmp;
 	int i;
@@ -2061,8 +2061,7 @@ xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand,
 	for (i = 0; i < ARRAY_SIZE(cand->res); i++) {
 		tmp = __xfrm_policy_eval_candidates(cand->res[i],
 						    prefer,
-						    fl, type, family, dir,
-						    if_id);
+						    fl, type, family, if_id);
 		if (!tmp)
 			continue;
 
@@ -2101,7 +2100,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 
 	ret = NULL;
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
-		err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
+		err = xfrm_policy_match(pol, fl, type, family, if_id);
 		if (err) {
 			if (err == -ESRCH)
 				continue;
@@ -2120,7 +2119,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 		goto skip_inexact;
 
 	pol = xfrm_policy_eval_candidates(&cand, ret, fl, type,
-					  family, dir, if_id);
+					  family, if_id);
 	if (pol) {
 		ret = pol;
 		if (IS_ERR(pol))
-- 
cgit v1.2.3


From 5926586f291b53cb8a0c9631fc19489be1186e2d Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.ibm.com>
Date: Wed, 17 Aug 2022 17:18:42 -0400
Subject: ima: fix blocking of security.ima xattrs of unsupported algorithms

Limit validating the hash algorithm to just security.ima xattr, not
the security.evm xattr or any of the protected EVM security xattrs,
nor posix acls.

Fixes: 50f742dd9147 ("IMA: block writes of the security.ima xattr with unsupported algorithms")
Reported-by: Christian Brauner <brauner@kernel.org>
Acked-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 security/integrity/ima/ima_appraise.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c
index bde74fcecee3..3e0fbbd99534 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -750,22 +750,26 @@ int ima_inode_setxattr(struct dentry *dentry, const char *xattr_name,
 	const struct evm_ima_xattr_data *xvalue = xattr_value;
 	int digsig = 0;
 	int result;
+	int err;
 
 	result = ima_protect_xattr(dentry, xattr_name, xattr_value,
 				   xattr_value_len);
 	if (result == 1) {
 		if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST))
 			return -EINVAL;
+
+		err = validate_hash_algo(dentry, xvalue, xattr_value_len);
+		if (err)
+			return err;
+
 		digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG);
 	} else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) {
 		digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG);
 	}
 	if (result == 1 || evm_revalidate_status(xattr_name)) {
-		result = validate_hash_algo(dentry, xvalue, xattr_value_len);
-		if (result)
-			return result;
-
 		ima_reset_appraise_flags(d_backing_inode(dentry), digsig);
+		if (result == 1)
+			result = 0;
 	}
 	return result;
 }
-- 
cgit v1.2.3


From 45e9aa1fdbb2ebafec88c64bc53fe45cf8935b49 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 10 Aug 2022 18:14:22 +0200
Subject: ACPI: Rename acpi_bus_get/put_acpi_device()

Because acpi_bus_get_acpi_device() is completely analogous to
acpi_fetch_acpi_dev(), rename it to acpi_get_acpi_dev() and
add a kerneldoc comment to it.

Accordingly, rename acpi_bus_put_acpi_device() to acpi_put_acpi_dev()
and update all of the users of these two functions.

While at it, move the acpi_fetch_acpi_dev() header next to the
acpi_get_acpi_dev() header in the header file holding them.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
---
 drivers/acpi/bus.c               |  6 +++---
 drivers/acpi/device_pm.c         |  4 ++--
 drivers/acpi/irq.c               |  4 ++--
 drivers/acpi/scan.c              | 21 ++++++++++++++++-----
 drivers/hwmon/acpi_power_meter.c |  2 +-
 include/acpi/acpi_bus.h          |  6 +++---
 6 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index c0d20d997891..f43c6a4b0827 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -511,7 +511,7 @@ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data)
 		break;
 	}
 
-	adev = acpi_bus_get_acpi_device(handle);
+	adev = acpi_get_acpi_dev(handle);
 	if (!adev)
 		goto err;
 
@@ -524,14 +524,14 @@ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data)
 	}
 
 	if (!hotplug_event) {
-		acpi_bus_put_acpi_device(adev);
+		acpi_put_acpi_dev(adev);
 		return;
 	}
 
 	if (ACPI_SUCCESS(acpi_hotplug_schedule(adev, type)))
 		return;
 
-	acpi_bus_put_acpi_device(adev);
+	acpi_put_acpi_dev(adev);
 
  err:
 	acpi_evaluate_ost(handle, type, ost_code, NULL);
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 9dce1245689c..2b7e08d54c3f 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -497,7 +497,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used)
 
 	acpi_handle_debug(handle, "Wake notify\n");
 
-	adev = acpi_bus_get_acpi_device(handle);
+	adev = acpi_get_acpi_dev(handle);
 	if (!adev)
 		return;
 
@@ -515,7 +515,7 @@ static void acpi_pm_notify_handler(acpi_handle handle, u32 val, void *not_used)
 
 	mutex_unlock(&acpi_pm_notifier_lock);
 
-	acpi_bus_put_acpi_device(adev);
+	acpi_put_acpi_dev(adev);
 }
 
 /**
diff --git a/drivers/acpi/irq.c b/drivers/acpi/irq.c
index dabe45eba055..4db5bb587599 100644
--- a/drivers/acpi/irq.c
+++ b/drivers/acpi/irq.c
@@ -118,12 +118,12 @@ acpi_get_irq_source_fwhandle(const struct acpi_resource_source *source,
 	if (WARN_ON(ACPI_FAILURE(status)))
 		return NULL;
 
-	device = acpi_bus_get_acpi_device(handle);
+	device = acpi_get_acpi_dev(handle);
 	if (WARN_ON(!device))
 		return NULL;
 
 	result = &device->fwnode;
-	acpi_bus_put_acpi_device(device);
+	acpi_put_acpi_dev(device);
 	return result;
 }
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 42cec8120f18..b3ee0823f735 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -429,7 +429,7 @@ void acpi_device_hotplug(struct acpi_device *adev, u32 src)
 	acpi_evaluate_ost(adev->handle, src, ost_code, NULL);
 
  out:
-	acpi_bus_put_acpi_device(adev);
+	acpi_put_acpi_dev(adev);
 	mutex_unlock(&acpi_scan_lock);
 	unlock_device_hotplug();
 }
@@ -599,11 +599,22 @@ static void get_acpi_device(void *dev)
 	acpi_dev_get(dev);
 }
 
-struct acpi_device *acpi_bus_get_acpi_device(acpi_handle handle)
+/**
+ * acpi_get_acpi_dev - Retrieve ACPI device object and reference count it.
+ * @handle: ACPI handle associated with the requested ACPI device object.
+ *
+ * Return a pointer to the ACPI device object associated with @handle and bump
+ * up that object's reference counter (under the ACPI Namespace lock), if
+ * present, or return NULL otherwise.
+ *
+ * The ACPI device object reference acquired by this function needs to be
+ * dropped via acpi_dev_put().
+ */
+struct acpi_device *acpi_get_acpi_dev(acpi_handle handle)
 {
 	return handle_to_device(handle, get_acpi_device);
 }
-EXPORT_SYMBOL_GPL(acpi_bus_get_acpi_device);
+EXPORT_SYMBOL_GPL(acpi_get_acpi_dev);
 
 static struct acpi_device_bus_id *acpi_device_bus_id_match(const char *dev_id)
 {
@@ -2239,7 +2250,7 @@ static int acpi_dev_get_first_consumer_dev_cb(struct acpi_dep_data *dep, void *d
 {
 	struct acpi_device *adev;
 
-	adev = acpi_bus_get_acpi_device(dep->consumer);
+	adev = acpi_get_acpi_dev(dep->consumer);
 	if (adev) {
 		*(struct acpi_device **)data = adev;
 		return 1;
@@ -2292,7 +2303,7 @@ static bool acpi_scan_clear_dep_queue(struct acpi_device *adev)
 
 static int acpi_scan_clear_dep(struct acpi_dep_data *dep, void *data)
 {
-	struct acpi_device *adev = acpi_bus_get_acpi_device(dep->consumer);
+	struct acpi_device *adev = acpi_get_acpi_dev(dep->consumer);
 
 	if (adev) {
 		adev->dep_unmet--;
diff --git a/drivers/hwmon/acpi_power_meter.c b/drivers/hwmon/acpi_power_meter.c
index d2545a1be9fc..44e04c75d0d3 100644
--- a/drivers/hwmon/acpi_power_meter.c
+++ b/drivers/hwmon/acpi_power_meter.c
@@ -598,7 +598,7 @@ static int read_domain_devices(struct acpi_power_meter_resource *resource)
 			continue;
 
 		/* Create a symlink to domain objects */
-		obj = acpi_bus_get_acpi_device(element->reference.handle);
+		obj = acpi_get_acpi_dev(element->reference.handle);
 		resource->domain_devices[i] = obj;
 		if (!obj)
 			continue;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index e7d27373ff71..7ff067a5a3bd 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -512,7 +512,6 @@ extern int unregister_acpi_notifier(struct notifier_block *);
  * External Functions
  */
 
-struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle);
 acpi_status acpi_bus_get_status_handle(acpi_handle handle,
 				       unsigned long long *sta);
 int acpi_bus_get_status(struct acpi_device *device);
@@ -767,9 +766,10 @@ static inline void acpi_dev_put(struct acpi_device *adev)
 		put_device(&adev->dev);
 }
 
-struct acpi_device *acpi_bus_get_acpi_device(acpi_handle handle);
+struct acpi_device *acpi_fetch_acpi_dev(acpi_handle handle);
+struct acpi_device *acpi_get_acpi_dev(acpi_handle handle);
 
-static inline void acpi_bus_put_acpi_device(struct acpi_device *adev)
+static inline void acpi_put_acpi_dev(struct acpi_device *adev)
 {
 	acpi_dev_put(adev);
 }
-- 
cgit v1.2.3


From f6f1e12f3add6e9d85d9fa1916bf4b2a39d8c194 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 10 Aug 2022 18:15:24 +0200
Subject: ACPI: scan: Rename acpi_bus_get_parent() and rearrange it

The acpi_bus_get_parent() name doesn't really reflect the
purpose of the function so change it to a more accurate
acpi_find_parent_acpi_dev().

While at it, rearrange the code inside that function to make it
easier to read.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
---
 drivers/acpi/scan.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index b3ee0823f735..25a104d0b743 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -816,10 +816,9 @@ static const char * const acpi_honor_dep_ids[] = {
 	NULL
 };
 
-static struct acpi_device *acpi_bus_get_parent(acpi_handle handle)
+static struct acpi_device *acpi_find_parent_acpi_dev(acpi_handle handle)
 {
-	struct acpi_device *device;
-	acpi_status status;
+	struct acpi_device *adev;
 
 	/*
 	 * Fixed hardware devices do not appear in the namespace and do not
@@ -830,13 +829,18 @@ static struct acpi_device *acpi_bus_get_parent(acpi_handle handle)
 		return acpi_root;
 
 	do {
+		acpi_status status;
+
 		status = acpi_get_parent(handle, &handle);
-		if (ACPI_FAILURE(status))
-			return status == AE_NULL_ENTRY ? NULL : acpi_root;
+		if (ACPI_FAILURE(status)) {
+			if (status != AE_NULL_ENTRY)
+				return acpi_root;
 
-		device = acpi_fetch_acpi_dev(handle);
-	} while (!device);
-	return device;
+			return NULL;
+		}
+		adev = acpi_fetch_acpi_dev(handle);
+	} while (!adev);
+	return adev;
 }
 
 acpi_status
@@ -1778,7 +1782,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 	INIT_LIST_HEAD(&device->pnp.ids);
 	device->device_type = type;
 	device->handle = handle;
-	device->parent = acpi_bus_get_parent(handle);
+	device->parent = acpi_find_parent_acpi_dev(handle);
 	fwnode_init(&device->fwnode, &acpi_device_fwnode_ops);
 	acpi_set_device_status(device, ACPI_STA_DEFAULT);
 	acpi_device_get_busid(device);
-- 
cgit v1.2.3


From 5c5e1237032aaa39107e2d0bb8e6cb84b3c41161 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 10 Aug 2022 18:16:33 +0200
Subject: ACPI: scan: Rearrange initialization of ACPI device objects

The initialization of ACPI device objects is split between
acpi_init_device_object() and __acpi_device_add() that initializes
the dev field in struct acpi_device.  The "release" function pointer
is passed to __acpi_device_add() for this reason.

However, that split is artificial and all of the initialization can
be carried out by acpi_init_device_object(), so rearrange the code
to that end.  In particular, make acpi_init_device_object() take the
"release" pointer as an argument, along with the "type" which is
related to it, instead of __acpi_device_add().

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
---
 drivers/acpi/internal.h |  5 ++---
 drivers/acpi/power.c    |  5 +++--
 drivers/acpi/scan.c     | 27 ++++++++++++++-------------
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 628bf8f18130..b799585eae2a 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -102,10 +102,9 @@ struct acpi_device_bus_id {
 	struct list_head node;
 };
 
-int acpi_device_add(struct acpi_device *device,
-		    void (*release)(struct device *));
 void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
-			     int type);
+			     int type, void (*release)(struct device *));
+int acpi_device_add(struct acpi_device *device);
 int acpi_device_setup_files(struct acpi_device *dev);
 void acpi_device_remove_files(struct acpi_device *dev);
 void acpi_device_add_finalize(struct acpi_device *device);
diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c
index 8c4a73a1351e..2ae48ab88396 100644
--- a/drivers/acpi/power.c
+++ b/drivers/acpi/power.c
@@ -944,7 +944,8 @@ struct acpi_device *acpi_add_power_resource(acpi_handle handle)
 		return NULL;
 
 	device = &resource->device;
-	acpi_init_device_object(device, handle, ACPI_BUS_TYPE_POWER);
+	acpi_init_device_object(device, handle, ACPI_BUS_TYPE_POWER,
+				acpi_release_power_resource);
 	mutex_init(&resource->resource_lock);
 	INIT_LIST_HEAD(&resource->list_node);
 	INIT_LIST_HEAD(&resource->dependents);
@@ -968,7 +969,7 @@ struct acpi_device *acpi_add_power_resource(acpi_handle handle)
 	pr_info("%s [%s]\n", acpi_device_name(device), acpi_device_bid(device));
 
 	device->flags.match_driver = true;
-	result = acpi_device_add(device, acpi_release_power_resource);
+	result = acpi_device_add(device);
 	if (result)
 		goto err;
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 25a104d0b743..75a32f2d0f33 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -673,8 +673,7 @@ static void acpi_store_pld_crc(struct acpi_device *adev)
 	ACPI_FREE(pld);
 }
 
-static int __acpi_device_add(struct acpi_device *device,
-			     void (*release)(struct device *))
+static int __acpi_device_add(struct acpi_device *device)
 {
 	struct acpi_device_bus_id *acpi_device_bus_id;
 	int result;
@@ -730,11 +729,6 @@ static int __acpi_device_add(struct acpi_device *device,
 
 	mutex_unlock(&acpi_device_lock);
 
-	if (device->parent)
-		device->dev.parent = &device->parent->dev;
-
-	device->dev.bus = &acpi_bus_type;
-	device->dev.release = release;
 	result = device_add(&device->dev);
 	if (result) {
 		dev_err(&device->dev, "Error registering device\n");
@@ -761,7 +755,7 @@ err_unlock:
 	return result;
 }
 
-int acpi_device_add(struct acpi_device *adev, void (*release)(struct device *))
+int acpi_device_add(struct acpi_device *adev)
 {
 	int ret;
 
@@ -769,7 +763,7 @@ int acpi_device_add(struct acpi_device *adev, void (*release)(struct device *))
 	if (ret)
 		return ret;
 
-	return __acpi_device_add(adev, release);
+	return __acpi_device_add(adev);
 }
 
 /* --------------------------------------------------------------------------
@@ -1777,12 +1771,19 @@ static bool acpi_device_enumeration_by_parent(struct acpi_device *device)
 }
 
 void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
-			     int type)
+			     int type, void (*release)(struct device *))
 {
+	struct acpi_device *parent = acpi_find_parent_acpi_dev(handle);
+
 	INIT_LIST_HEAD(&device->pnp.ids);
 	device->device_type = type;
 	device->handle = handle;
-	device->parent = acpi_find_parent_acpi_dev(handle);
+	if (parent) {
+		device->parent = parent;
+		device->dev.parent = &parent->dev;
+	}
+	device->dev.release = release;
+	device->dev.bus = &acpi_bus_type;
 	fwnode_init(&device->fwnode, &acpi_device_fwnode_ops);
 	acpi_set_device_status(device, ACPI_STA_DEFAULT);
 	acpi_device_get_busid(device);
@@ -1836,7 +1837,7 @@ static int acpi_add_single_object(struct acpi_device **child,
 	if (!device)
 		return -ENOMEM;
 
-	acpi_init_device_object(device, handle, type);
+	acpi_init_device_object(device, handle, type, acpi_device_release);
 	/*
 	 * Getting the status is delayed till here so that we can call
 	 * acpi_bus_get_status() and use its quirk handling.  Note that
@@ -1866,7 +1867,7 @@ static int acpi_add_single_object(struct acpi_device **child,
 		mutex_unlock(&acpi_dep_list_lock);
 
 	if (!result)
-		result = __acpi_device_add(device, acpi_device_release);
+		result = __acpi_device_add(device);
 
 	if (result) {
 		acpi_device_release(&device->dev);
-- 
cgit v1.2.3


From 6e1850b2f3747942d3813a2fde82f1e46aa593d1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 10 Aug 2022 18:17:23 +0200
Subject: ACPI: scan: Eliminate __acpi_device_add()

Instead of having acpi_device_add() defined as a wrapper around
__acpi_device_add(), export acpi_tie_acpi_dev() so it can be called
directly by acpi_add_power_resource(), fold acpi_device_add() into the
latter and rename __acpi_device_add() to acpi_device_add().

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
---
 drivers/acpi/internal.h |  1 +
 drivers/acpi/power.c    |  6 +++++-
 drivers/acpi/scan.c     | 17 +++--------------
 3 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index b799585eae2a..219c02df9a08 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -104,6 +104,7 @@ struct acpi_device_bus_id {
 
 void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 			     int type, void (*release)(struct device *));
+int acpi_tie_acpi_dev(struct acpi_device *adev);
 int acpi_device_add(struct acpi_device *device);
 int acpi_device_setup_files(struct acpi_device *dev);
 void acpi_device_remove_files(struct acpi_device *dev);
diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c
index 2ae48ab88396..f2588aba8421 100644
--- a/drivers/acpi/power.c
+++ b/drivers/acpi/power.c
@@ -952,6 +952,7 @@ struct acpi_device *acpi_add_power_resource(acpi_handle handle)
 	strcpy(acpi_device_name(device), ACPI_POWER_DEVICE_NAME);
 	strcpy(acpi_device_class(device), ACPI_POWER_CLASS);
 	device->power.state = ACPI_STATE_UNKNOWN;
+	device->flags.match_driver = true;
 
 	/* Evaluate the object to get the system level and resource order. */
 	status = acpi_evaluate_object(handle, NULL, NULL, &buffer);
@@ -968,7 +969,10 @@ struct acpi_device *acpi_add_power_resource(acpi_handle handle)
 
 	pr_info("%s [%s]\n", acpi_device_name(device), acpi_device_bid(device));
 
-	device->flags.match_driver = true;
+	result = acpi_tie_acpi_dev(device);
+	if (result)
+		goto err;
+
 	result = acpi_device_add(device);
 	if (result)
 		goto err;
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 75a32f2d0f33..0d9c350608bf 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -643,7 +643,7 @@ static int acpi_device_set_name(struct acpi_device *device,
 	return 0;
 }
 
-static int acpi_tie_acpi_dev(struct acpi_device *adev)
+int acpi_tie_acpi_dev(struct acpi_device *adev)
 {
 	acpi_handle handle = adev->handle;
 	acpi_status status;
@@ -673,7 +673,7 @@ static void acpi_store_pld_crc(struct acpi_device *adev)
 	ACPI_FREE(pld);
 }
 
-static int __acpi_device_add(struct acpi_device *device)
+int acpi_device_add(struct acpi_device *device)
 {
 	struct acpi_device_bus_id *acpi_device_bus_id;
 	int result;
@@ -755,17 +755,6 @@ err_unlock:
 	return result;
 }
 
-int acpi_device_add(struct acpi_device *adev)
-{
-	int ret;
-
-	ret = acpi_tie_acpi_dev(adev);
-	if (ret)
-		return ret;
-
-	return __acpi_device_add(adev);
-}
-
 /* --------------------------------------------------------------------------
                                  Device Enumeration
    -------------------------------------------------------------------------- */
@@ -1867,7 +1856,7 @@ static int acpi_add_single_object(struct acpi_device **child,
 		mutex_unlock(&acpi_dep_list_lock);
 
 	if (!result)
-		result = __acpi_device_add(device);
+		result = acpi_device_add(device);
 
 	if (result) {
 		acpi_device_release(&device->dev);
-- 
cgit v1.2.3


From 0efe92b47b9d5e8f31dcb34fd6aff89c7c490ea5 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Thu, 18 Aug 2022 08:51:56 +0300
Subject: ACPI: PM: Fix acpi_dev_state_d0() kerneldoc

The documentation for acpi_dev_state_d0() referred to
Documentation/firmware-guide/acpi/low-power-probe.rst that does not exist,
the right file name is Documentation/firmware-guide/acpi/non-d0-probe.rst.
Fix this.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/device_pm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 9dce1245689c..0d1f911f14e9 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -1460,7 +1460,7 @@ EXPORT_SYMBOL_GPL(acpi_storage_d3);
  * not valid to ask for the ACPI power state of the device in that time frame.
  *
  * This function is intended to be used in a driver's probe or remove
- * function. See Documentation/firmware-guide/acpi/low-power-probe.rst for
+ * function. See Documentation/firmware-guide/acpi/non-d0-probe.rst for
  * more information.
  */
 bool acpi_dev_state_d0(struct device *dev)
-- 
cgit v1.2.3


From b75d2cd06b33956b7ec35c6316e717c25a196ee5 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 22:59:55 +0200
Subject: ACPI: move from strlcpy() with unused retval to strscpy()

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c            | 4 ++--
 drivers/acpi/processor_idle.c | 8 ++++----
 drivers/acpi/utils.c          | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index c0d20d997891..1d29f5dc7d79 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -802,7 +802,7 @@ static bool acpi_of_modalias(struct acpi_device *adev,
 
 	str = obj->string.pointer;
 	chr = strchr(str, ',');
-	strlcpy(modalias, chr ? chr + 1 : str, len);
+	strscpy(modalias, chr ? chr + 1 : str, len);
 
 	return true;
 }
@@ -822,7 +822,7 @@ void acpi_set_modalias(struct acpi_device *adev, const char *default_id,
 		       char *modalias, size_t len)
 {
 	if (!acpi_of_modalias(adev, modalias, len))
-		strlcpy(modalias, default_id, len);
+		strscpy(modalias, default_id, len);
 }
 EXPORT_SYMBOL_GPL(acpi_set_modalias);
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 16a1663d02d4..1778016ea895 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -787,7 +787,7 @@ static int acpi_processor_setup_cstates(struct acpi_processor *pr)
 
 		state = &drv->states[count];
 		snprintf(state->name, CPUIDLE_NAME_LEN, "C%d", i);
-		strlcpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
+		strscpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
 		state->exit_latency = cx->latency;
 		state->target_residency = cx->latency * latency_factor;
 		state->enter = acpi_idle_enter;
@@ -956,7 +956,7 @@ static int acpi_processor_evaluate_lpi(acpi_handle handle,
 
 		obj = pkg_elem + 9;
 		if (obj->type == ACPI_TYPE_STRING)
-			strlcpy(lpi_state->desc, obj->string.pointer,
+			strscpy(lpi_state->desc, obj->string.pointer,
 				ACPI_CX_DESC_LEN);
 
 		lpi_state->index = state_idx;
@@ -1022,7 +1022,7 @@ static bool combine_lpi_states(struct acpi_lpi_state *local,
 	result->arch_flags = parent->arch_flags;
 	result->index = parent->index;
 
-	strlcpy(result->desc, local->desc, ACPI_CX_DESC_LEN);
+	strscpy(result->desc, local->desc, ACPI_CX_DESC_LEN);
 	strlcat(result->desc, "+", ACPI_CX_DESC_LEN);
 	strlcat(result->desc, parent->desc, ACPI_CX_DESC_LEN);
 	return true;
@@ -1196,7 +1196,7 @@ static int acpi_processor_setup_lpi_states(struct acpi_processor *pr)
 
 		state = &drv->states[i];
 		snprintf(state->name, CPUIDLE_NAME_LEN, "LPI-%d", i);
-		strlcpy(state->desc, lpi->desc, CPUIDLE_DESC_LEN);
+		strscpy(state->desc, lpi->desc, CPUIDLE_DESC_LEN);
 		state->exit_latency = lpi->wake_latency;
 		state->target_residency = lpi->min_residency;
 		if (lpi->arch_flags)
diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 5a7b8065e77f..4acd6f7d1395 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -878,7 +878,7 @@ bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
 	struct acpi_dev_match_info match = {};
 	struct device *dev;
 
-	strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id));
+	strscpy(match.hid[0].id, hid, sizeof(match.hid[0].id));
 	match.uid = uid;
 	match.hrv = hrv;
 
@@ -911,7 +911,7 @@ acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const cha
 	struct acpi_dev_match_info match = {};
 	struct device *dev;
 
-	strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id));
+	strscpy(match.hid[0].id, hid, sizeof(match.hid[0].id));
 	match.uid = uid;
 	match.hrv = hrv;
 
@@ -961,7 +961,7 @@ EXPORT_SYMBOL(acpi_video_backlight_string);
 
 static int __init acpi_backlight(char *str)
 {
-	strlcpy(acpi_video_backlight_string, str,
+	strscpy(acpi_video_backlight_string, str,
 		sizeof(acpi_video_backlight_string));
 	return 1;
 }
-- 
cgit v1.2.3


From 6d5afdc97ea71958287364a1f1d07e59ef151b11 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Tue, 16 Aug 2022 14:06:29 +0100
Subject: cpufreq: schedutil: Move max CPU capacity to sugov_policy

There is no need to keep the max CPU capacity in the per_cpu instance.
Furthermore, there is no need to check and update that variable
(sg_cpu->max) every time in the frequency change request, which is part
of hot path. Instead use struct sugov_policy to store that information.
Initialize the max CPU capacity during the setup and start callback.
We can do that since all CPUs in the same frequency domain have the same
max capacity (capacity setup and thermal pressure are based on that).

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..9161d1136d01 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -25,6 +25,9 @@ struct sugov_policy {
 	unsigned int		next_freq;
 	unsigned int		cached_raw_freq;
 
+	/* max CPU capacity, which is equal for all CPUs in freq. domain */
+	unsigned long		max;
+
 	/* The next fields are only needed if fast switch cannot be used: */
 	struct			irq_work irq_work;
 	struct			kthread_work work;
@@ -48,7 +51,6 @@ struct sugov_cpu {
 
 	unsigned long		util;
 	unsigned long		bw_dl;
-	unsigned long		max;
 
 	/* The field below is for single-CPU policies only: */
 #ifdef CONFIG_NO_HZ_COMMON
@@ -158,7 +160,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-	sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
 	sg_cpu->bw_dl = cpu_bw_dl(rq);
 	sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
 					  FREQUENCY_UTIL, NULL);
@@ -253,6 +254,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
  */
 static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
 {
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	unsigned long boost;
 
 	/* No boost currently required */
@@ -280,7 +282,8 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
 	 * sg_cpu->util is already in capacity scale; convert iowait_boost
 	 * into the same scale so we can compare.
 	 */
-	boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+	boost = sg_cpu->iowait_boost * sg_policy->max;
+	boost >>= SCHED_CAPACITY_SHIFT;
 	boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
 	if (sg_cpu->util < boost)
 		sg_cpu->util = boost;
@@ -337,7 +340,7 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
 	if (!sugov_update_single_common(sg_cpu, time, flags))
 		return;
 
-	next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+	next_f = get_next_freq(sg_policy, sg_cpu->util, sg_policy->max);
 	/*
 	 * Do not reduce the frequency if the CPU has not been idle
 	 * recently, as the reduction is likely to be premature then.
@@ -373,6 +376,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 				     unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	unsigned long prev_util = sg_cpu->util;
 
 	/*
@@ -399,7 +403,8 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 		sg_cpu->util = prev_util;
 
 	cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
-				   map_util_perf(sg_cpu->util), sg_cpu->max);
+				   map_util_perf(sg_cpu->util),
+				   sg_policy->max);
 
 	sg_cpu->sg_policy->last_freq_update_time = time;
 }
@@ -408,25 +413,19 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 {
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	struct cpufreq_policy *policy = sg_policy->policy;
-	unsigned long util = 0, max = 1;
+	unsigned long util = 0;
 	unsigned int j;
 
 	for_each_cpu(j, policy->cpus) {
 		struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
-		unsigned long j_util, j_max;
 
 		sugov_get_util(j_sg_cpu);
 		sugov_iowait_apply(j_sg_cpu, time);
-		j_util = j_sg_cpu->util;
-		j_max = j_sg_cpu->max;
 
-		if (j_util * max > j_max * util) {
-			util = j_util;
-			max = j_max;
-		}
+		util = max(j_sg_cpu->util, util);
 	}
 
-	return get_next_freq(sg_policy, util, max);
+	return get_next_freq(sg_policy, util, sg_policy->max);
 }
 
 static void
@@ -752,7 +751,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy = policy->governor_data;
 	void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
-	unsigned int cpu;
+	unsigned int cpu = cpumask_first(policy->cpus);
 
 	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
 	sg_policy->last_freq_update_time	= 0;
@@ -760,6 +759,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 	sg_policy->work_in_progress		= false;
 	sg_policy->limits_changed		= false;
 	sg_policy->cached_raw_freq		= 0;
+	sg_policy->max				= arch_scale_cpu_capacity(cpu);
 
 	sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
 
-- 
cgit v1.2.3


From 30ea3257e8766027c4d8d609dcbd256ff9a76073 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:13 -0400
Subject: fs: dlm: fix race in lowcomms

This patch fixes a race between queue_work() in
_dlm_lowcomms_commit_msg() and srcu_read_unlock(). The queue_work() can
take the final reference of a dlm_msg and so msg->idx can contain
garbage which is signaled by the following warning:

[  676.237050] ------------[ cut here ]------------
[  676.237052] WARNING: CPU: 0 PID: 1060 at include/linux/srcu.h:189 dlm_lowcomms_commit_msg+0x41/0x50
[  676.238945] Modules linked in: dlm_locktorture torture rpcsec_gss_krb5 intel_rapl_msr intel_rapl_common iTCO_wdt iTCO_vendor_support qxl kvm_intel drm_ttm_helper vmw_vsock_virtio_transport kvm vmw_vsock_virtio_transport_common ttm irqbypass crc32_pclmul joydev crc32c_intel serio_raw drm_kms_helper vsock virtio_scsi virtio_console virtio_balloon snd_pcm drm syscopyarea sysfillrect sysimgblt snd_timer fb_sys_fops i2c_i801 lpc_ich snd i2c_smbus soundcore pcspkr
[  676.244227] CPU: 0 PID: 1060 Comm: lock_torture_wr Not tainted 5.19.0-rc3+ #1546
[  676.245216] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.16.0-2.module+el8.7.0+15506+033991b0 04/01/2014
[  676.246460] RIP: 0010:dlm_lowcomms_commit_msg+0x41/0x50
[  676.247132] Code: fe ff ff ff 75 24 48 c7 c6 bd 0f 49 bb 48 c7 c7 38 7c 01 bd e8 00 e7 ca ff 89 de 48 c7 c7 60 78 01 bd e8 42 3d cd ff 5b 5d c3 <0f> 0b eb d8 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48
[  676.249253] RSP: 0018:ffffa401c18ffc68 EFLAGS: 00010282
[  676.249855] RAX: 0000000000000001 RBX: 00000000ffff8b76 RCX: 0000000000000006
[  676.250713] RDX: 0000000000000000 RSI: ffffffffbccf3a10 RDI: ffffffffbcc7b62e
[  676.251610] RBP: ffffa401c18ffc70 R08: 0000000000000001 R09: 0000000000000001
[  676.252481] R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000005
[  676.253421] R13: ffff8b76786ec370 R14: ffff8b76786ec370 R15: ffff8b76786ec480
[  676.254257] FS:  0000000000000000(0000) GS:ffff8b7777800000(0000) knlGS:0000000000000000
[  676.255239] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  676.255897] CR2: 00005590205d88b8 CR3: 000000017656c003 CR4: 0000000000770ee0
[  676.256734] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  676.257567] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  676.258397] PKRU: 55555554
[  676.258729] Call Trace:
[  676.259063]  <TASK>
[  676.259354]  dlm_midcomms_commit_mhandle+0xcc/0x110
[  676.259964]  queue_bast+0x8b/0xb0
[  676.260423]  grant_pending_locks+0x166/0x1b0
[  676.261007]  _unlock_lock+0x75/0x90
[  676.261469]  unlock_lock.isra.57+0x62/0xa0
[  676.262009]  dlm_unlock+0x21e/0x330
[  676.262457]  ? lock_torture_stats+0x80/0x80 [dlm_locktorture]
[  676.263183]  torture_unlock+0x5a/0x90 [dlm_locktorture]
[  676.263815]  ? preempt_count_sub+0xba/0x100
[  676.264361]  ? complete+0x1d/0x60
[  676.264777]  lock_torture_writer+0xb8/0x150 [dlm_locktorture]
[  676.265555]  kthread+0x10a/0x130
[  676.266007]  ? kthread_complete_and_exit+0x20/0x20
[  676.266616]  ret_from_fork+0x22/0x30
[  676.267097]  </TASK>
[  676.267381] irq event stamp: 9579855
[  676.267824] hardirqs last  enabled at (9579863): [<ffffffffbb14e6f8>] __up_console_sem+0x58/0x60
[  676.268896] hardirqs last disabled at (9579872): [<ffffffffbb14e6dd>] __up_console_sem+0x3d/0x60
[  676.270008] softirqs last  enabled at (9579798): [<ffffffffbc200349>] __do_softirq+0x349/0x4c7
[  676.271438] softirqs last disabled at (9579897): [<ffffffffbb0d54c0>] irq_exit_rcu+0xb0/0xf0
[  676.272796] ---[ end trace 0000000000000000 ]---

I reproduced this warning with dlm_locktorture test which is currently
not upstream. However this patch fix the issue by make a additional
refcount between dlm_lowcomms_new_msg() and dlm_lowcomms_commit_msg().
In case of the race the kref_put() in dlm_lowcomms_commit_msg() will be
the final put.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a4e84e8d94c8..59f64c596233 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1336,6 +1336,8 @@ struct dlm_msg *dlm_lowcomms_new_msg(int nodeid, int len, gfp_t allocation,
 		return NULL;
 	}
 
+	/* for dlm_lowcomms_commit_msg() */
+	kref_get(&msg->ref);
 	/* we assume if successful commit must called */
 	msg->idx = idx;
 	return msg;
@@ -1375,6 +1377,8 @@ void dlm_lowcomms_commit_msg(struct dlm_msg *msg)
 {
 	_dlm_lowcomms_commit_msg(msg);
 	srcu_read_unlock(&connections_srcu, msg->idx);
+	/* because dlm_lowcomms_new_msg() */
+	kref_put(&msg->ref, dlm_msg_release);
 }
 #endif
 
-- 
cgit v1.2.3


From eef6ec9bf390e836a6c4029f3620fe49528aa1fe Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:14 -0400
Subject: fs: dlm: fix race between test_bit() and queue_work()

This patch fixes a race by using ls_cb_mutex around the bit
operations and conditional code blocks for LSFL_CB_DELAY.

The function dlm_callback_stop() expects to stop all callbacks and
flush all currently queued onces. The set_bit() is not enough because
there can still be queue_work() after the workqueue was flushed.
To avoid queue_work() after set_bit(), surround both by ls_cb_mutex.

Cc: stable@vger.kernel.org
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 19ef136f9e4f..a44cc42b6317 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -200,13 +200,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
 	if (!prev_seq) {
 		kref_get(&lkb->lkb_ref);
 
+		mutex_lock(&ls->ls_cb_mutex);
 		if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
-			mutex_lock(&ls->ls_cb_mutex);
 			list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
-			mutex_unlock(&ls->ls_cb_mutex);
 		} else {
 			queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
 		}
+		mutex_unlock(&ls->ls_cb_mutex);
 	}
  out:
 	mutex_unlock(&lkb->lkb_cb_mutex);
@@ -288,7 +288,9 @@ void dlm_callback_stop(struct dlm_ls *ls)
 
 void dlm_callback_suspend(struct dlm_ls *ls)
 {
+	mutex_lock(&ls->ls_cb_mutex);
 	set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+	mutex_unlock(&ls->ls_cb_mutex);
 
 	if (ls->ls_callback_wq)
 		flush_workqueue(ls->ls_callback_wq);
-- 
cgit v1.2.3


From 44637ca41d551d409a481117b07fa209b330fca9 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:15 -0400
Subject: fs: dlm: handle -EBUSY first in lock arg validation

During lock arg validation, first check for -EBUSY cases, then for
-EINVAL cases. The -EINVAL checks look at lkb state variables
which are not stable when an lkb is busy and would cause an
-EBUSY result, e.g. lkb->lkb_grmode.

Cc: stable@vger.kernel.org
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index dac7eb75dba9..c23413da40f5 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2864,17 +2864,9 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
 static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 			      struct dlm_args *args)
 {
-	int rv = -EINVAL;
+	int rv = -EBUSY;
 
 	if (args->flags & DLM_LKF_CONVERT) {
-		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
-			goto out;
-
-		if (args->flags & DLM_LKF_QUECVT &&
-		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
-			goto out;
-
-		rv = -EBUSY;
 		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
 			goto out;
 
@@ -2884,6 +2876,14 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 
 		if (is_overlap(lkb))
 			goto out;
+
+		rv = -EINVAL;
+		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+			goto out;
+
+		if (args->flags & DLM_LKF_QUECVT &&
+		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+			goto out;
 	}
 
 	lkb->lkb_exflags = args->flags;
-- 
cgit v1.2.3


From 420ba3cd035a202757f0848b435d743590deee94 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:16 -0400
Subject: fs: dlm: handle -EBUSY first in unlock validation

This patch checks for -EBUSY conditions in dlm_unlock() before
checking for -EINVAL conditions (except for CANCEL and
FORCEUNLOCK calls where a busy condition is expected.)
There are no problems with the current ordering of checks,
but this makes dlm_unlock() consistent with dlm_lock(), and
may avoid future problems if other checks are added.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c23413da40f5..16d339d383cd 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2918,23 +2918,12 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
-	int rv = -EINVAL;
-
-	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
-		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
-		dlm_print_lkb(lkb);
-		goto out;
-	}
-
-	/* an lkb may still exist even though the lock is EOL'ed due to a
-	   cancel, unlock or failed noqueue request; an app can't use these
-	   locks; return same error as if the lkid had not been found at all */
+	int rv = -EBUSY;
 
-	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
-		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
-		rv = -ENOENT;
+	/* normal unlock not allowed if there's any op in progress */
+	if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
+	    (lkb->lkb_wait_type || lkb->lkb_wait_count))
 		goto out;
-	}
 
 	/* an lkb may be waiting for an rsb lookup to complete where the
 	   lookup was initiated by another lock */
@@ -2949,7 +2938,24 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 			unhold_lkb(lkb); /* undoes create_lkb() */
 		}
 		/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
-		rv = -EBUSY;
+		goto out;
+	}
+
+	rv = -EINVAL;
+	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
+		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
+		dlm_print_lkb(lkb);
+		goto out;
+	}
+
+	/* an lkb may still exist even though the lock is EOL'ed due to a
+	 * cancel, unlock or failed noqueue request; an app can't use these
+	 * locks; return same error as if the lkid had not been found at all
+	 */
+
+	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
+		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
+		rv = -ENOENT;
 		goto out;
 	}
 
@@ -3022,14 +3028,8 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 			goto out;
 		}
 		/* add_to_waiters() will set OVERLAP_UNLOCK */
-		goto out_ok;
 	}
 
-	/* normal unlock not allowed if there's any op in progress */
-	rv = -EBUSY;
-	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
-		goto out;
-
  out_ok:
 	/* an overlapping op shouldn't blow away exflags from other op */
 	lkb->lkb_exflags |= args->flags;
-- 
cgit v1.2.3


From c2d76a62d866ae9c03f09ec2a9f9fb266ad586b8 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:17 -0400
Subject: fs: dlm: use __func__ for function name

Avoid hard-coded function names inside message format strings.
(Prevents checkpatch warnings.)

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 16d339d383cd..026c203ff529 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2901,7 +2901,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	rv = 0;
  out:
 	if (rv)
-		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
+		log_debug(ls, "%s %d %x %x %x %d %d %s", __func__,
 			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
 			  lkb->lkb_status, lkb->lkb_wait_type,
 			  lkb->lkb_resource->res_name);
@@ -3038,7 +3038,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 	rv = 0;
  out:
 	if (rv)
-		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
+		log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
 			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
 			  args->flags, lkb->lkb_wait_type,
 			  lkb->lkb_resource->res_name);
-- 
cgit v1.2.3


From 9ac8ba46a701b863be3f197d7eece4c635d0afe4 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:18 -0400
Subject: fs: dlm: handle -EINVAL as log_error()

If the user generates -EINVAL it's probably because they are
using DLM incorrectly.  Change the log level to make these
errors more visible.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 026c203ff529..354f79254d62 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2900,11 +2900,25 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 #endif
 	rv = 0;
  out:
-	if (rv)
+	switch (rv) {
+	case 0:
+		break;
+	case -EINVAL:
+		/* annoy the user because dlm usage is wrong */
+		WARN_ON(1);
+		log_error(ls, "%s %d %x %x %x %d %d %s", __func__,
+			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
+			  lkb->lkb_status, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
+		break;
+	default:
 		log_debug(ls, "%s %d %x %x %x %d %d %s", __func__,
 			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
 			  lkb->lkb_status, lkb->lkb_wait_type,
 			  lkb->lkb_resource->res_name);
+		break;
+	}
+
 	return rv;
 }
 
@@ -3037,11 +3051,25 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 	lkb->lkb_astparam = args->astparam;
 	rv = 0;
  out:
-	if (rv)
+	switch (rv) {
+	case 0:
+		break;
+	case -EINVAL:
+		/* annoy the user because dlm usage is wrong */
+		WARN_ON(1);
+		log_error(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
+			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
+			  args->flags, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
+		break;
+	default:
 		log_debug(ls, "%s %d %x %x %x %x %d %s", __func__, rv,
 			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
 			  args->flags, lkb->lkb_wait_type,
 			  lkb->lkb_resource->res_name);
+		break;
+	}
+
 	return rv;
 }
 
-- 
cgit v1.2.3


From 7175e131ebba47afef47e6ac4d5bab474d1e6e49 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:19 -0400
Subject: fs: dlm: fix invalid derefence of sb_lvbptr

I experience issues when putting a lkbsb on the stack and have sb_lvbptr
field to a dangled pointer while not using DLM_LKF_VALBLK. It will crash
with the following kernel message, the dangled pointer is here
0xdeadbeef as example:

[  102.749317] BUG: unable to handle page fault for address: 00000000deadbeef
[  102.749320] #PF: supervisor read access in kernel mode
[  102.749323] #PF: error_code(0x0000) - not-present page
[  102.749325] PGD 0 P4D 0
[  102.749332] Oops: 0000 [#1] PREEMPT SMP PTI
[  102.749336] CPU: 0 PID: 1567 Comm: lock_torture_wr Tainted: G        W         5.19.0-rc3+ #1565
[  102.749343] Hardware name: Red Hat KVM/RHEL-AV, BIOS 1.16.0-2.module+el8.7.0+15506+033991b0 04/01/2014
[  102.749344] RIP: 0010:memcpy_erms+0x6/0x10
[  102.749353] Code: cc cc cc cc eb 1e 0f 1f 00 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 f3 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 <f3> a4 c3 0f 1f 80 00 00 00 00 48 89 f8 48 83 fa 20 72 7e 40 38 fe
[  102.749355] RSP: 0018:ffff97a58145fd08 EFLAGS: 00010202
[  102.749358] RAX: ffff901778b77070 RBX: 0000000000000000 RCX: 0000000000000040
[  102.749360] RDX: 0000000000000040 RSI: 00000000deadbeef RDI: ffff901778b77070
[  102.749362] RBP: ffff97a58145fd10 R08: ffff901760b67a70 R09: 0000000000000001
[  102.749364] R10: ffff9017008e2cb8 R11: 0000000000000001 R12: ffff901760b67a70
[  102.749366] R13: ffff901760b78f00 R14: 0000000000000003 R15: 0000000000000001
[  102.749368] FS:  0000000000000000(0000) GS:ffff901876e00000(0000) knlGS:0000000000000000
[  102.749372] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  102.749374] CR2: 00000000deadbeef CR3: 000000017c49a004 CR4: 0000000000770ef0
[  102.749376] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  102.749378] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  102.749379] PKRU: 55555554
[  102.749381] Call Trace:
[  102.749382]  <TASK>
[  102.749383]  ? send_args+0xb2/0xd0
[  102.749389]  send_common+0xb7/0xd0
[  102.749395]  _unlock_lock+0x2c/0x90
[  102.749400]  unlock_lock.isra.56+0x62/0xa0
[  102.749405]  dlm_unlock+0x21e/0x330
[  102.749411]  ? lock_torture_stats+0x80/0x80 [dlm_locktorture]
[  102.749416]  torture_unlock+0x5a/0x90 [dlm_locktorture]
[  102.749419]  ? preempt_count_sub+0xba/0x100
[  102.749427]  lock_torture_writer+0xbd/0x150 [dlm_locktorture]
[  102.786186]  kthread+0x10a/0x130
[  102.786581]  ? kthread_complete_and_exit+0x20/0x20
[  102.787156]  ret_from_fork+0x22/0x30
[  102.787588]  </TASK>
[  102.787855] Modules linked in: dlm_locktorture torture rpcsec_gss_krb5 intel_rapl_msr intel_rapl_common kvm_intel iTCO_wdt iTCO_vendor_support kvm vmw_vsock_virtio_transport qxl irqbypass vmw_vsock_virtio_transport_common drm_ttm_helper crc32_pclmul joydev crc32c_intel ttm vsock virtio_scsi virtio_balloon snd_pcm drm_kms_helper virtio_console snd_timer snd drm soundcore syscopyarea i2c_i801 sysfillrect sysimgblt i2c_smbus pcspkr fb_sys_fops lpc_ich serio_raw
[  102.792536] CR2: 00000000deadbeef
[  102.792930] ---[ end trace 0000000000000000 ]---

This patch fixes the issue by checking also on DLM_LKF_VALBLK on exflags
is set when copying the lvbptr array instead of if it's just null which
fixes for me the issue.

I think this patch can fix other dlm users as well, depending how they
handle the init, freeing memory handling of sb_lvbptr and don't set
DLM_LKF_VALBLK for some dlm_lock() calls. It might a there could be a
hidden issue all the time. However with checking on DLM_LKF_VALBLK the
user always need to provide a sb_lvbptr non-null value. There might be
more intelligent handling between per ls lvblen, DLM_LKF_VALBLK and
non-null to report the user the way how DLM API is used is wrong but can
be added for later, this will only fix the current behaviour.

Cc: stable@vger.kernel.org
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 354f79254d62..da95ba3c295e 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3651,7 +3651,7 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
 	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
 	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
 	case cpu_to_le32(DLM_MSG_GRANT):
-		if (!lkb->lkb_lvbptr)
+		if (!lkb->lkb_lvbptr || !(lkb->lkb_exflags & DLM_LKF_VALBLK))
 			break;
 		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
 		break;
-- 
cgit v1.2.3


From b5c9d37c7f6051b45b2da70e434e791080dcb53f Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:20 -0400
Subject: fs: dlm: allow lockspaces have zero lvblen

A dlm user may not use the DLM_LKF_VALBLK flag in the DLM API,
so a zero lvblen should be allowed as a lockspace parameter.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lockspace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 3972f4d86c75..56c79926e7be 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -416,7 +416,7 @@ static int new_lockspace(const char *name, const char *cluster,
 	if (namelen > DLM_LOCKSPACE_LEN || namelen == 0)
 		return -EINVAL;
 
-	if (!lvblen || (lvblen % 8))
+	if (lvblen % 8)
 		return -EINVAL;
 
 	if (!try_module_get(THIS_MODULE))
-- 
cgit v1.2.3


From f45307d395da7ab1aa537cf00e8fd7123a2be4e2 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:21 -0400
Subject: fs: dlm: handle rcom in else if branch

Currently we handle in dlm_receive_buffer() everything else than a
DLM_MSG type as DLM_RCOM message. Although a different message than
DLM_MSG should be a DLM_RCOM we should explicit check on DLM_RCOM and
drop a log_error() if we see something unexpected.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index da95ba3c295e..c41aa8ab3230 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5108,8 +5108,11 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
 	down_read(&ls->ls_recv_active);
 	if (hd->h_cmd == DLM_MSG)
 		dlm_receive_message(ls, &p->message, nodeid);
-	else
+	else if (hd->h_cmd == DLM_RCOM)
 		dlm_receive_rcom(ls, &p->rcom, nodeid);
+	else
+		log_error(ls, "invalid h_cmd %d from %d lockspace %x",
+			  hd->h_cmd, nodeid, le32_to_cpu(hd->u.h_lockspace));
 	up_read(&ls->ls_recv_active);
 
 	dlm_put_lockspace(ls);
-- 
cgit v1.2.3


From e152c38dc0e8f1f7f8aceccbcfb5b019ca3a4352 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:22 -0400
Subject: fs: dlm: remove dlm_del_ast prototype

This patch removes dlm_del_ast() prototype which is not being used in
the dlm subsystem because there is not implementation for it.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index 181ad7d20c4d..e5e05fcc5813 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -11,7 +11,6 @@
 #ifndef __ASTD_DOT_H__
 #define __ASTD_DOT_H__
 
-void dlm_del_ast(struct dlm_lkb *lkb);
 int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
                          int status, uint32_t sbflags, uint64_t seq);
 int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
-- 
cgit v1.2.3


From 296d9d1e9890830bd149105cf0193c2cf7d5bc86 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:23 -0400
Subject: fs: dlm: change ls_clear_proc_locks to spinlock

This patch changes the ls_clear_proc_locks to a spinlock because there
is no need to handle it as a mutex as there is no sleepable context when
ls_clear_proc_locks is held. This allows us to call those functionality
in non-sleepable contexts.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/dlm_internal.h | 2 +-
 fs/dlm/lock.c         | 8 ++++----
 fs/dlm/lockspace.c    | 2 +-
 fs/dlm/user.c         | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 8aca8085d24e..e34c3d2639a5 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -661,7 +661,7 @@ struct dlm_ls {
 	spinlock_t		ls_recover_idr_lock;
 	wait_queue_head_t	ls_wait_general;
 	wait_queue_head_t	ls_recover_lock_wait;
-	struct mutex		ls_clear_proc_locks;
+	spinlock_t		ls_clear_proc_locks;
 
 	struct list_head	ls_root_list;	/* root resources */
 	struct rw_semaphore	ls_root_sem;	/* protect root_list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c41aa8ab3230..65a7a0631ec8 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -6215,7 +6215,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
 {
 	struct dlm_lkb *lkb = NULL;
 
-	mutex_lock(&ls->ls_clear_proc_locks);
+	spin_lock(&ls->ls_clear_proc_locks);
 	if (list_empty(&proc->locks))
 		goto out;
 
@@ -6227,7 +6227,7 @@ static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
 	else
 		lkb->lkb_flags |= DLM_IFL_DEAD;
  out:
-	mutex_unlock(&ls->ls_clear_proc_locks);
+	spin_unlock(&ls->ls_clear_proc_locks);
 	return lkb;
 }
 
@@ -6264,7 +6264,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 		dlm_put_lkb(lkb);
 	}
 
-	mutex_lock(&ls->ls_clear_proc_locks);
+	spin_lock(&ls->ls_clear_proc_locks);
 
 	/* in-progress unlocks */
 	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
@@ -6280,7 +6280,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
 		dlm_put_lkb(lkb);
 	}
 
-	mutex_unlock(&ls->ls_clear_proc_locks);
+	spin_unlock(&ls->ls_clear_proc_locks);
 	dlm_unlock_recovery(ls);
 }
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 56c79926e7be..41a6504cfab5 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -584,7 +584,7 @@ static int new_lockspace(const char *name, const char *cluster,
 	atomic_set(&ls->ls_requestqueue_cnt, 0);
 	init_waitqueue_head(&ls->ls_requestqueue_wait);
 	mutex_init(&ls->ls_requestqueue_mutex);
-	mutex_init(&ls->ls_clear_proc_locks);
+	spin_lock_init(&ls->ls_clear_proc_locks);
 
 	/* Due backwards compatibility with 3.1 we need to use maximum
 	 * possible dlm message size to be sure the message will fit and
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 99e8f0744513..df6215c73239 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -184,7 +184,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
 		return;
 
 	ls = lkb->lkb_resource->res_ls;
-	mutex_lock(&ls->ls_clear_proc_locks);
+	spin_lock(&ls->ls_clear_proc_locks);
 
 	/* If ORPHAN/DEAD flag is set, it means the process is dead so an ast
 	   can't be delivered.  For ORPHAN's, dlm_clear_proc_locks() freed
@@ -230,7 +230,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
 		spin_unlock(&proc->locks_spin);
 	}
  out:
-	mutex_unlock(&ls->ls_clear_proc_locks);
+	spin_unlock(&ls->ls_clear_proc_locks);
 }
 
 static int device_user_lock(struct dlm_user_proc *proc,
-- 
cgit v1.2.3


From 7a3de7324c2b1299a4f595bb6aa503c878ad7d75 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:24 -0400
Subject: fs: dlm: trace user space callbacks

This patch adds trace callbacks for user locks. Unfortenately user locks
are handled in a different way than kernel locks in some cases. User
locks never call the dlm_lock()/dlm_unlock() kernel API and use the next
step internal API of dlm. Adding those traces from user API callers
should make it possible for dlm trace system to see lock handling for
user locks as well.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c              | 24 ++++++++++++++++++++----
 fs/dlm/user.c              |  7 ++++++-
 include/trace/events/dlm.h | 22 ++++++++++++----------
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 65a7a0631ec8..cef25f8ac82e 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -3466,7 +3466,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
 	if (error == -EINPROGRESS)
 		error = 0;
  out_put:
-	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error);
+	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, true);
 
 	if (convert || error)
 		__put_lkb(ls, lkb);
@@ -5842,13 +5842,15 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		goto out;
 	}
 
+	trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+
 	if (flags & DLM_LKF_VALBLK) {
 		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
 		if (!ua->lksb.sb_lvbptr) {
 			kfree(ua);
 			__put_lkb(ls, lkb);
 			error = -ENOMEM;
-			goto out;
+			goto out_trace_end;
 		}
 	}
 #ifdef CONFIG_DLM_DEPRECATED_API
@@ -5863,7 +5865,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		ua->lksb.sb_lvbptr = NULL;
 		kfree(ua);
 		__put_lkb(ls, lkb);
-		goto out;
+		goto out_trace_end;
 	}
 
 	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
@@ -5883,7 +5885,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		fallthrough;
 	default:
 		__put_lkb(ls, lkb);
-		goto out;
+		goto out_trace_end;
 	}
 
 	/* add this new lkb to the per-process list of locks */
@@ -5891,6 +5893,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	hold_lkb(lkb);
 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
 	spin_unlock(&ua->proc->locks_spin);
+ out_trace_end:
+	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false);
  out:
 	dlm_unlock_recovery(ls);
 	return error;
@@ -5916,6 +5920,8 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	if (error)
 		goto out;
 
+	trace_dlm_lock_start(ls, lkb, NULL, 0, mode, flags);
+
 	/* user can change the params on its lock when it converts it, or
 	   add an lvb that didn't exist before */
 
@@ -5953,6 +5959,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
 		error = 0;
  out_put:
+	trace_dlm_lock_end(ls, lkb, NULL, 0, mode, flags, error, false);
 	dlm_put_lkb(lkb);
  out:
 	dlm_unlock_recovery(ls);
@@ -6045,6 +6052,8 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	if (error)
 		goto out;
 
+	trace_dlm_unlock_start(ls, lkb, flags);
+
 	ua = lkb->lkb_ua;
 
 	if (lvb_in && ua->lksb.sb_lvbptr)
@@ -6073,6 +6082,7 @@ int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
 	spin_unlock(&ua->proc->locks_spin);
  out_put:
+	trace_dlm_unlock_end(ls, lkb, flags, error);
 	dlm_put_lkb(lkb);
  out:
 	dlm_unlock_recovery(ls);
@@ -6094,6 +6104,8 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	if (error)
 		goto out;
 
+	trace_dlm_unlock_start(ls, lkb, flags);
+
 	ua = lkb->lkb_ua;
 	if (ua_tmp->castparam)
 		ua->castparam = ua_tmp->castparam;
@@ -6111,6 +6123,7 @@ int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
 	if (error == -EBUSY)
 		error = 0;
  out_put:
+	trace_dlm_unlock_end(ls, lkb, flags, error);
 	dlm_put_lkb(lkb);
  out:
 	dlm_unlock_recovery(ls);
@@ -6132,6 +6145,8 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
 	if (error)
 		goto out;
 
+	trace_dlm_unlock_start(ls, lkb, flags);
+
 	ua = lkb->lkb_ua;
 
 	error = set_unlock_args(flags, ua, &args);
@@ -6160,6 +6175,7 @@ int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
 	if (error == -EBUSY)
 		error = 0;
  out_put:
+	trace_dlm_unlock_end(ls, lkb, flags, error);
 	dlm_put_lkb(lkb);
  out:
 	dlm_unlock_recovery(ls);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index df6215c73239..ca27f276a3f5 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -16,6 +16,8 @@
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
 
+#include <trace/events/dlm.h>
+
 #include "dlm_internal.h"
 #include "lockspace.h"
 #include "lock.h"
@@ -882,7 +884,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 		goto try_another;
 	}
 
-	if (cb.flags & DLM_CB_CAST) {
+	if (cb.flags & DLM_CB_BAST) {
+		trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb.mode);
+	} else if (cb.flags & DLM_CB_CAST) {
 		new_mode = cb.mode;
 
 		if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
@@ -891,6 +895,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 
 		lkb->lkb_lksb->sb_status = cb.sb_status;
 		lkb->lkb_lksb->sb_flags = cb.sb_flags;
+		trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
 	}
 
 	rv = copy_result_to_user(lkb->lkb_ua,
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index bad21222130e..18575206295f 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -92,9 +92,10 @@ TRACE_EVENT(dlm_lock_start,
 TRACE_EVENT(dlm_lock_end,
 
 	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
-		 unsigned int namelen, int mode, __u32 flags, int error),
+		 unsigned int namelen, int mode, __u32 flags, int error,
+		 bool kernel_lock),
 
-	TP_ARGS(ls, lkb, name, namelen, mode, flags, error),
+	TP_ARGS(ls, lkb, name, namelen, mode, flags, error, kernel_lock),
 
 	TP_STRUCT__entry(
 		__field(__u32, ls_id)
@@ -113,6 +114,7 @@ TRACE_EVENT(dlm_lock_end,
 		__entry->lkb_id = lkb->lkb_id;
 		__entry->mode = mode;
 		__entry->flags = flags;
+		__entry->error = error;
 
 		r = lkb->lkb_resource;
 		if (r)
@@ -122,14 +124,14 @@ TRACE_EVENT(dlm_lock_end,
 			memcpy(__get_dynamic_array(res_name), name,
 			       __get_dynamic_array_len(res_name));
 
-		/* return value will be zeroed in those cases by dlm_lock()
-		 * we do it here again to not introduce more overhead if
-		 * trace isn't running and error reflects the return value.
-		 */
-		if (error == -EAGAIN || error == -EDEADLK)
-			__entry->error = 0;
-		else
-			__entry->error = error;
+		if (kernel_lock) {
+			/* return value will be zeroed in those cases by dlm_lock()
+			 * we do it here again to not introduce more overhead if
+			 * trace isn't running and error reflects the return value.
+			 */
+			if (error == -EAGAIN || error == -EDEADLK)
+				__entry->error = 0;
+		}
 
 	),
 
-- 
cgit v1.2.3


From 12cda13cfd5310bbfefdfe32a82489228e2e0381 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:25 -0400
Subject: fs: dlm: remove DLM_LSFL_FS from uapi

The DLM_LSFL_FS flag is set in lockspaces created directly
for a kernel user, as opposed to those lockspaces created
for user space applications.  The user space libdlm allowed
this flag to be set for lockspaces created from user space,
but then used by a kernel user.  No kernel user has ever
used this method, so remove the ability to do it.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 drivers/md/md-cluster.c  |  4 ++--
 fs/dlm/lockspace.c       | 28 ++++++++++++++++++++++++----
 fs/dlm/lockspace.h       | 13 +++++++++++++
 fs/dlm/user.c            |  6 +++---
 fs/gfs2/lock_dlm.c       |  2 +-
 fs/ocfs2/stack_user.c    |  2 +-
 include/linux/dlm.h      |  3 ---
 include/uapi/linux/dlm.h |  1 -
 8 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 742b2349fea3..10e0c5381d01 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -876,8 +876,8 @@ static int join(struct mddev *mddev, int nodes)
 	memset(str, 0, 64);
 	sprintf(str, "%pU", mddev->uuid);
 	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
-				DLM_LSFL_FS, LVB_SIZE,
-				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
+				0, LVB_SIZE, &md_ls_ops, mddev,
+				&ops_rv, &cinfo->lockspace);
 	if (ret)
 		goto err;
 	wait_for_completion(&cinfo->completion);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 41a6504cfab5..bae050df7abf 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -703,10 +703,11 @@ static int new_lockspace(const char *name, const char *cluster,
 	return error;
 }
 
-int dlm_new_lockspace(const char *name, const char *cluster,
-		      uint32_t flags, int lvblen,
-		      const struct dlm_lockspace_ops *ops, void *ops_arg,
-		      int *ops_result, dlm_lockspace_t **lockspace)
+static int __dlm_new_lockspace(const char *name, const char *cluster,
+			       uint32_t flags, int lvblen,
+			       const struct dlm_lockspace_ops *ops,
+			       void *ops_arg, int *ops_result,
+			       dlm_lockspace_t **lockspace)
 {
 	int error = 0;
 
@@ -732,6 +733,25 @@ int dlm_new_lockspace(const char *name, const char *cluster,
 	return error;
 }
 
+int dlm_new_lockspace(const char *name, const char *cluster, uint32_t flags,
+		      int lvblen, const struct dlm_lockspace_ops *ops,
+		      void *ops_arg, int *ops_result,
+		      dlm_lockspace_t **lockspace)
+{
+	return __dlm_new_lockspace(name, cluster, flags | DLM_LSFL_FS, lvblen,
+				   ops, ops_arg, ops_result, lockspace);
+}
+
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+			   uint32_t flags, int lvblen,
+			   const struct dlm_lockspace_ops *ops,
+			   void *ops_arg, int *ops_result,
+			   dlm_lockspace_t **lockspace)
+{
+	return __dlm_new_lockspace(name, cluster, flags, lvblen, ops,
+				   ops_arg, ops_result, lockspace);
+}
+
 static int lkb_idr_is_local(int id, void *p, void *data)
 {
 	struct dlm_lkb *lkb = p;
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 306fc4f4ea15..03f4a4a3a871 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -12,6 +12,14 @@
 #ifndef __LOCKSPACE_DOT_H__
 #define __LOCKSPACE_DOT_H__
 
+/* DLM_LSFL_FS
+ *   The lockspace user is in the kernel (i.e. filesystem).  Enables
+ *   direct bast/cast callbacks.
+ *
+ * internal lockspace flag - will be removed in future
+ */
+#define DLM_LSFL_FS	0x00000004
+
 int dlm_lockspace_init(void);
 void dlm_lockspace_exit(void);
 struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
@@ -20,6 +28,11 @@ struct dlm_ls *dlm_find_lockspace_device(int minor);
 void dlm_put_lockspace(struct dlm_ls *ls);
 void dlm_stop_lockspaces(void);
 void dlm_stop_lockspaces_check(void);
+int dlm_new_user_lockspace(const char *name, const char *cluster,
+			   uint32_t flags, int lvblen,
+			   const struct dlm_lockspace_ops *ops,
+			   void *ops_arg, int *ops_result,
+			   dlm_lockspace_t **lockspace);
 
 #endif				/* __LOCKSPACE_DOT_H__ */
 
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ca27f276a3f5..c5d27bccc3dc 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -423,9 +423,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	error = dlm_new_lockspace(params->name, dlm_config.ci_cluster_name, params->flags,
-				  DLM_USER_LVB_LEN, NULL, NULL, NULL,
-				  &lockspace);
+	error = dlm_new_user_lockspace(params->name, dlm_config.ci_cluster_name,
+				       params->flags, DLM_USER_LVB_LEN, NULL,
+				       NULL, NULL, &lockspace);
 	if (error)
 		return error;
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6ce369b096d4..71911bf9ab34 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1302,7 +1302,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
 	memcpy(cluster, table, strlen(table) - strlen(fsname));
 	fsname++;
 
-	flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
+	flags = DLM_LSFL_NEWEXCL;
 
 	/*
 	 * create/join lockspace
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a75e2b7d67f5..64e6ddcfe329 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -991,7 +991,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 	lc->oc_type = NO_CONTROLD;
 
 	rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
-			       DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
+			       DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
 			       &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
 	if (rc) {
 		if (rc == -EEXIST || rc == -EPROTO)
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
index ff951e9f6f20..f5f55c2138ae 100644
--- a/include/linux/dlm.h
+++ b/include/linux/dlm.h
@@ -56,9 +56,6 @@ struct dlm_lockspace_ops {
  * DLM_LSFL_TIMEWARN
  *   The dlm should emit netlink messages if locks have been waiting
  *   for a configurable amount of time.  (Unused.)
- * DLM_LSFL_FS
- *   The lockspace user is in the kernel (i.e. filesystem).  Enables
- *   direct bast/cast callbacks.
  * DLM_LSFL_NEWEXCL
  *   dlm_new_lockspace() should return -EEXIST if the lockspace exists.
  *
diff --git a/include/uapi/linux/dlm.h b/include/uapi/linux/dlm.h
index 0d2eca287567..1923f4f3b05e 100644
--- a/include/uapi/linux/dlm.h
+++ b/include/uapi/linux/dlm.h
@@ -69,7 +69,6 @@ struct dlm_lksb {
 /* dlm_new_lockspace() flags */
 
 #define DLM_LSFL_TIMEWARN	0x00000002
-#define DLM_LSFL_FS     	0x00000004
 #define DLM_LSFL_NEWEXCL     	0x00000008
 
 
-- 
cgit v1.2.3


From 9cb16d42717b114e8dec9d534a9d807b618e9f92 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:26 -0400
Subject: fs: dlm: LSFL_CB_DELAY only for kernel lockspaces

This patch only set/clear the LSFL_CB_DELAY bit when it's actually a
kernel lockspace signaled by if ls->ls_callback_wq is set or not set in
this case. User lockspaces will never evaluate this flag.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index a44cc42b6317..d60a8d8f109d 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -288,12 +288,13 @@ void dlm_callback_stop(struct dlm_ls *ls)
 
 void dlm_callback_suspend(struct dlm_ls *ls)
 {
-	mutex_lock(&ls->ls_cb_mutex);
-	set_bit(LSFL_CB_DELAY, &ls->ls_flags);
-	mutex_unlock(&ls->ls_cb_mutex);
+	if (ls->ls_callback_wq) {
+		mutex_lock(&ls->ls_cb_mutex);
+		set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+		mutex_unlock(&ls->ls_cb_mutex);
 
-	if (ls->ls_callback_wq)
 		flush_workqueue(ls->ls_callback_wq);
+	}
 }
 
 #define MAX_CB_QUEUE 25
@@ -304,11 +305,11 @@ void dlm_callback_resume(struct dlm_ls *ls)
 	int count = 0, sum = 0;
 	bool empty;
 
-	clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
-
 	if (!ls->ls_callback_wq)
 		return;
 
+	clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
+
 more:
 	mutex_lock(&ls->ls_cb_mutex);
 	list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
-- 
cgit v1.2.3


From 56171e0db23a5e0edce1596dd2792b95ffe57bd3 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Mon, 15 Aug 2022 15:43:27 -0400
Subject: fs: dlm: const void resource name parameter

The resource name parameter should never be changed by DLM so we declare
it as const. At some point it is handled as a char pointer, a resource
name can be a non printable ascii string as well. This patch change it
to handle it as void pointer as it is offered by DLM API.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c              | 23 +++++++++++++----------
 fs/dlm/lock.h              |  2 +-
 include/linux/dlm.h        |  2 +-
 include/trace/events/dlm.h |  4 ++--
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index cef25f8ac82e..c830feb26384 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -401,7 +401,7 @@ static int pre_rsb_struct(struct dlm_ls *ls)
    unlock any spinlocks, go back and call pre_rsb_struct again.
    Otherwise, take an rsb off the list and return it. */
 
-static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
+static int get_rsb_struct(struct dlm_ls *ls, const void *name, int len,
 			  struct dlm_rsb **r_ret)
 {
 	struct dlm_rsb *r;
@@ -412,7 +412,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
 		count = ls->ls_new_rsb_count;
 		spin_unlock(&ls->ls_new_rsb_spin);
 		log_debug(ls, "find_rsb retry %d %d %s",
-			  count, dlm_config.ci_new_rsb_count, name);
+			  count, dlm_config.ci_new_rsb_count,
+			  (const char *)name);
 		return -EAGAIN;
 	}
 
@@ -448,7 +449,7 @@ static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
 	return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
 }
 
-int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
 			struct dlm_rsb **r_ret)
 {
 	struct rb_node *node = tree->rb_node;
@@ -546,7 +547,7 @@ static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
  * while that rsb has a potentially stale master.)
  */
 
-static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
+static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
 			uint32_t hash, uint32_t b,
 			int dir_nodeid, int from_nodeid,
 			unsigned int flags, struct dlm_rsb **r_ret)
@@ -724,7 +725,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
    dlm_recover_locks) before we've made ourself master (in
    dlm_recover_masters). */
 
-static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
+static int find_rsb_nodir(struct dlm_ls *ls, const void *name, int len,
 			  uint32_t hash, uint32_t b,
 			  int dir_nodeid, int from_nodeid,
 			  unsigned int flags, struct dlm_rsb **r_ret)
@@ -818,8 +819,9 @@ static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
 	return error;
 }
 
-static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
-		    unsigned int flags, struct dlm_rsb **r_ret)
+static int find_rsb(struct dlm_ls *ls, const void *name, int len,
+		    int from_nodeid, unsigned int flags,
+		    struct dlm_rsb **r_ret)
 {
 	uint32_t hash, b;
 	int dir_nodeid;
@@ -3320,8 +3322,9 @@ static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
  * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
  */
 
-static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
-			int len, struct dlm_args *args)
+static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
+			const void *name, int len,
+			struct dlm_args *args)
 {
 	struct dlm_rsb *r;
 	int error;
@@ -3420,7 +3423,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
 	     int mode,
 	     struct dlm_lksb *lksb,
 	     uint32_t flags,
-	     void *name,
+	     const void *name,
 	     unsigned int namelen,
 	     uint32_t parent_lkid,
 	     void (*ast) (void *astarg),
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index a7b6474f009d..40c76b5544da 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -36,7 +36,7 @@ static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { }
 int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
 		      unsigned int flags, int *r_nodeid, int *result);
 
-int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
+int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
 			struct dlm_rsb **r_ret);
 
 void dlm_recover_purge(struct dlm_ls *ls);
diff --git a/include/linux/dlm.h b/include/linux/dlm.h
index f5f55c2138ae..c6bc2b5ee7e6 100644
--- a/include/linux/dlm.h
+++ b/include/linux/dlm.h
@@ -131,7 +131,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
 	     int mode,
 	     struct dlm_lksb *lksb,
 	     uint32_t flags,
-	     void *name,
+	     const void *name,
 	     unsigned int namelen,
 	     uint32_t parent_lkid,
 	     void (*lockast) (void *astarg),
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index 18575206295f..da0eaae98fa3 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -49,7 +49,7 @@
 /* note: we begin tracing dlm_lock_start() only if ls and lkb are found */
 TRACE_EVENT(dlm_lock_start,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
+	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, const void *name,
 		 unsigned int namelen, int mode, __u32 flags),
 
 	TP_ARGS(ls, lkb, name, namelen, mode, flags),
@@ -91,7 +91,7 @@ TRACE_EVENT(dlm_lock_start,
 
 TRACE_EVENT(dlm_lock_end,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
+	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, const void *name,
 		 unsigned int namelen, int mode, __u32 flags, int error,
 		 bool kernel_lock),
 
-- 
cgit v1.2.3


From c926087eb38520b268515ae1a842db6db62554cc Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Fri, 5 Aug 2022 10:16:44 -0400
Subject: x86/mm: Print likely CPU at segfault time

In a large enough fleet of computers, it is common to have a few bad CPUs.
Those can often be identified by seeing that some commonly run kernel code,
which runs fine everywhere else, keeps crashing on the same CPU core on one
particular bad system.

However, the failure modes in CPUs that have gone bad over the years are
often oddly specific, and the only bad behavior seen might be segfaults
in programs like bash, python, or various system daemons that run fine
everywhere else.

Add a printk() to show_signal_msg() to print the CPU, core, and socket
at segfault time.

This is not perfect, since the task might get rescheduled on another
CPU between when the fault hit, and when the message is printed, but in
practice this has been good enough to help people identify several bad
CPU cores.

For example:

  segfault[1349]: segfault at 0 ip 000000000040113a sp 00007ffc6d32e360 error 4 in \
	  segfault[401000+1000] likely on CPU 0 (core 0, socket 0)

This printk can be controlled through /proc/sys/debug/exception-trace.

  [ bp: Massage a bit, add "likely" to the printed line to denote that
    the CPU number is not always reliable. ]

Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220805101644.2e674553@imladris.surriel.com
---
 arch/x86/mm/fault.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fa71a5d12e87..a498ae1fbe66 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -769,6 +769,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 		unsigned long address, struct task_struct *tsk)
 {
 	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+	/* This is a racy snapshot, but it's better than nothing. */
+	int cpu = raw_smp_processor_id();
 
 	if (!unhandled_signal(tsk, SIGSEGV))
 		return;
@@ -782,6 +784,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 
 	print_vma_addr(KERN_CONT " in ", regs->ip);
 
+	/*
+	 * Dump the likely CPU where the fatal segfault happened.
+	 * This can help identify faulty hardware.
+	 */
+	printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
+	       topology_core_id(cpu), topology_physical_package_id(cpu));
+
+
 	printk(KERN_CONT "\n");
 
 	show_opcodes(regs, loglvl);
-- 
cgit v1.2.3


From 62fcb99bdf10fed34b4fe6e225489fe4be2d0536 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 24 Aug 2022 18:59:48 +0200
Subject: ACPI: Drop parent field from struct acpi_device

The parent field in struct acpi_device is, in fact, redundant,
because the dev.parent field in it effectively points to the same
object and it is used by the driver core.

Accordingly, the parent field can be dropped from struct acpi_device
and for this purpose define acpi_dev_parent() to retrieve a parent
struct acpi_device pointer from the dev.parent field in struct
acpi_device.  Next, update all of the users of the parent field
in struct acpi_device to use acpi_dev_parent() instead of it and
drop it.

While at it, drop the ACPI_IS_ROOT_DEVICE() macro that is only used
in one place in a confusing way.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Acked-by: Wei Liu <wei.liu@kernel.org>
Reviewed-by: Punit Agrawal <punit.agrawal@bytedance.com>
---
 drivers/acpi/acpi_amba.c     |  5 +++--
 drivers/acpi/acpi_platform.c |  6 +++---
 drivers/acpi/acpi_video.c    |  2 +-
 drivers/acpi/device_pm.c     | 19 ++++++++++---------
 drivers/acpi/property.c      |  6 ++++--
 drivers/acpi/sbs.c           |  2 +-
 drivers/acpi/sbshc.c         |  2 +-
 drivers/acpi/scan.c          | 17 ++++++-----------
 drivers/hv/vmbus_drv.c       |  3 ++-
 drivers/perf/arm_dsu_pmu.c   |  4 ++--
 drivers/perf/qcom_l3_pmu.c   |  3 ++-
 drivers/spi/spi.c            |  2 +-
 drivers/thunderbolt/acpi.c   |  2 +-
 include/acpi/acpi_bus.h      | 10 +++++++++-
 14 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/drivers/acpi/acpi_amba.c b/drivers/acpi/acpi_amba.c
index ab8a4e0191b1..9d69546db2a4 100644
--- a/drivers/acpi/acpi_amba.c
+++ b/drivers/acpi/acpi_amba.c
@@ -48,6 +48,7 @@ static void amba_register_dummy_clk(void)
 static int amba_handler_attach(struct acpi_device *adev,
 				const struct acpi_device_id *id)
 {
+	struct acpi_device *parent = acpi_dev_parent(adev);
 	struct amba_device *dev;
 	struct resource_entry *rentry;
 	struct list_head resource_list;
@@ -97,8 +98,8 @@ static int amba_handler_attach(struct acpi_device *adev,
 	 * attached to it, that physical device should be the parent of
 	 * the amba device we are about to create.
 	 */
-	if (adev->parent)
-		dev->dev.parent = acpi_get_first_physical_node(adev->parent);
+	if (parent)
+		dev->dev.parent = acpi_get_first_physical_node(parent);
 
 	ACPI_COMPANION_SET(&dev->dev, adev);
 
diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index de3cbf152dee..1a1c78b23fba 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -78,7 +78,7 @@ static void acpi_platform_fill_resource(struct acpi_device *adev,
 	 * If the device has parent we need to take its resources into
 	 * account as well because this device might consume part of those.
 	 */
-	parent = acpi_get_first_physical_node(adev->parent);
+	parent = acpi_get_first_physical_node(acpi_dev_parent(adev));
 	if (parent && dev_is_pci(parent))
 		dest->parent = pci_find_resource(to_pci_dev(parent), dest);
 }
@@ -97,6 +97,7 @@ static void acpi_platform_fill_resource(struct acpi_device *adev,
 struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
 						    const struct property_entry *properties)
 {
+	struct acpi_device *parent = acpi_dev_parent(adev);
 	struct platform_device *pdev = NULL;
 	struct platform_device_info pdevinfo;
 	struct resource_entry *rentry;
@@ -137,8 +138,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
 	 * attached to it, that physical device should be the parent of the
 	 * platform device we are about to create.
 	 */
-	pdevinfo.parent = adev->parent ?
-		acpi_get_first_physical_node(adev->parent) : NULL;
+	pdevinfo.parent = parent ? acpi_get_first_physical_node(parent) : NULL;
 	pdevinfo.name = dev_name(&adev->dev);
 	pdevinfo.id = -1;
 	pdevinfo.res = resources;
diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index 5cbe2196176d..92a6c6f9a9a7 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -2030,7 +2030,7 @@ static int acpi_video_bus_add(struct acpi_device *device)
 	acpi_status status;
 
 	status = acpi_walk_namespace(ACPI_TYPE_DEVICE,
-				device->parent->handle, 1,
+				acpi_dev_parent(device)->handle, 1,
 				acpi_video_bus_match, NULL,
 				device, NULL);
 	if (status == AE_ALREADY_EXISTS) {
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 2b7e08d54c3f..028d8d14cd44 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -74,6 +74,7 @@ static int acpi_dev_pm_explicit_get(struct acpi_device *device, int *state)
  */
 int acpi_device_get_power(struct acpi_device *device, int *state)
 {
+	struct acpi_device *parent = acpi_dev_parent(device);
 	int result = ACPI_STATE_UNKNOWN;
 	int error;
 
@@ -82,8 +83,7 @@ int acpi_device_get_power(struct acpi_device *device, int *state)
 
 	if (!device->flags.power_manageable) {
 		/* TBD: Non-recursive algorithm for walking up hierarchy. */
-		*state = device->parent ?
-			device->parent->power.state : ACPI_STATE_D0;
+		*state = parent ? parent->power.state : ACPI_STATE_D0;
 		goto out;
 	}
 
@@ -122,10 +122,10 @@ int acpi_device_get_power(struct acpi_device *device, int *state)
 	 * point, the fact that the device is in D0 implies that the parent has
 	 * to be in D0 too, except if ignore_parent is set.
 	 */
-	if (!device->power.flags.ignore_parent && device->parent
-	    && device->parent->power.state == ACPI_STATE_UNKNOWN
-	    && result == ACPI_STATE_D0)
-		device->parent->power.state = ACPI_STATE_D0;
+	if (!device->power.flags.ignore_parent && parent &&
+	    parent->power.state == ACPI_STATE_UNKNOWN &&
+	    result == ACPI_STATE_D0)
+		parent->power.state = ACPI_STATE_D0;
 
 	*state = result;
 
@@ -159,6 +159,7 @@ static int acpi_dev_pm_explicit_set(struct acpi_device *adev, int state)
  */
 int acpi_device_set_power(struct acpi_device *device, int state)
 {
+	struct acpi_device *parent = acpi_dev_parent(device);
 	int target_state = state;
 	int result = 0;
 
@@ -191,12 +192,12 @@ int acpi_device_set_power(struct acpi_device *device, int state)
 		return -ENODEV;
 	}
 
-	if (!device->power.flags.ignore_parent && device->parent &&
-	    state < device->parent->power.state) {
+	if (!device->power.flags.ignore_parent && parent &&
+	    state < parent->power.state) {
 		acpi_handle_debug(device->handle,
 				  "Cannot transition to %s for parent in %s\n",
 				  acpi_power_state_string(state),
-				  acpi_power_state_string(device->parent->power.state));
+				  acpi_power_state_string(parent->power.state));
 		return -ENODEV;
 	}
 
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 7b3ad8ed2f4e..3dab3ac9c4a1 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -304,8 +304,10 @@ static void acpi_init_of_compatible(struct acpi_device *adev)
 		ret = acpi_dev_get_property(adev, "compatible",
 					    ACPI_TYPE_STRING, &of_compatible);
 		if (ret) {
-			if (adev->parent
-			    && adev->parent->flags.of_compatible_ok)
+			struct acpi_device *parent;
+
+			parent = acpi_dev_parent(adev);
+			if (parent && parent->flags.of_compatible_ok)
 				goto out;
 
 			return;
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 4938010fcac7..e6a01a8df1b8 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -632,7 +632,7 @@ static int acpi_sbs_add(struct acpi_device *device)
 
 	mutex_init(&sbs->lock);
 
-	sbs->hc = acpi_driver_data(device->parent);
+	sbs->hc = acpi_driver_data(acpi_dev_parent(device));
 	sbs->device = device;
 	strcpy(acpi_device_name(device), ACPI_SBS_DEVICE_NAME);
 	strcpy(acpi_device_class(device), ACPI_SBS_CLASS);
diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c
index 7c62e149a7a1..340e0b61587e 100644
--- a/drivers/acpi/sbshc.c
+++ b/drivers/acpi/sbshc.c
@@ -266,7 +266,7 @@ static int acpi_smbus_hc_add(struct acpi_device *device)
 	mutex_init(&hc->lock);
 	init_waitqueue_head(&hc->wait);
 
-	hc->ec = acpi_driver_data(device->parent);
+	hc->ec = acpi_driver_data(acpi_dev_parent(device));
 	hc->offset = (val >> 8) & 0xff;
 	hc->query_bit = val & 0xff;
 	device->driver_data = hc;
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 0d9c350608bf..f6b9ea937de1 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -29,8 +29,6 @@ extern struct acpi_device *acpi_root;
 #define ACPI_BUS_HID			"LNXSYBUS"
 #define ACPI_BUS_DEVICE_NAME		"System Bus"
 
-#define ACPI_IS_ROOT_DEVICE(device)    (!(device)->parent)
-
 #define INVALID_ACPI_HANDLE	((acpi_handle)empty_zero_page)
 
 static const char *dummy_hid = "device";
@@ -1110,7 +1108,7 @@ static void acpi_device_get_busid(struct acpi_device *device)
 	 * The device's Bus ID is simply the object name.
 	 * TBD: Shouldn't this value be unique (within the ACPI namespace)?
 	 */
-	if (ACPI_IS_ROOT_DEVICE(device)) {
+	if (!acpi_dev_parent(device)) {
 		strcpy(device->pnp.bus_id, "ACPI");
 		return;
 	}
@@ -1646,7 +1644,7 @@ static void acpi_init_coherency(struct acpi_device *adev)
 {
 	unsigned long long cca = 0;
 	acpi_status status;
-	struct acpi_device *parent = adev->parent;
+	struct acpi_device *parent = acpi_dev_parent(adev);
 
 	if (parent && parent->flags.cca_seen) {
 		/*
@@ -1690,7 +1688,7 @@ static int acpi_check_serial_bus_slave(struct acpi_resource *ares, void *data)
 
 static bool acpi_is_indirect_io_slave(struct acpi_device *device)
 {
-	struct acpi_device *parent = device->parent;
+	struct acpi_device *parent = acpi_dev_parent(device);
 	static const struct acpi_device_id indirect_io_hosts[] = {
 		{"HISI0191", 0},
 		{}
@@ -1767,10 +1765,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 	INIT_LIST_HEAD(&device->pnp.ids);
 	device->device_type = type;
 	device->handle = handle;
-	if (parent) {
-		device->parent = parent;
-		device->dev.parent = &parent->dev;
-	}
+	device->dev.parent = parent ? &parent->dev : NULL;
 	device->dev.release = release;
 	device->dev.bus = &acpi_bus_type;
 	fwnode_init(&device->fwnode, &acpi_device_fwnode_ops);
@@ -1867,8 +1862,8 @@ static int acpi_add_single_object(struct acpi_device **child,
 	acpi_device_add_finalize(device);
 
 	acpi_handle_debug(handle, "Added as %s, parent %s\n",
-			  dev_name(&device->dev), device->parent ?
-				dev_name(&device->parent->dev) : "(null)");
+			  dev_name(&device->dev), device->dev.parent ?
+				dev_name(device->dev.parent) : "(null)");
 
 	*child = device;
 	return 0;
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 23c680d1a0f5..c905d40964cb 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2427,7 +2427,8 @@ static int vmbus_acpi_add(struct acpi_device *device)
 	 * Some ancestor of the vmbus acpi device (Gen1 or Gen2
 	 * firmware) is the VMOD that has the mmio ranges. Get that.
 	 */
-	for (ancestor = device->parent; ancestor; ancestor = ancestor->parent) {
+	for (ancestor = acpi_dev_parent(device); ancestor;
+	     ancestor = acpi_dev_parent(ancestor)) {
 		result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS,
 					     vmbus_walk_resources, NULL);
 
diff --git a/drivers/perf/arm_dsu_pmu.c b/drivers/perf/arm_dsu_pmu.c
index a36698a90d2f..4a15c86f45ef 100644
--- a/drivers/perf/arm_dsu_pmu.c
+++ b/drivers/perf/arm_dsu_pmu.c
@@ -639,6 +639,7 @@ static int dsu_pmu_dt_get_cpus(struct device *dev, cpumask_t *mask)
 static int dsu_pmu_acpi_get_cpus(struct device *dev, cpumask_t *mask)
 {
 #ifdef CONFIG_ACPI
+	struct acpi_device *parent_adev = acpi_dev_parent(ACPI_COMPANION(dev));
 	int cpu;
 
 	/*
@@ -653,8 +654,7 @@ static int dsu_pmu_acpi_get_cpus(struct device *dev, cpumask_t *mask)
 			continue;
 
 		acpi_dev = ACPI_COMPANION(cpu_dev);
-		if (acpi_dev &&
-			acpi_dev->parent == ACPI_COMPANION(dev)->parent)
+		if (acpi_dev && acpi_dev_parent(acpi_dev) == parent_adev)
 			cpumask_set_cpu(cpu, mask);
 	}
 #endif
diff --git a/drivers/perf/qcom_l3_pmu.c b/drivers/perf/qcom_l3_pmu.c
index 1ff2ff6582bf..346311a05460 100644
--- a/drivers/perf/qcom_l3_pmu.c
+++ b/drivers/perf/qcom_l3_pmu.c
@@ -742,7 +742,8 @@ static int qcom_l3_cache_pmu_probe(struct platform_device *pdev)
 
 	l3pmu = devm_kzalloc(&pdev->dev, sizeof(*l3pmu), GFP_KERNEL);
 	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "l3cache_%s_%s",
-		      acpi_dev->parent->pnp.unique_id, acpi_dev->pnp.unique_id);
+		      acpi_dev_parent(acpi_dev)->pnp.unique_id,
+		      acpi_dev->pnp.unique_id);
 	if (!l3pmu || !name)
 		return -ENOMEM;
 
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 83da8862b8f2..58302259a5e3 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -4375,7 +4375,7 @@ static int acpi_spi_notify(struct notifier_block *nb, unsigned long value,
 
 	switch (value) {
 	case ACPI_RECONFIG_DEVICE_ADD:
-		ctlr = acpi_spi_find_controller_by_adev(adev->parent);
+		ctlr = acpi_spi_find_controller_by_adev(acpi_dev_parent(adev));
 		if (!ctlr)
 			break;
 
diff --git a/drivers/thunderbolt/acpi.c b/drivers/thunderbolt/acpi.c
index b1f0dc8df47c..7a8adf5ad5a0 100644
--- a/drivers/thunderbolt/acpi.c
+++ b/drivers/thunderbolt/acpi.c
@@ -42,7 +42,7 @@ static acpi_status tb_acpi_add_link(acpi_handle handle, u32 level, void *data,
 	 */
 	dev = acpi_get_first_physical_node(adev);
 	while (!dev) {
-		adev = adev->parent;
+		adev = acpi_dev_parent(adev);
 		if (!adev)
 			break;
 		dev = acpi_get_first_physical_node(adev);
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 7ff067a5a3bd..6289020fdab8 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -365,7 +365,6 @@ struct acpi_device {
 	int device_type;
 	acpi_handle handle;		/* no handle for fixed hardware */
 	struct fwnode_handle fwnode;
-	struct acpi_device *parent;
 	struct list_head wakeup_list;
 	struct list_head del_list;
 	struct acpi_device_status status;
@@ -458,6 +457,14 @@ static inline void *acpi_driver_data(struct acpi_device *d)
 #define to_acpi_device(d)	container_of(d, struct acpi_device, dev)
 #define to_acpi_driver(d)	container_of(d, struct acpi_driver, drv)
 
+static inline struct acpi_device *acpi_dev_parent(struct acpi_device *adev)
+{
+	if (adev->dev.parent)
+		return to_acpi_device(adev->dev.parent);
+
+	return NULL;
+}
+
 static inline void acpi_set_device_status(struct acpi_device *adev, u32 sta)
 {
 	*((u32 *)&adev->status) = sta;
@@ -478,6 +485,7 @@ void acpi_initialize_hp_context(struct acpi_device *adev,
 /* acpi_device.dev.bus == &acpi_bus_type */
 extern struct bus_type acpi_bus_type;
 
+struct acpi_device *acpi_dev_parent(struct acpi_device *adev);
 int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data);
 int acpi_dev_for_each_child(struct acpi_device *adev,
 			    int (*fn)(struct acpi_device *, void *), void *data);
-- 
cgit v1.2.3


From 7a14a11f93dfb3ef99c06d6adeb53d0759be571f Mon Sep 17 00:00:00 2001
From: Josh Hant <joshuahant@gmail.com>
Date: Fri, 15 Jul 2022 20:53:48 -0400
Subject: EDAC/ie31200: Add Skylake-S support

Add device IDs for Skylake-S CPUs according to datasheet.

Signed-off-by: Josh Hant <joshuahant@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Jason Baron <jbaron@akamai.com>
Link: https://lore.kernel.org/r/20220712102121.20812-1-joshuahant@gmail.com
---
 drivers/edac/ie31200_edac.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index 9a9ff5ad611a..9ef13570f2e5 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -20,11 +20,15 @@
  * 0c08: Xeon E3-1200 v3 Processor DRAM Controller
  * 1918: Xeon E3-1200 v5 Skylake Host Bridge/DRAM Registers
  * 5918: Xeon E3-1200 Xeon E3-1200 v6/7th Gen Core Processor Host Bridge/DRAM Registers
+ * 190f: 6th Gen Core Dual-Core Processor Host Bridge/DRAM Registers
+ * 191f: 6th Gen Core Quad-Core Processor Host Bridge/DRAM Registers
  * 3e..: 8th/9th Gen Core Processor Host Bridge/DRAM Registers
  *
  * Based on Intel specification:
  * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e3-1200v3-vol-2-datasheet.pdf
  * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e3-1200-family-vol-2-datasheet.html
+ * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/desktop-6th-gen-core-family-datasheet-vol-2.pdf
+ * https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e3-1200v6-vol-2-datasheet.pdf
  * https://www.intel.com/content/www/us/en/processors/core/7th-gen-core-family-mobile-h-processor-lines-datasheet-vol-2.html
  * https://www.intel.com/content/www/us/en/products/docs/processors/core/8th-gen-core-family-datasheet-vol-2.html
  *
@@ -53,15 +57,17 @@
 #define ie31200_printk(level, fmt, arg...) \
 	edac_printk(level, "ie31200", fmt, ##arg)
 
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_1 0x0108
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_2 0x010c
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_3 0x0150
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_4 0x0158
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_5 0x015c
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_6 0x0c04
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_7 0x0c08
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_8 0x1918
-#define PCI_DEVICE_ID_INTEL_IE31200_HB_9 0x5918
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_1  0x0108
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_2  0x010c
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_3  0x0150
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_4  0x0158
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_5  0x015c
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_6  0x0c04
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_7  0x0c08
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_8  0x190F
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_9  0x1918
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_10 0x191F
+#define PCI_DEVICE_ID_INTEL_IE31200_HB_11 0x5918
 
 /* Coffee Lake-S */
 #define PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK 0x3e00
@@ -80,6 +86,8 @@
 #define DEVICE_ID_SKYLAKE_OR_LATER(did)                                        \
 	(((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_8) ||                        \
 	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_9) ||                        \
+	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_10) ||                       \
+	 ((did) == PCI_DEVICE_ID_INTEL_IE31200_HB_11) ||                       \
 	 (((did) & PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK) ==                 \
 	  PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_MASK))
 
@@ -577,6 +585,8 @@ static const struct pci_device_id ie31200_pci_tbl[] = {
 	{ PCI_VEND_DEV(INTEL, IE31200_HB_7),      PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
 	{ PCI_VEND_DEV(INTEL, IE31200_HB_8),      PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
 	{ PCI_VEND_DEV(INTEL, IE31200_HB_9),      PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
+	{ PCI_VEND_DEV(INTEL, IE31200_HB_10),     PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
+	{ PCI_VEND_DEV(INTEL, IE31200_HB_11),     PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
 	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_1),  PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
 	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_2),  PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
 	{ PCI_VEND_DEV(INTEL, IE31200_HB_CFL_3),  PCI_ANY_ID, PCI_ANY_ID, 0, 0, IE31200 },
-- 
cgit v1.2.3


From fb4b9685779f25ff063358623f3da4f3344be9bb Mon Sep 17 00:00:00 2001
From: ran jianping <ran.jianping@zte.com.cn>
Date: Sun, 24 Apr 2022 06:21:26 +0000
Subject: EDAC/wq: Remove unneeded flush_workqueue()

destroy_workqueue() already takes care of flushing the workqueue so
there is no need to flush it explicitly.

  [ bp: Massage commit message. ]

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ran jianping <ran.jianping@zte.com.cn>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220424062127.3219542-1-ran.jianping@zte.com.cn
---
 drivers/edac/wq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/edac/wq.c b/drivers/edac/wq.c
index d021d287eaec..ad3f516627c5 100644
--- a/drivers/edac/wq.c
+++ b/drivers/edac/wq.c
@@ -37,7 +37,6 @@ int edac_workqueue_setup(void)
 
 void edac_workqueue_teardown(void)
 {
-	flush_workqueue(wq);
 	destroy_workqueue(wq);
 	wq = NULL;
 }
-- 
cgit v1.2.3


From a2a9d1850060e5d995136561d76e81d61414f076 Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Mon, 15 Aug 2022 00:35:48 +0800
Subject: ACPI: CPPC: Add ACPI disabled check to acpi_cpc_valid()

Make acpi_cpc_valid() check if ACPI is disabled, so that its callers
don't need to check that separately.  This will also cause the AMD
pstate driver to refuse to load right away when ACPI is disabled.

Also update the warning message in amd_pstate_init() to mention the
ACPI disabled case for completeness.

Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
[ rjw: Subject edits, new changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c       | 3 +++
 drivers/base/arch_topology.c   | 2 +-
 drivers/cpufreq/amd-pstate.c   | 2 +-
 drivers/cpufreq/cppc_cpufreq.c | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index 1e15a9f25ae9..c2309429146f 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -424,6 +424,9 @@ bool acpi_cpc_valid(void)
 	struct cpc_desc *cpc_ptr;
 	int cpu;
 
+	if (acpi_disabled)
+		return false;
+
 	for_each_present_cpu(cpu) {
 		cpc_ptr = per_cpu(cpc_desc_ptr, cpu);
 		if (!cpc_ptr)
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 0424b59b695e..a9ff81d3a889 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -353,7 +353,7 @@ void topology_init_cpu_capacity_cppc(void)
 	struct cppc_perf_caps perf_caps;
 	int cpu;
 
-	if (likely(acpi_disabled || !acpi_cpc_valid()))
+	if (likely(!acpi_cpc_valid()))
 		return;
 
 	raw_capacity = kcalloc(num_possible_cpus(), sizeof(*raw_capacity),
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 9ac75c1cde9c..a8e386d67a18 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -673,7 +673,7 @@ static int __init amd_pstate_init(void)
 		return -ENODEV;
 
 	if (!acpi_cpc_valid()) {
-		pr_debug("the _CPC object is not present in SBIOS\n");
+		pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
 		return -ENODEV;
 	}
 
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 24eaf0ec344d..9adb7612993e 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -947,7 +947,7 @@ static int __init cppc_cpufreq_init(void)
 {
 	int ret;
 
-	if ((acpi_disabled) || !acpi_cpc_valid())
+	if (!acpi_cpc_valid())
 		return -ENODEV;
 
 	cppc_check_hisi_workaround();
-- 
cgit v1.2.3


From 24867516f06dabedef3be7eea0ef0846b91538bc Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 23 Aug 2022 13:51:31 -0500
Subject: ACPI: OSI: Remove Linux-Dell-Video _OSI string

This string was introduced because drivers for NVIDIA hardware
had bugs supporting RTD3 in the past.

Before proprietary NVIDIA driver started to support RTD3, Ubuntu had
had a mechanism for switching PRIME on and off, though it had required
to logout/login to make the library switch happen.

When the PRIME had been off, the mechanism had unloaded the NVIDIA
driver and put the device into D3cold, but the GPU had never come back
to D0 again which is why ODMs used the _OSI to expose an old _DSM
method to switch the power on/off.

That has been fixed by commit 5775b843a619 ("PCI: Restore config space
on runtime resume despite being unbound"). so vendors shouldn't be
using this string to modify ASL any more.

Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/osi.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/acpi/osi.c b/drivers/acpi/osi.c
index 9f6853809138..c2f6b2f553d9 100644
--- a/drivers/acpi/osi.c
+++ b/drivers/acpi/osi.c
@@ -44,15 +44,6 @@ osi_setup_entries[OSI_STRING_ENTRIES_MAX] __initdata = {
 	{"Processor Device", true},
 	{"3.0 _SCP Extensions", true},
 	{"Processor Aggregator Device", true},
-	/*
-	 * Linux-Dell-Video is used by BIOS to disable RTD3 for NVidia graphics
-	 * cards as RTD3 is not supported by drivers now.  Systems with NVidia
-	 * cards will hang without RTD3 disabled.
-	 *
-	 * Once NVidia drivers officially support RTD3, this _OSI strings can
-	 * be removed if both new and old graphics cards are supported.
-	 */
-	{"Linux-Dell-Video", true},
 	/*
 	 * Linux-Lenovo-NV-HDMI-Audio is used by BIOS to power on NVidia's HDMI
 	 * audio device which is turned off for power-saving in Windows OS.
-- 
cgit v1.2.3


From 7c3ad60b3f33d145b75b707c629748e06d7a652d Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 23 Aug 2022 13:51:32 -0500
Subject: ACPI: OSI: Remove Linux-Lenovo-NV-HDMI-Audio _OSI string

This string was introduced because drivers for NVIDIA hardware
didn't handle HDMI properly.

This was fixed by commit b516ea586d71 ("PCI: Enable NVIDIA HDA
controllers"), so vendors shouldn't be using this string to modify
ASL any more.

Suggested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/osi.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/acpi/osi.c b/drivers/acpi/osi.c
index c2f6b2f553d9..0d635eec87c4 100644
--- a/drivers/acpi/osi.c
+++ b/drivers/acpi/osi.c
@@ -44,14 +44,6 @@ osi_setup_entries[OSI_STRING_ENTRIES_MAX] __initdata = {
 	{"Processor Device", true},
 	{"3.0 _SCP Extensions", true},
 	{"Processor Aggregator Device", true},
-	/*
-	 * Linux-Lenovo-NV-HDMI-Audio is used by BIOS to power on NVidia's HDMI
-	 * audio device which is turned off for power-saving in Windows OS.
-	 * This power management feature observed on some Lenovo Thinkpad
-	 * systems which will not be able to output audio via HDMI without
-	 * a BIOS workaround.
-	 */
-	{"Linux-Lenovo-NV-HDMI-Audio", true},
 	/*
 	 * Linux-HPI-Hybrid-Graphics is used by BIOS to enable dGPU to
 	 * output video directly to external monitors on HP Inc. mobile
-- 
cgit v1.2.3


From e54049d481a9b25f5ae292f671e6aecb6d79f532 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 23 Aug 2022 13:51:33 -0500
Subject: ACPI: OSI: Remove Linux-HPI-Hybrid-Graphics _OSI string

This string was introduced because drivers for NVIDIA hardware
didn't handle mux control properly.

This was fixed by commit 8e55f99c510f ("drm/i915: Invoke another _DSM
to enable MUX on HP Workstation laptops"), so vendors shouldn't be
using this string to modify ASL any more.

Suggested-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/osi.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/acpi/osi.c b/drivers/acpi/osi.c
index 0d635eec87c4..d4405e1ca9b9 100644
--- a/drivers/acpi/osi.c
+++ b/drivers/acpi/osi.c
@@ -44,13 +44,6 @@ osi_setup_entries[OSI_STRING_ENTRIES_MAX] __initdata = {
 	{"Processor Device", true},
 	{"3.0 _SCP Extensions", true},
 	{"Processor Aggregator Device", true},
-	/*
-	 * Linux-HPI-Hybrid-Graphics is used by BIOS to enable dGPU to
-	 * output video directly to external monitors on HP Inc. mobile
-	 * workstations as Nvidia and AMD VGA drivers provide limited
-	 * hybrid graphics supports.
-	 */
-	{"Linux-HPI-Hybrid-Graphics", true},
 };
 
 static u32 acpi_osi_handler(acpi_string interface, u32 supported)
-- 
cgit v1.2.3


From 2f190ac250b32c201bc0046d9c2a6f7acdc5c362 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 23 Aug 2022 13:51:34 -0500
Subject: ACPI: OSI: Update Documentation on custom _OSI strings

Originally, when there was some custom ASL needed for Linux, the
guidance for Linux-friendly OEMs was to introduced a new string in the
APCI tables and discuss it on the kernel mailing lists.

This lead to 3 strings being introduced.  Those strings now have the
original problems fixed in the kernel and have since been removed.
The kernel no longer advertises any custom Linux-* _OSI strings for
OEM use.

It was discovered that these removed strings were actually being abused
to detect Linux for unrelated reasons, so drop the encouragement for
new ones to be added.

Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/firmware-guide/acpi/osi.rst | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/Documentation/firmware-guide/acpi/osi.rst b/Documentation/firmware-guide/acpi/osi.rst
index 05869c0045d7..784850adfcb6 100644
--- a/Documentation/firmware-guide/acpi/osi.rst
+++ b/Documentation/firmware-guide/acpi/osi.rst
@@ -41,26 +41,23 @@ But it is likely that they will all eventually be added.
 What should an OEM do if they want to support Linux and Windows
 using the same BIOS image?  Often they need to do something different
 for Linux to deal with how Linux is different from Windows.
-Here the BIOS should ask exactly what it wants to know:
 
+In this case, the OEM should create custom ASL to be executed by the
+Linux kernel and changes to Linux kernel drivers to execute this custom
+ASL.  The easiest way to accomplish this is to introduce a device specific
+method (_DSM) that is called from the Linux kernel.
+
+In the past the kernel used to support something like:
 _OSI("Linux-OEM-my_interface_name")
 where 'OEM' is needed if this is an OEM-specific hook,
 and 'my_interface_name' describes the hook, which could be a
 quirk, a bug, or a bug-fix.
 
-In addition, the OEM should send a patch to upstream Linux
-via the linux-acpi@vger.kernel.org mailing list.  When that patch
-is checked into Linux, the OS will answer "YES" when the BIOS
-on the OEM's system uses _OSI to ask if the interface is supported
-by the OS.  Linux distributors can back-port that patch for Linux
-pre-installs, and it will be included by all distributions that
-re-base to upstream.  If the distribution can not update the kernel binary,
-they can also add an acpi_osi=Linux-OEM-my_interface_name
-cmdline parameter to the boot loader, as needed.
-
-If the string refers to a feature where the upstream kernel
-eventually grows support, a patch should be sent to remove
-the string when that support is added to the kernel.
+However this was discovered to be abused by other BIOS vendors to change
+completely unrelated code on completely unrelated systems.  This prompted
+an evaluation of all of it's uses. This uncovered that they aren't needed
+for any of the original reasons. As such, the kernel will not respond to
+any custom Linux-* strings by default.
 
 That was easy.  Read on, to find out how to do it wrong.
 
-- 
cgit v1.2.3


From 4b76dfbc44582a6cdf62eadb147f3a88f28de3dd Mon Sep 17 00:00:00 2001
From: ye xingchen <ye.xingchen@zte.com.cn>
Date: Wed, 24 Aug 2022 07:53:06 +0000
Subject: ACPI: bus: Remove the unneeded result variable

Return the value from driver_register() directly instead of storing it
in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index c0d20d997891..661a63ea3248 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -973,16 +973,13 @@ EXPORT_SYMBOL_GPL(acpi_driver_match_device);
  */
 int acpi_bus_register_driver(struct acpi_driver *driver)
 {
-	int ret;
-
 	if (acpi_disabled)
 		return -ENODEV;
 	driver->drv.name = driver->name;
 	driver->drv.bus = &acpi_bus_type;
 	driver->drv.owner = driver->owner;
 
-	ret = driver_register(&driver->drv);
-	return ret;
+	return driver_register(&driver->drv);
 }
 
 EXPORT_SYMBOL(acpi_bus_register_driver);
-- 
cgit v1.2.3


From 7ac7267fad5908476b357e7e9813d23516c2b0a1 Mon Sep 17 00:00:00 2001
From: Fae <faenkhauser@gmail.com>
Date: Sun, 24 Jul 2022 13:25:02 -0500
Subject: Bluetooth: Add VID/PID 0489/e0e0 for MediaTek MT7921

Tested on HP Envy ey0xxx

output from /sys/kernel/debug/usb/devices:

T:  Bus=01 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#=  2 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0489 ProdID=e0e0 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us

Signed-off-by: Fae <faenkhauser@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 15caa6469538..30dd443f395f 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -466,6 +466,9 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x0489, 0xe0c8), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
+	{ USB_DEVICE(0x0489, 0xe0e0), .driver_info = BTUSB_MEDIATEK |
+						     BTUSB_WIDEBAND_SPEECH |
+						     BTUSB_VALID_LE_STATES },
 	{ USB_DEVICE(0x04ca, 0x3802), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
-- 
cgit v1.2.3


From 8ffde2a73f2cd2906a1bff2d315ad32154c425a3 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:28 -0700
Subject: Bluetooth: Convert le_scan_disable timeout to hci_sync

The le_scan_disable timeout was being performed on the deprecated
hci_request.c mechanism.  This timeout is performed in hci_sync.c

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_request.c | 98 +--------------------------------------------
 net/bluetooth/hci_sync.c    | 73 +++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 97 deletions(-)

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e64d558e5d69..32fefaa0d3ca 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -269,6 +269,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
 void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
 		 const void *param)
 {
+	bt_dev_dbg(req->hdev, "HCI_REQ-0x%4.4x", opcode);
 	hci_req_add_ev(req, opcode, plen, param, 0);
 }
 
@@ -1974,101 +1975,6 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 	return 0;
 }
 
-static int le_scan_disable(struct hci_request *req, unsigned long opt)
-{
-	hci_req_add_le_scan_disable(req, false);
-	return 0;
-}
-
-static int bredr_inquiry(struct hci_request *req, unsigned long opt)
-{
-	u8 length = opt;
-	const u8 giac[3] = { 0x33, 0x8b, 0x9e };
-	const u8 liac[3] = { 0x00, 0x8b, 0x9e };
-	struct hci_cp_inquiry cp;
-
-	if (test_bit(HCI_INQUIRY, &req->hdev->flags))
-		return 0;
-
-	bt_dev_dbg(req->hdev, "");
-
-	hci_dev_lock(req->hdev);
-	hci_inquiry_cache_flush(req->hdev);
-	hci_dev_unlock(req->hdev);
-
-	memset(&cp, 0, sizeof(cp));
-
-	if (req->hdev->discovery.limited)
-		memcpy(&cp.lap, liac, sizeof(cp.lap));
-	else
-		memcpy(&cp.lap, giac, sizeof(cp.lap));
-
-	cp.length = length;
-
-	hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
-
-	return 0;
-}
-
-static void le_scan_disable_work(struct work_struct *work)
-{
-	struct hci_dev *hdev = container_of(work, struct hci_dev,
-					    le_scan_disable.work);
-	u8 status;
-
-	bt_dev_dbg(hdev, "");
-
-	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
-		return;
-
-	cancel_delayed_work(&hdev->le_scan_restart);
-
-	hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status);
-	if (status) {
-		bt_dev_err(hdev, "failed to disable LE scan: status 0x%02x",
-			   status);
-		return;
-	}
-
-	hdev->discovery.scan_start = 0;
-
-	/* If we were running LE only scan, change discovery state. If
-	 * we were running both LE and BR/EDR inquiry simultaneously,
-	 * and BR/EDR inquiry is already finished, stop discovery,
-	 * otherwise BR/EDR inquiry will stop discovery when finished.
-	 * If we will resolve remote device name, do not change
-	 * discovery state.
-	 */
-
-	if (hdev->discovery.type == DISCOV_TYPE_LE)
-		goto discov_stopped;
-
-	if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
-		return;
-
-	if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) {
-		if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
-		    hdev->discovery.state != DISCOVERY_RESOLVING)
-			goto discov_stopped;
-
-		return;
-	}
-
-	hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN,
-		     HCI_CMD_TIMEOUT, &status);
-	if (status) {
-		bt_dev_err(hdev, "inquiry failed: status 0x%02x", status);
-		goto discov_stopped;
-	}
-
-	return;
-
-discov_stopped:
-	hci_dev_lock(hdev);
-	hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
-	hci_dev_unlock(hdev);
-}
-
 static int le_scan_restart(struct hci_request *req, unsigned long opt)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -2252,7 +2158,6 @@ error:
 
 void hci_request_setup(struct hci_dev *hdev)
 {
-	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
 	INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
 	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
 	INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
@@ -2262,7 +2167,6 @@ void hci_request_cancel_all(struct hci_dev *hdev)
 {
 	__hci_cmd_sync_cancel(hdev, ENODEV);
 
-	cancel_delayed_work_sync(&hdev->le_scan_disable);
 	cancel_delayed_work_sync(&hdev->le_scan_restart);
 
 	if (hdev->adv_instance_timeout) {
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e6d804b82b67..e22837510ab1 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -321,6 +321,77 @@ static void hci_cmd_sync_cancel_work(struct work_struct *work)
 	wake_up_interruptible(&hdev->req_wait_q);
 }
 
+static int hci_scan_disable_sync(struct hci_dev *hdev);
+static int scan_disable_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_scan_disable_sync(hdev);
+}
+
+static int hci_inquiry_sync(struct hci_dev *hdev, u8 length);
+static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN);
+}
+
+static void le_scan_disable(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    le_scan_disable.work);
+	int status;
+
+	bt_dev_dbg(hdev, "");
+	hci_dev_lock(hdev);
+
+	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+		goto _return;
+
+	cancel_delayed_work(&hdev->le_scan_restart);
+
+	status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL);
+	if (status) {
+		bt_dev_err(hdev, "failed to disable LE scan: %d", status);
+		goto _return;
+	}
+
+	hdev->discovery.scan_start = 0;
+
+	/* If we were running LE only scan, change discovery state. If
+	 * we were running both LE and BR/EDR inquiry simultaneously,
+	 * and BR/EDR inquiry is already finished, stop discovery,
+	 * otherwise BR/EDR inquiry will stop discovery when finished.
+	 * If we will resolve remote device name, do not change
+	 * discovery state.
+	 */
+
+	if (hdev->discovery.type == DISCOV_TYPE_LE)
+		goto discov_stopped;
+
+	if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
+		goto _return;
+
+	if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) {
+		if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+		    hdev->discovery.state != DISCOVERY_RESOLVING)
+			goto discov_stopped;
+
+		goto _return;
+	}
+
+	status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL);
+	if (status) {
+		bt_dev_err(hdev, "inquiry failed: status %d", status);
+		goto discov_stopped;
+	}
+
+	goto _return;
+
+discov_stopped:
+	hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+_return:
+	hci_dev_unlock(hdev);
+}
+
 void hci_cmd_sync_init(struct hci_dev *hdev)
 {
 	INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work);
@@ -328,6 +399,7 @@ void hci_cmd_sync_init(struct hci_dev *hdev)
 	mutex_init(&hdev->cmd_sync_work_lock);
 
 	INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work);
+	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable);
 }
 
 void hci_cmd_sync_clear(struct hci_dev *hdev)
@@ -4415,6 +4487,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 
 	cancel_delayed_work(&hdev->power_off);
 	cancel_delayed_work(&hdev->ncmd_timer);
+	cancel_delayed_work(&hdev->le_scan_disable);
 
 	hci_request_cancel_all(hdev);
 
-- 
cgit v1.2.3


From 27d54b778ad1fb32c2c108cfe97e861c3909a46f Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:29 -0700
Subject: Bluetooth: Rework le_scan_restart for hci_sync

le_scan_restart delayed work queue was running as a deprecated
hci_request instead of on the newer thread-safe hci_sync mechanism.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_request.c | 89 ---------------------------------------------
 net/bluetooth/hci_sync.c    | 75 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 89 deletions(-)

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 32fefaa0d3ca..114af7350363 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1975,92 +1975,6 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 	return 0;
 }
 
-static int le_scan_restart(struct hci_request *req, unsigned long opt)
-{
-	struct hci_dev *hdev = req->hdev;
-
-	/* If controller is not scanning we are done. */
-	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
-		return 0;
-
-	if (hdev->scanning_paused) {
-		bt_dev_dbg(hdev, "Scanning is paused for suspend");
-		return 0;
-	}
-
-	hci_req_add_le_scan_disable(req, false);
-
-	if (use_ext_scan(hdev)) {
-		struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
-
-		memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
-		ext_enable_cp.enable = LE_SCAN_ENABLE;
-		ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-
-		hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
-			    sizeof(ext_enable_cp), &ext_enable_cp);
-	} else {
-		struct hci_cp_le_set_scan_enable cp;
-
-		memset(&cp, 0, sizeof(cp));
-		cp.enable = LE_SCAN_ENABLE;
-		cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-		hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
-	}
-
-	return 0;
-}
-
-static void le_scan_restart_work(struct work_struct *work)
-{
-	struct hci_dev *hdev = container_of(work, struct hci_dev,
-					    le_scan_restart.work);
-	unsigned long timeout, duration, scan_start, now;
-	u8 status;
-
-	bt_dev_dbg(hdev, "");
-
-	hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status);
-	if (status) {
-		bt_dev_err(hdev, "failed to restart LE scan: status %d",
-			   status);
-		return;
-	}
-
-	hci_dev_lock(hdev);
-
-	if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) ||
-	    !hdev->discovery.scan_start)
-		goto unlock;
-
-	/* When the scan was started, hdev->le_scan_disable has been queued
-	 * after duration from scan_start. During scan restart this job
-	 * has been canceled, and we need to queue it again after proper
-	 * timeout, to make sure that scan does not run indefinitely.
-	 */
-	duration = hdev->discovery.scan_duration;
-	scan_start = hdev->discovery.scan_start;
-	now = jiffies;
-	if (now - scan_start <= duration) {
-		int elapsed;
-
-		if (now >= scan_start)
-			elapsed = now - scan_start;
-		else
-			elapsed = ULONG_MAX - scan_start + now;
-
-		timeout = duration - elapsed;
-	} else {
-		timeout = 0;
-	}
-
-	queue_delayed_work(hdev->req_workqueue,
-			   &hdev->le_scan_disable, timeout);
-
-unlock:
-	hci_dev_unlock(hdev);
-}
-
 bool hci_req_stop_discovery(struct hci_request *req)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -2158,7 +2072,6 @@ error:
 
 void hci_request_setup(struct hci_dev *hdev)
 {
-	INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
 	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
 	INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
 }
@@ -2167,8 +2080,6 @@ void hci_request_cancel_all(struct hci_dev *hdev)
 {
 	__hci_cmd_sync_cancel(hdev, ENODEV);
 
-	cancel_delayed_work_sync(&hdev->le_scan_restart);
-
 	if (hdev->adv_instance_timeout) {
 		cancel_delayed_work_sync(&hdev->adv_instance_expire);
 		hdev->adv_instance_timeout = 0;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e22837510ab1..75e7c0a01ab1 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -392,6 +392,79 @@ _return:
 	hci_dev_unlock(hdev);
 }
 
+static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
+				       u8 filter_dup);
+static int hci_le_scan_restart_sync(struct hci_dev *hdev)
+{
+	/* If controller is not scanning we are done. */
+	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+		return 0;
+
+	if (hdev->scanning_paused) {
+		bt_dev_dbg(hdev, "Scanning is paused for suspend");
+		return 0;
+	}
+
+	hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00);
+	return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE,
+					   LE_SCAN_FILTER_DUP_ENABLE);
+}
+
+static int le_scan_restart_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_le_scan_restart_sync(hdev);
+}
+
+static void le_scan_restart(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    le_scan_restart.work);
+	unsigned long timeout, duration, scan_start, now;
+	int status;
+
+	bt_dev_dbg(hdev, "");
+
+	hci_dev_lock(hdev);
+
+	status = hci_cmd_sync_queue(hdev, le_scan_restart_sync, NULL, NULL);
+	if (status) {
+		bt_dev_err(hdev, "failed to restart LE scan: status %d",
+			   status);
+		goto unlock;
+	}
+
+	if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) ||
+	    !hdev->discovery.scan_start)
+		goto unlock;
+
+	/* When the scan was started, hdev->le_scan_disable has been queued
+	 * after duration from scan_start. During scan restart this job
+	 * has been canceled, and we need to queue it again after proper
+	 * timeout, to make sure that scan does not run indefinitely.
+	 */
+	duration = hdev->discovery.scan_duration;
+	scan_start = hdev->discovery.scan_start;
+	now = jiffies;
+	if (now - scan_start <= duration) {
+		int elapsed;
+
+		if (now >= scan_start)
+			elapsed = now - scan_start;
+		else
+			elapsed = ULONG_MAX - scan_start + now;
+
+		timeout = duration - elapsed;
+	} else {
+		timeout = 0;
+	}
+
+	queue_delayed_work(hdev->req_workqueue,
+			   &hdev->le_scan_disable, timeout);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
 void hci_cmd_sync_init(struct hci_dev *hdev)
 {
 	INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work);
@@ -400,6 +473,7 @@ void hci_cmd_sync_init(struct hci_dev *hdev)
 
 	INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work);
 	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable);
+	INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart);
 }
 
 void hci_cmd_sync_clear(struct hci_dev *hdev)
@@ -4488,6 +4562,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 	cancel_delayed_work(&hdev->power_off);
 	cancel_delayed_work(&hdev->ncmd_timer);
 	cancel_delayed_work(&hdev->le_scan_disable);
+	cancel_delayed_work(&hdev->le_scan_restart);
 
 	hci_request_cancel_all(hdev);
 
-- 
cgit v1.2.3


From 9e63767dd58a388ca1c000058f0bf84abf154b48 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:30 -0700
Subject: Bluetooth: Delete unused hci_req_stop_discovery()

hci_req_stop_discovery has been deprecated in favor of
hci_stop_discovery_sync() as part of transition to hci_sync.c

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_request.c | 48 ---------------------------------------------
 net/bluetooth/hci_request.h |  2 --
 2 files changed, 50 deletions(-)

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 114af7350363..ef0a5ec067b6 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1975,54 +1975,6 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 	return 0;
 }
 
-bool hci_req_stop_discovery(struct hci_request *req)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct discovery_state *d = &hdev->discovery;
-	struct hci_cp_remote_name_req_cancel cp;
-	struct inquiry_entry *e;
-	bool ret = false;
-
-	bt_dev_dbg(hdev, "state %u", hdev->discovery.state);
-
-	if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) {
-		if (test_bit(HCI_INQUIRY, &hdev->flags))
-			hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL);
-
-		if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
-			cancel_delayed_work(&hdev->le_scan_disable);
-			cancel_delayed_work(&hdev->le_scan_restart);
-			hci_req_add_le_scan_disable(req, false);
-		}
-
-		ret = true;
-	} else {
-		/* Passive scanning */
-		if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
-			hci_req_add_le_scan_disable(req, false);
-			ret = true;
-		}
-	}
-
-	/* No further actions needed for LE-only discovery */
-	if (d->type == DISCOV_TYPE_LE)
-		return ret;
-
-	if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) {
-		e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
-						     NAME_PENDING);
-		if (!e)
-			return ret;
-
-		bacpy(&cp.bdaddr, &e->data.bdaddr);
-		hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp),
-			    &cp);
-		ret = true;
-	}
-
-	return ret;
-}
-
 static void config_data_path_complete(struct hci_dev *hdev, u8 status,
 				      u16 opcode)
 {
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 39d001fa3acf..faf6d9a51a91 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -113,8 +113,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 void __hci_req_update_class(struct hci_request *req);
 
 /* Returns true if HCI commands were queued */
-bool hci_req_stop_discovery(struct hci_request *req);
-
 int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec);
 
 void __hci_req_update_scan(struct hci_request *req);
-- 
cgit v1.2.3


From e07a06b4eb417f5271d33ce2240e93c62d98b7b4 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:31 -0700
Subject: Bluetooth: Convert SCO configure_datapath to hci_sync

Recoding HCI cmds to offload SCO codec to use hci_sync mechanism rather
than deprecated hci_request mechanism.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_conn.c    | 86 +++++++++++++++++++++++++++++++++++++++------
 net/bluetooth/hci_request.c | 47 -------------------------
 net/bluetooth/hci_request.h |  2 --
 3 files changed, 75 insertions(+), 60 deletions(-)

diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 9777e7b109ee..337e74d0f8b1 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -44,6 +44,11 @@ struct sco_param {
 	u8  retrans_effort;
 };
 
+struct conn_handle_t {
+	struct hci_conn *conn;
+	__u16 handle;
+};
+
 static const struct sco_param esco_param_cvsd[] = {
 	{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a,	0x01 }, /* S3 */
 	{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007,	0x01 }, /* S2 */
@@ -316,17 +321,60 @@ static bool find_next_esco_param(struct hci_conn *conn,
 	return conn->attempt <= size;
 }
 
-static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle)
+static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec)
 {
-	struct hci_dev *hdev = conn->hdev;
+	int err;
+	__u8 vnd_len, *vnd_data = NULL;
+	struct hci_op_configure_data_path *cmd = NULL;
+
+	err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len,
+					  &vnd_data);
+	if (err < 0)
+		goto error;
+
+	cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL);
+	if (!cmd) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	err = hdev->get_data_path_id(hdev, &cmd->data_path_id);
+	if (err < 0)
+		goto error;
+
+	cmd->vnd_len = vnd_len;
+	memcpy(cmd->vnd_data, vnd_data, vnd_len);
+
+	cmd->direction = 0x00;
+	__hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH,
+			      sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT);
+
+	cmd->direction = 0x01;
+	err = __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH,
+				    sizeof(*cmd) + vnd_len, cmd,
+				    HCI_CMD_TIMEOUT);
+error:
+
+	kfree(cmd);
+	kfree(vnd_data);
+	return err;
+}
+
+static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data)
+{
+	struct conn_handle_t *conn_handle = data;
+	struct hci_conn *conn = conn_handle->conn;
+	__u16 handle = conn_handle->handle;
 	struct hci_cp_enhanced_setup_sync_conn cp;
 	const struct sco_param *param;
 
+	kfree(conn_handle);
+
 	bt_dev_dbg(hdev, "hcon %p", conn);
 
 	/* for offload use case, codec needs to configured before opening SCO */
 	if (conn->codec.data_path)
-		hci_req_configure_datapath(hdev, &conn->codec);
+		configure_datapath_sync(hdev, &conn->codec);
 
 	conn->state = BT_CONNECT;
 	conn->out = true;
@@ -344,7 +392,7 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 	case BT_CODEC_MSBC:
 		if (!find_next_esco_param(conn, esco_param_msbc,
 					  ARRAY_SIZE(esco_param_msbc)))
-			return false;
+			return -EINVAL;
 
 		param = &esco_param_msbc[conn->attempt - 1];
 		cp.tx_coding_format.id = 0x05;
@@ -396,11 +444,11 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 		if (lmp_esco_capable(conn->link)) {
 			if (!find_next_esco_param(conn, esco_param_cvsd,
 						  ARRAY_SIZE(esco_param_cvsd)))
-				return false;
+				return -EINVAL;
 			param = &esco_param_cvsd[conn->attempt - 1];
 		} else {
 			if (conn->attempt > ARRAY_SIZE(sco_param_cvsd))
-				return false;
+				return -EINVAL;
 			param = &sco_param_cvsd[conn->attempt - 1];
 		}
 		cp.tx_coding_format.id = 2;
@@ -423,7 +471,7 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 		cp.out_transport_unit_size = 16;
 		break;
 	default:
-		return false;
+		return -EINVAL;
 	}
 
 	cp.retrans_effort = param->retrans_effort;
@@ -431,9 +479,9 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 	cp.max_latency = __cpu_to_le16(param->max_latency);
 
 	if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0)
-		return false;
+		return -EIO;
 
-	return true;
+	return 0;
 }
 
 static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle)
@@ -490,8 +538,24 @@ static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle)
 
 bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
 {
-	if (enhanced_sync_conn_capable(conn->hdev))
-		return hci_enhanced_setup_sync_conn(conn, handle);
+	int result;
+	struct conn_handle_t *conn_handle;
+
+	if (enhanced_sync_conn_capable(conn->hdev)) {
+		conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL);
+
+		if (!conn_handle)
+			return false;
+
+		conn_handle->conn = conn;
+		conn_handle->handle = handle;
+		result = hci_cmd_sync_queue(conn->hdev, hci_enhanced_setup_sync,
+					    conn_handle, NULL);
+		if (result < 0)
+			kfree(conn_handle);
+
+		return result == 0;
+	}
 
 	return hci_setup_sync_conn(conn, handle);
 }
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index ef0a5ec067b6..d14e50951aec 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -1975,53 +1975,6 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 	return 0;
 }
 
-static void config_data_path_complete(struct hci_dev *hdev, u8 status,
-				      u16 opcode)
-{
-	bt_dev_dbg(hdev, "status %u", status);
-}
-
-int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec)
-{
-	struct hci_request req;
-	int err;
-	__u8 vnd_len, *vnd_data = NULL;
-	struct hci_op_configure_data_path *cmd = NULL;
-
-	hci_req_init(&req, hdev);
-
-	err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len,
-					  &vnd_data);
-	if (err < 0)
-		goto error;
-
-	cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL);
-	if (!cmd) {
-		err = -ENOMEM;
-		goto error;
-	}
-
-	err = hdev->get_data_path_id(hdev, &cmd->data_path_id);
-	if (err < 0)
-		goto error;
-
-	cmd->vnd_len = vnd_len;
-	memcpy(cmd->vnd_data, vnd_data, vnd_len);
-
-	cmd->direction = 0x00;
-	hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd);
-
-	cmd->direction = 0x01;
-	hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd);
-
-	err = hci_req_run(&req, config_data_path_complete);
-error:
-
-	kfree(cmd);
-	kfree(vnd_data);
-	return err;
-}
-
 void hci_request_setup(struct hci_dev *hdev)
 {
 	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index faf6d9a51a91..41e0b84f2042 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -113,8 +113,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 void __hci_req_update_class(struct hci_request *req);
 
 /* Returns true if HCI commands were queued */
-int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec);
-
 void __hci_req_update_scan(struct hci_request *req);
 
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
-- 
cgit v1.2.3


From c249ea9b4309cf3250c5bbb42a05d38d0ed9071c Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:32 -0700
Subject: Bluetooth: Move Adv Instance timer to hci_sync

The Advertising Instance expiration timer adv_instance_expire was
handled with the deprecated hci_request mechanism, rather than it's
replacement: hci_sync.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |   3 +-
 net/bluetooth/hci_request.c      | 111 ---------------------------------
 net/bluetooth/hci_request.h      |   4 --
 net/bluetooth/hci_sync.c         | 129 ++++++++++++++++++++++++++++++++++++---
 net/bluetooth/mgmt.c             |   5 +-
 5 files changed, 125 insertions(+), 127 deletions(-)

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 3843f5060c73..aea950440b9d 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -72,7 +72,8 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
 int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk,
 				u8 instance, bool force);
 int hci_disable_advertising_sync(struct hci_dev *hdev);
-
+int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk,
+				u8 instance, bool force);
 int hci_update_passive_scan_sync(struct hci_dev *hdev);
 int hci_update_passive_scan(struct hci_dev *hdev);
 int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle);
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index d14e50951aec..be32fb0f5557 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -811,14 +811,6 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
 			   addr_resolv);
 }
 
-static void cancel_adv_timeout(struct hci_dev *hdev)
-{
-	if (hdev->adv_instance_timeout) {
-		hdev->adv_instance_timeout = 0;
-		cancel_delayed_work(&hdev->adv_instance_expire);
-	}
-}
-
 static bool adv_cur_instance_is_scannable(struct hci_dev *hdev)
 {
 	return hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance);
@@ -1140,37 +1132,6 @@ void hci_req_reenable_advertising(struct hci_dev *hdev)
 	hci_req_run(&req, adv_enable_complete);
 }
 
-static void adv_timeout_expire(struct work_struct *work)
-{
-	struct hci_dev *hdev = container_of(work, struct hci_dev,
-					    adv_instance_expire.work);
-
-	struct hci_request req;
-	u8 instance;
-
-	bt_dev_dbg(hdev, "");
-
-	hci_dev_lock(hdev);
-
-	hdev->adv_instance_timeout = 0;
-
-	instance = hdev->cur_adv_instance;
-	if (instance == 0x00)
-		goto unlock;
-
-	hci_req_init(&req, hdev);
-
-	hci_req_clear_adv_instance(hdev, NULL, &req, instance, false);
-
-	if (list_empty(&hdev->adv_instances))
-		__hci_req_disable_advertising(&req);
-
-	hci_req_run(&req, NULL);
-
-unlock:
-	hci_dev_unlock(hdev);
-}
-
 static int hci_req_add_le_interleaved_scan(struct hci_request *req,
 					   unsigned long opt)
 {
@@ -1637,72 +1598,6 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
 	return 0;
 }
 
-/* For a single instance:
- * - force == true: The instance will be removed even when its remaining
- *   lifetime is not zero.
- * - force == false: the instance will be deactivated but kept stored unless
- *   the remaining lifetime is zero.
- *
- * For instance == 0x00:
- * - force == true: All instances will be removed regardless of their timeout
- *   setting.
- * - force == false: Only instances that have a timeout will be removed.
- */
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
-				struct hci_request *req, u8 instance,
-				bool force)
-{
-	struct adv_info *adv_instance, *n, *next_instance = NULL;
-	int err;
-	u8 rem_inst;
-
-	/* Cancel any timeout concerning the removed instance(s). */
-	if (!instance || hdev->cur_adv_instance == instance)
-		cancel_adv_timeout(hdev);
-
-	/* Get the next instance to advertise BEFORE we remove
-	 * the current one. This can be the same instance again
-	 * if there is only one instance.
-	 */
-	if (instance && hdev->cur_adv_instance == instance)
-		next_instance = hci_get_next_instance(hdev, instance);
-
-	if (instance == 0x00) {
-		list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
-					 list) {
-			if (!(force || adv_instance->timeout))
-				continue;
-
-			rem_inst = adv_instance->instance;
-			err = hci_remove_adv_instance(hdev, rem_inst);
-			if (!err)
-				mgmt_advertising_removed(sk, hdev, rem_inst);
-		}
-	} else {
-		adv_instance = hci_find_adv_instance(hdev, instance);
-
-		if (force || (adv_instance && adv_instance->timeout &&
-			      !adv_instance->remaining_time)) {
-			/* Don't advertise a removed instance. */
-			if (next_instance &&
-			    next_instance->instance == instance)
-				next_instance = NULL;
-
-			err = hci_remove_adv_instance(hdev, instance);
-			if (!err)
-				mgmt_advertising_removed(sk, hdev, instance);
-		}
-	}
-
-	if (!req || !hdev_is_powered(hdev) ||
-	    hci_dev_test_flag(hdev, HCI_ADVERTISING))
-		return;
-
-	if (next_instance && !ext_adv_capable(hdev))
-		__hci_req_schedule_adv_instance(req, next_instance->instance,
-						false);
-}
-
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
 			      bool use_rpa, u8 *own_addr_type)
 {
@@ -1977,7 +1872,6 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
 
 void hci_request_setup(struct hci_dev *hdev)
 {
-	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
 	INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
 }
 
@@ -1985,10 +1879,5 @@ void hci_request_cancel_all(struct hci_dev *hdev)
 {
 	__hci_cmd_sync_cancel(hdev, ENODEV);
 
-	if (hdev->adv_instance_timeout) {
-		cancel_delayed_work_sync(&hdev->adv_instance_expire);
-		hdev->adv_instance_timeout = 0;
-	}
-
 	cancel_interleave_scan(hdev);
 }
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 41e0b84f2042..3d1b3d97ccdd 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -90,10 +90,6 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
 
 int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
 				    bool force);
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
-				struct hci_request *req, u8 instance,
-				bool force);
-
 int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
 int __hci_req_setup_per_adv_instance(struct hci_request *req, u8 instance,
 				     u16 min_interval, u16 max_interval);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 75e7c0a01ab1..bc993dd2383d 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -465,6 +465,121 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static void cancel_adv_timeout(struct hci_dev *hdev)
+{
+	if (hdev->adv_instance_timeout) {
+		hdev->adv_instance_timeout = 0;
+		cancel_delayed_work(&hdev->adv_instance_expire);
+	}
+}
+
+/* For a single instance:
+ * - force == true: The instance will be removed even when its remaining
+ *   lifetime is not zero.
+ * - force == false: the instance will be deactivated but kept stored unless
+ *   the remaining lifetime is zero.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ *   setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ */
+int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk,
+				u8 instance, bool force)
+{
+	struct adv_info *adv_instance, *n, *next_instance = NULL;
+	int err;
+	u8 rem_inst;
+
+	/* Cancel any timeout concerning the removed instance(s). */
+	if (!instance || hdev->cur_adv_instance == instance)
+		cancel_adv_timeout(hdev);
+
+	/* Get the next instance to advertise BEFORE we remove
+	 * the current one. This can be the same instance again
+	 * if there is only one instance.
+	 */
+	if (instance && hdev->cur_adv_instance == instance)
+		next_instance = hci_get_next_instance(hdev, instance);
+
+	if (instance == 0x00) {
+		list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
+					 list) {
+			if (!(force || adv_instance->timeout))
+				continue;
+
+			rem_inst = adv_instance->instance;
+			err = hci_remove_adv_instance(hdev, rem_inst);
+			if (!err)
+				mgmt_advertising_removed(sk, hdev, rem_inst);
+		}
+	} else {
+		adv_instance = hci_find_adv_instance(hdev, instance);
+
+		if (force || (adv_instance && adv_instance->timeout &&
+			      !adv_instance->remaining_time)) {
+			/* Don't advertise a removed instance. */
+			if (next_instance &&
+			    next_instance->instance == instance)
+				next_instance = NULL;
+
+			err = hci_remove_adv_instance(hdev, instance);
+			if (!err)
+				mgmt_advertising_removed(sk, hdev, instance);
+		}
+	}
+
+	if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		return 0;
+
+	if (next_instance && !ext_adv_capable(hdev))
+		return hci_schedule_adv_instance_sync(hdev,
+						      next_instance->instance,
+						      false);
+
+	return 0;
+}
+
+static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data)
+{
+	u8 instance = *(u8 *)data;
+
+	kfree(data);
+
+	hci_clear_adv_instance_sync(hdev, NULL, instance, false);
+
+	if (list_empty(&hdev->adv_instances))
+		return hci_disable_advertising_sync(hdev);
+
+	return 0;
+}
+
+static void adv_timeout_expire(struct work_struct *work)
+{
+	u8 *inst_ptr;
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    adv_instance_expire.work);
+
+	bt_dev_dbg(hdev, "");
+
+	hci_dev_lock(hdev);
+
+	hdev->adv_instance_timeout = 0;
+
+	if (hdev->cur_adv_instance == 0x00)
+		goto unlock;
+
+	inst_ptr = kmalloc(1, GFP_KERNEL);
+	if (!inst_ptr)
+		goto unlock;
+
+	*inst_ptr = hdev->cur_adv_instance;
+	hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
 void hci_cmd_sync_init(struct hci_dev *hdev)
 {
 	INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work);
@@ -474,6 +589,7 @@ void hci_cmd_sync_init(struct hci_dev *hdev)
 	INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work);
 	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable);
 	INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart);
+	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
 }
 
 void hci_cmd_sync_clear(struct hci_dev *hdev)
@@ -1479,14 +1595,6 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason)
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
 }
 
-static void cancel_adv_timeout(struct hci_dev *hdev)
-{
-	if (hdev->adv_instance_timeout) {
-		hdev->adv_instance_timeout = 0;
-		cancel_delayed_work(&hdev->adv_instance_expire);
-	}
-}
-
 static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
 {
 	struct {
@@ -4566,6 +4674,11 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 
 	hci_request_cancel_all(hdev);
 
+	if (hdev->adv_instance_timeout) {
+		cancel_delayed_work_sync(&hdev->adv_instance_expire);
+		hdev->adv_instance_timeout = 0;
+	}
+
 	if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
 	    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
 	    test_bit(HCI_UP, &hdev->flags)) {
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 6e31023b84f5..df20e15a05da 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -2058,6 +2058,8 @@ static int set_le_sync(struct hci_dev *hdev, void *data)
 	int err;
 
 	if (!val) {
+		hci_clear_adv_instance_sync(hdev, NULL, 0x00, true);
+
 		if (hci_dev_test_flag(hdev, HCI_LE_ADV))
 			hci_disable_advertising_sync(hdev);
 
@@ -2131,9 +2133,6 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 	val = !!cp->val;
 	enabled = lmp_host_le_capable(hdev);
 
-	if (!val)
-		hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true);
-
 	if (!hdev_is_powered(hdev) || val == enabled) {
 		bool changed = false;
 
-- 
cgit v1.2.3


From dd50a864ffaece5b75621a84ae8d6e3483ce6732 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:33 -0700
Subject: Bluetooth: Delete unreferenced hci_request code

This patch deletes a whole bunch of code no longer reached because the
functionality was recoded using hci_sync.c

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_request.c | 737 +-------------------------------------------
 net/bluetooth/hci_request.h |  33 --
 2 files changed, 2 insertions(+), 768 deletions(-)

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index be32fb0f5557..685dc0f983b7 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -269,43 +269,10 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
 void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
 		 const void *param)
 {
-	bt_dev_dbg(req->hdev, "HCI_REQ-0x%4.4x", opcode);
+	bt_dev_err(req->hdev, "HCI_REQ-0x%4.4x", opcode);
 	hci_req_add_ev(req, opcode, plen, param, 0);
 }
 
-void __hci_req_write_fast_connectable(struct hci_request *req, bool enable)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct hci_cp_write_page_scan_activity acp;
-	u8 type;
-
-	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
-		return;
-
-	if (hdev->hci_ver < BLUETOOTH_VER_1_2)
-		return;
-
-	if (enable) {
-		type = PAGE_SCAN_TYPE_INTERLACED;
-
-		/* 160 msec page scan interval */
-		acp.interval = cpu_to_le16(0x0100);
-	} else {
-		type = hdev->def_page_scan_type;
-		acp.interval = cpu_to_le16(hdev->def_page_scan_int);
-	}
-
-	acp.window = cpu_to_le16(hdev->def_page_scan_window);
-
-	if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval ||
-	    __cpu_to_le16(hdev->page_scan_window) != acp.window)
-		hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
-			    sizeof(acp), &acp);
-
-	if (hdev->page_scan_type != type)
-		hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type);
-}
-
 static void start_interleave_scan(struct hci_dev *hdev)
 {
 	hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
@@ -358,45 +325,6 @@ static bool __hci_update_interleaved_scan(struct hci_dev *hdev)
 	return false;
 }
 
-void __hci_req_update_name(struct hci_request *req)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct hci_cp_write_local_name cp;
-
-	memcpy(cp.name, hdev->dev_name, sizeof(cp.name));
-
-	hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp);
-}
-
-void __hci_req_update_eir(struct hci_request *req)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct hci_cp_write_eir cp;
-
-	if (!hdev_is_powered(hdev))
-		return;
-
-	if (!lmp_ext_inq_capable(hdev))
-		return;
-
-	if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
-		return;
-
-	if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
-		return;
-
-	memset(&cp, 0, sizeof(cp));
-
-	eir_create(hdev, cp.data);
-
-	if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0)
-		return;
-
-	memcpy(hdev->eir, cp.data, sizeof(cp.data));
-
-	hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
-}
-
 void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -811,212 +739,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
 			   addr_resolv);
 }
 
-static bool adv_cur_instance_is_scannable(struct hci_dev *hdev)
-{
-	return hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance);
-}
-
-void __hci_req_disable_advertising(struct hci_request *req)
-{
-	if (ext_adv_capable(req->hdev)) {
-		__hci_req_disable_ext_adv_instance(req, 0x00);
-	} else {
-		u8 enable = 0x00;
-
-		hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
-	}
-}
-
-static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
-{
-	/* If privacy is not enabled don't use RPA */
-	if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
-		return false;
-
-	/* If basic privacy mode is enabled use RPA */
-	if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
-		return true;
-
-	/* If limited privacy mode is enabled don't use RPA if we're
-	 * both discoverable and bondable.
-	 */
-	if ((flags & MGMT_ADV_FLAG_DISCOV) &&
-	    hci_dev_test_flag(hdev, HCI_BONDABLE))
-		return false;
-
-	/* We're neither bondable nor discoverable in the limited
-	 * privacy mode, therefore use RPA.
-	 */
-	return true;
-}
-
-static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
-{
-	/* If there is no connection we are OK to advertise. */
-	if (hci_conn_num(hdev, LE_LINK) == 0)
-		return true;
-
-	/* Check le_states if there is any connection in peripheral role. */
-	if (hdev->conn_hash.le_num_peripheral > 0) {
-		/* Peripheral connection state and non connectable mode bit 20.
-		 */
-		if (!connectable && !(hdev->le_states[2] & 0x10))
-			return false;
-
-		/* Peripheral connection state and connectable mode bit 38
-		 * and scannable bit 21.
-		 */
-		if (connectable && (!(hdev->le_states[4] & 0x40) ||
-				    !(hdev->le_states[2] & 0x20)))
-			return false;
-	}
-
-	/* Check le_states if there is any connection in central role. */
-	if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) {
-		/* Central connection state and non connectable mode bit 18. */
-		if (!connectable && !(hdev->le_states[2] & 0x02))
-			return false;
-
-		/* Central connection state and connectable mode bit 35 and
-		 * scannable 19.
-		 */
-		if (connectable && (!(hdev->le_states[4] & 0x08) ||
-				    !(hdev->le_states[2] & 0x08)))
-			return false;
-	}
-
-	return true;
-}
-
-void __hci_req_enable_advertising(struct hci_request *req)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct adv_info *adv;
-	struct hci_cp_le_set_adv_param cp;
-	u8 own_addr_type, enable = 0x01;
-	bool connectable;
-	u16 adv_min_interval, adv_max_interval;
-	u32 flags;
-
-	flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance);
-	adv = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
-
-	/* If the "connectable" instance flag was not set, then choose between
-	 * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
-	 */
-	connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
-		      mgmt_get_connectable(hdev);
-
-	if (!is_advertising_allowed(hdev, connectable))
-		return;
-
-	if (hci_dev_test_flag(hdev, HCI_LE_ADV))
-		__hci_req_disable_advertising(req);
-
-	/* Clear the HCI_LE_ADV bit temporarily so that the
-	 * hci_update_random_address knows that it's safe to go ahead
-	 * and write a new random address. The flag will be set back on
-	 * as soon as the SET_ADV_ENABLE HCI command completes.
-	 */
-	hci_dev_clear_flag(hdev, HCI_LE_ADV);
-
-	/* Set require_privacy to true only when non-connectable
-	 * advertising is used. In that case it is fine to use a
-	 * non-resolvable private address.
-	 */
-	if (hci_update_random_address(req, !connectable,
-				      adv_use_rpa(hdev, flags),
-				      &own_addr_type) < 0)
-		return;
-
-	memset(&cp, 0, sizeof(cp));
-
-	if (adv) {
-		adv_min_interval = adv->min_interval;
-		adv_max_interval = adv->max_interval;
-	} else {
-		adv_min_interval = hdev->le_adv_min_interval;
-		adv_max_interval = hdev->le_adv_max_interval;
-	}
-
-	if (connectable) {
-		cp.type = LE_ADV_IND;
-	} else {
-		if (adv_cur_instance_is_scannable(hdev))
-			cp.type = LE_ADV_SCAN_IND;
-		else
-			cp.type = LE_ADV_NONCONN_IND;
-
-		if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) ||
-		    hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
-			adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN;
-			adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX;
-		}
-	}
-
-	cp.min_interval = cpu_to_le16(adv_min_interval);
-	cp.max_interval = cpu_to_le16(adv_max_interval);
-	cp.own_address_type = own_addr_type;
-	cp.channel_map = hdev->le_adv_channel_map;
-
-	hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
-
-	hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
-}
-
-void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
-{
-	struct hci_dev *hdev = req->hdev;
-	u8 len;
-
-	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
-		return;
-
-	if (ext_adv_capable(hdev)) {
-		struct {
-			struct hci_cp_le_set_ext_scan_rsp_data cp;
-			u8 data[HCI_MAX_EXT_AD_LENGTH];
-		} pdu;
-
-		memset(&pdu, 0, sizeof(pdu));
-
-		len = eir_create_scan_rsp(hdev, instance, pdu.data);
-
-		if (hdev->scan_rsp_data_len == len &&
-		    !memcmp(pdu.data, hdev->scan_rsp_data, len))
-			return;
-
-		memcpy(hdev->scan_rsp_data, pdu.data, len);
-		hdev->scan_rsp_data_len = len;
-
-		pdu.cp.handle = instance;
-		pdu.cp.length = len;
-		pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
-		pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
-
-		hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA,
-			    sizeof(pdu.cp) + len, &pdu.cp);
-	} else {
-		struct hci_cp_le_set_scan_rsp_data cp;
-
-		memset(&cp, 0, sizeof(cp));
-
-		len = eir_create_scan_rsp(hdev, instance, cp.data);
-
-		if (hdev->scan_rsp_data_len == len &&
-		    !memcmp(cp.data, hdev->scan_rsp_data, len))
-			return;
-
-		memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
-		hdev->scan_rsp_data_len = len;
-
-		cp.length = len;
-
-		hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
-	}
-}
-
-void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
+static void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
 {
 	struct hci_dev *hdev = req->hdev;
 	u8 len;
@@ -1080,58 +803,6 @@ int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
 	return hci_req_run(&req, NULL);
 }
 
-static void enable_addr_resolution_complete(struct hci_dev *hdev, u8 status,
-					    u16 opcode)
-{
-	BT_DBG("%s status %u", hdev->name, status);
-}
-
-void hci_req_disable_address_resolution(struct hci_dev *hdev)
-{
-	struct hci_request req;
-	__u8 enable = 0x00;
-
-	if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION))
-		return;
-
-	hci_req_init(&req, hdev);
-
-	hci_req_add(&req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable);
-
-	hci_req_run(&req, enable_addr_resolution_complete);
-}
-
-static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
-	bt_dev_dbg(hdev, "status %u", status);
-}
-
-void hci_req_reenable_advertising(struct hci_dev *hdev)
-{
-	struct hci_request req;
-
-	if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
-	    list_empty(&hdev->adv_instances))
-		return;
-
-	hci_req_init(&req, hdev);
-
-	if (hdev->cur_adv_instance) {
-		__hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance,
-						true);
-	} else {
-		if (ext_adv_capable(hdev)) {
-			__hci_req_start_ext_adv(&req, 0x00);
-		} else {
-			__hci_req_update_adv_data(&req, 0x00);
-			__hci_req_update_scan_rsp_data(&req, 0x00);
-			__hci_req_enable_advertising(&req);
-		}
-	}
-
-	hci_req_run(&req, adv_enable_complete);
-}
-
 static int hci_req_add_le_interleaved_scan(struct hci_request *req,
 					   unsigned long opt)
 {
@@ -1261,11 +932,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 	return 0;
 }
 
-void __hci_req_clear_ext_adv_sets(struct hci_request *req)
-{
-	hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL);
-}
-
 static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -1290,314 +956,6 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 	hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
 }
 
-int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
-{
-	struct hci_cp_le_set_ext_adv_params cp;
-	struct hci_dev *hdev = req->hdev;
-	bool connectable;
-	u32 flags;
-	bdaddr_t random_addr;
-	u8 own_addr_type;
-	int err;
-	struct adv_info *adv;
-	bool secondary_adv, require_privacy;
-
-	if (instance > 0) {
-		adv = hci_find_adv_instance(hdev, instance);
-		if (!adv)
-			return -EINVAL;
-	} else {
-		adv = NULL;
-	}
-
-	flags = hci_adv_instance_flags(hdev, instance);
-
-	/* If the "connectable" instance flag was not set, then choose between
-	 * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
-	 */
-	connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
-		      mgmt_get_connectable(hdev);
-
-	if (!is_advertising_allowed(hdev, connectable))
-		return -EPERM;
-
-	/* Set require_privacy to true only when non-connectable
-	 * advertising is used. In that case it is fine to use a
-	 * non-resolvable private address.
-	 */
-	require_privacy = !connectable;
-
-	/* Don't require privacy for periodic adv? */
-	if (adv && adv->periodic)
-		require_privacy = false;
-
-	err = hci_get_random_address(hdev, require_privacy,
-				     adv_use_rpa(hdev, flags), adv,
-				     &own_addr_type, &random_addr);
-	if (err < 0)
-		return err;
-
-	memset(&cp, 0, sizeof(cp));
-
-	if (adv) {
-		hci_cpu_to_le24(adv->min_interval, cp.min_interval);
-		hci_cpu_to_le24(adv->max_interval, cp.max_interval);
-		cp.tx_power = adv->tx_power;
-	} else {
-		hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval);
-		hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval);
-		cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
-	}
-
-	secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
-
-	if (connectable) {
-		if (secondary_adv)
-			cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
-		else
-			cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
-	} else if (hci_adv_instance_is_scannable(hdev, instance) ||
-		   (flags & MGMT_ADV_PARAM_SCAN_RSP)) {
-		if (secondary_adv)
-			cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
-		else
-			cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
-	} else {
-		/* Secondary and periodic cannot use legacy PDUs */
-		if (secondary_adv || (adv && adv->periodic))
-			cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
-		else
-			cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
-	}
-
-	cp.own_addr_type = own_addr_type;
-	cp.channel_map = hdev->le_adv_channel_map;
-	cp.handle = instance;
-
-	if (flags & MGMT_ADV_FLAG_SEC_2M) {
-		cp.primary_phy = HCI_ADV_PHY_1M;
-		cp.secondary_phy = HCI_ADV_PHY_2M;
-	} else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
-		cp.primary_phy = HCI_ADV_PHY_CODED;
-		cp.secondary_phy = HCI_ADV_PHY_CODED;
-	} else {
-		/* In all other cases use 1M */
-		cp.primary_phy = HCI_ADV_PHY_1M;
-		cp.secondary_phy = HCI_ADV_PHY_1M;
-	}
-
-	hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
-
-	if ((own_addr_type == ADDR_LE_DEV_RANDOM ||
-	     own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) &&
-	    bacmp(&random_addr, BDADDR_ANY)) {
-		struct hci_cp_le_set_adv_set_rand_addr cp;
-
-		/* Check if random address need to be updated */
-		if (adv) {
-			if (!bacmp(&random_addr, &adv->random_addr))
-				return 0;
-		} else {
-			if (!bacmp(&random_addr, &hdev->random_addr))
-				return 0;
-			/* Instance 0x00 doesn't have an adv_info, instead it
-			 * uses hdev->random_addr to track its address so
-			 * whenever it needs to be updated this also set the
-			 * random address since hdev->random_addr is shared with
-			 * scan state machine.
-			 */
-			set_random_addr(req, &random_addr);
-		}
-
-		memset(&cp, 0, sizeof(cp));
-
-		cp.handle = instance;
-		bacpy(&cp.bdaddr, &random_addr);
-
-		hci_req_add(req,
-			    HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
-			    sizeof(cp), &cp);
-	}
-
-	return 0;
-}
-
-int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct hci_cp_le_set_ext_adv_enable *cp;
-	struct hci_cp_ext_adv_set *adv_set;
-	u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
-	struct adv_info *adv_instance;
-
-	if (instance > 0) {
-		adv_instance = hci_find_adv_instance(hdev, instance);
-		if (!adv_instance)
-			return -EINVAL;
-	} else {
-		adv_instance = NULL;
-	}
-
-	cp = (void *) data;
-	adv_set = (void *) cp->data;
-
-	memset(cp, 0, sizeof(*cp));
-
-	cp->enable = 0x01;
-	cp->num_of_sets = 0x01;
-
-	memset(adv_set, 0, sizeof(*adv_set));
-
-	adv_set->handle = instance;
-
-	/* Set duration per instance since controller is responsible for
-	 * scheduling it.
-	 */
-	if (adv_instance && adv_instance->duration) {
-		u16 duration = adv_instance->timeout * MSEC_PER_SEC;
-
-		/* Time = N * 10 ms */
-		adv_set->duration = cpu_to_le16(duration / 10);
-	}
-
-	hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE,
-		    sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets,
-		    data);
-
-	return 0;
-}
-
-int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct hci_cp_le_set_ext_adv_enable *cp;
-	struct hci_cp_ext_adv_set *adv_set;
-	u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
-	u8 req_size;
-
-	/* If request specifies an instance that doesn't exist, fail */
-	if (instance > 0 && !hci_find_adv_instance(hdev, instance))
-		return -EINVAL;
-
-	memset(data, 0, sizeof(data));
-
-	cp = (void *)data;
-	adv_set = (void *)cp->data;
-
-	/* Instance 0x00 indicates all advertising instances will be disabled */
-	cp->num_of_sets = !!instance;
-	cp->enable = 0x00;
-
-	adv_set->handle = instance;
-
-	req_size = sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets;
-	hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, req_size, data);
-
-	return 0;
-}
-
-int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance)
-{
-	struct hci_dev *hdev = req->hdev;
-
-	/* If request specifies an instance that doesn't exist, fail */
-	if (instance > 0 && !hci_find_adv_instance(hdev, instance))
-		return -EINVAL;
-
-	hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance);
-
-	return 0;
-}
-
-int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct adv_info *adv_instance = hci_find_adv_instance(hdev, instance);
-	int err;
-
-	/* If instance isn't pending, the chip knows about it, and it's safe to
-	 * disable
-	 */
-	if (adv_instance && !adv_instance->pending)
-		__hci_req_disable_ext_adv_instance(req, instance);
-
-	err = __hci_req_setup_ext_adv_instance(req, instance);
-	if (err < 0)
-		return err;
-
-	__hci_req_update_scan_rsp_data(req, instance);
-	__hci_req_enable_ext_advertising(req, instance);
-
-	return 0;
-}
-
-int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
-				    bool force)
-{
-	struct hci_dev *hdev = req->hdev;
-	struct adv_info *adv_instance = NULL;
-	u16 timeout;
-
-	if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
-	    list_empty(&hdev->adv_instances))
-		return -EPERM;
-
-	if (hdev->adv_instance_timeout)
-		return -EBUSY;
-
-	adv_instance = hci_find_adv_instance(hdev, instance);
-	if (!adv_instance)
-		return -ENOENT;
-
-	/* A zero timeout means unlimited advertising. As long as there is
-	 * only one instance, duration should be ignored. We still set a timeout
-	 * in case further instances are being added later on.
-	 *
-	 * If the remaining lifetime of the instance is more than the duration
-	 * then the timeout corresponds to the duration, otherwise it will be
-	 * reduced to the remaining instance lifetime.
-	 */
-	if (adv_instance->timeout == 0 ||
-	    adv_instance->duration <= adv_instance->remaining_time)
-		timeout = adv_instance->duration;
-	else
-		timeout = adv_instance->remaining_time;
-
-	/* The remaining time is being reduced unless the instance is being
-	 * advertised without time limit.
-	 */
-	if (adv_instance->timeout)
-		adv_instance->remaining_time =
-				adv_instance->remaining_time - timeout;
-
-	/* Only use work for scheduling instances with legacy advertising */
-	if (!ext_adv_capable(hdev)) {
-		hdev->adv_instance_timeout = timeout;
-		queue_delayed_work(hdev->req_workqueue,
-			   &hdev->adv_instance_expire,
-			   msecs_to_jiffies(timeout * 1000));
-	}
-
-	/* If we're just re-scheduling the same instance again then do not
-	 * execute any HCI commands. This happens when a single instance is
-	 * being advertised.
-	 */
-	if (!force && hdev->cur_adv_instance == instance &&
-	    hci_dev_test_flag(hdev, HCI_LE_ADV))
-		return 0;
-
-	hdev->cur_adv_instance = instance;
-	if (ext_adv_capable(hdev)) {
-		__hci_req_start_ext_adv(req, instance);
-	} else {
-		__hci_req_update_adv_data(req, instance);
-		__hci_req_update_scan_rsp_data(req, instance);
-		__hci_req_enable_advertising(req);
-	}
-
-	return 0;
-}
-
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
 			      bool use_rpa, u8 *own_addr_type)
 {
@@ -1686,97 +1044,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
 	return 0;
 }
 
-static bool disconnected_accept_list_entries(struct hci_dev *hdev)
-{
-	struct bdaddr_list *b;
-
-	list_for_each_entry(b, &hdev->accept_list, list) {
-		struct hci_conn *conn;
-
-		conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr);
-		if (!conn)
-			return true;
-
-		if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
-			return true;
-	}
-
-	return false;
-}
-
-void __hci_req_update_scan(struct hci_request *req)
-{
-	struct hci_dev *hdev = req->hdev;
-	u8 scan;
-
-	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
-		return;
-
-	if (!hdev_is_powered(hdev))
-		return;
-
-	if (mgmt_powering_down(hdev))
-		return;
-
-	if (hdev->scanning_paused)
-		return;
-
-	if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) ||
-	    disconnected_accept_list_entries(hdev))
-		scan = SCAN_PAGE;
-	else
-		scan = SCAN_DISABLED;
-
-	if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
-		scan |= SCAN_INQUIRY;
-
-	if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) &&
-	    test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY))
-		return;
-
-	hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
-}
-
-static u8 get_service_classes(struct hci_dev *hdev)
-{
-	struct bt_uuid *uuid;
-	u8 val = 0;
-
-	list_for_each_entry(uuid, &hdev->uuids, list)
-		val |= uuid->svc_hint;
-
-	return val;
-}
-
-void __hci_req_update_class(struct hci_request *req)
-{
-	struct hci_dev *hdev = req->hdev;
-	u8 cod[3];
-
-	bt_dev_dbg(hdev, "");
-
-	if (!hdev_is_powered(hdev))
-		return;
-
-	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
-		return;
-
-	if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
-		return;
-
-	cod[0] = hdev->minor_class;
-	cod[1] = hdev->major_class;
-	cod[2] = get_service_classes(hdev);
-
-	if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
-		cod[1] |= 0x20;
-
-	if (memcmp(cod, hdev->dev_class, 3) == 0)
-		return;
-
-	hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod);
-}
-
 void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
 		      u8 reason)
 {
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 3d1b3d97ccdd..55205cc9f281 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -68,49 +68,16 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
 struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
 				const void *param);
 
-void __hci_req_write_fast_connectable(struct hci_request *req, bool enable);
-void __hci_req_update_name(struct hci_request *req);
-void __hci_req_update_eir(struct hci_request *req);
-
 void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn);
 void hci_req_add_le_passive_scan(struct hci_request *req);
 
 void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);
 
-void hci_req_disable_address_resolution(struct hci_dev *hdev);
-void hci_req_reenable_advertising(struct hci_dev *hdev);
-void __hci_req_enable_advertising(struct hci_request *req);
-void __hci_req_disable_advertising(struct hci_request *req);
-void __hci_req_update_adv_data(struct hci_request *req, u8 instance);
 int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
-int hci_req_start_per_adv(struct hci_dev *hdev, u8 instance, u32 flags,
-			  u16 min_interval, u16 max_interval,
-			  u16 sync_interval);
-void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
-
-int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
-				    bool force);
-int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
-int __hci_req_setup_per_adv_instance(struct hci_request *req, u8 instance,
-				     u16 min_interval, u16 max_interval);
-int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
-int __hci_req_start_per_adv(struct hci_request *req, u8 instance, u32 flags,
-			    u16 min_interval, u16 max_interval,
-			    u16 sync_interval);
-int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance);
-int __hci_req_enable_per_advertising(struct hci_request *req, u8 instance);
-int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance);
-int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance);
-void __hci_req_clear_ext_adv_sets(struct hci_request *req);
 int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 			   bool use_rpa, struct adv_info *adv_instance,
 			   u8 *own_addr_type, bdaddr_t *rand_addr);
 
-void __hci_req_update_class(struct hci_request *req);
-
-/* Returns true if HCI commands were queued */
-void __hci_req_update_scan(struct hci_request *req);
-
 int hci_update_random_address(struct hci_request *req, bool require_privacy,
 			      bool use_rpa, u8 *own_addr_type);
 
-- 
cgit v1.2.3


From 3fe318ee72c54506534f51b4b4dfb19e0e0df2db Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:34 -0700
Subject: Bluetooth: move hci_get_random_address() to hci_sync

This function has no dependencies on the deprecated hci_request
mechanism, so has been moved unchanged to hci_sync.c

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |   5 +
 net/bluetooth/hci_request.c      | 255 ++++++++++++++-------------------------
 net/bluetooth/hci_request.h      |   9 --
 net/bluetooth/hci_sync.c         |  73 +++++++++++
 4 files changed, 170 insertions(+), 172 deletions(-)

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index aea950440b9d..b6b975c2ed3e 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -16,6 +16,7 @@ struct hci_cmd_sync_work_entry {
 	hci_cmd_sync_work_destroy_t destroy;
 };
 
+struct adv_info;
 /* Function with sync suffix shall not be called with hdev->lock held as they
  * wait the command to complete and in the meantime an event could be received
  * which could attempt to acquire hdev->lock causing a deadlock.
@@ -51,6 +52,10 @@ int hci_update_class_sync(struct hci_dev *hdev);
 int hci_update_name_sync(struct hci_dev *hdev);
 int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode);
 
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+			   bool use_rpa, struct adv_info *adv_instance,
+			   u8 *own_addr_type, bdaddr_t *rand_addr);
+
 int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy,
 				   bool rpa, u8 *own_addr_type);
 
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 685dc0f983b7..b9875224ac7b 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -650,6 +650,96 @@ static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev)
 	return false;
 }
 
+static void set_random_addr(struct hci_request *req, bdaddr_t *rpa);
+static int hci_update_random_address(struct hci_request *req,
+				     bool require_privacy, bool use_rpa,
+				     u8 *own_addr_type)
+{
+	struct hci_dev *hdev = req->hdev;
+	int err;
+
+	/* If privacy is enabled use a resolvable private address. If
+	 * current RPA has expired or there is something else than
+	 * the current RPA in use, then generate a new one.
+	 */
+	if (use_rpa) {
+		/* If Controller supports LL Privacy use own address type is
+		 * 0x03
+		 */
+		if (use_ll_privacy(hdev))
+			*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+		else
+			*own_addr_type = ADDR_LE_DEV_RANDOM;
+
+		if (rpa_valid(hdev))
+			return 0;
+
+		err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+		if (err < 0) {
+			bt_dev_err(hdev, "failed to generate new RPA");
+			return err;
+		}
+
+		set_random_addr(req, &hdev->rpa);
+
+		return 0;
+	}
+
+	/* In case of required privacy without resolvable private address,
+	 * use an non-resolvable private address. This is useful for active
+	 * scanning and non-connectable advertising.
+	 */
+	if (require_privacy) {
+		bdaddr_t nrpa;
+
+		while (true) {
+			/* The non-resolvable private address is generated
+			 * from random six bytes with the two most significant
+			 * bits cleared.
+			 */
+			get_random_bytes(&nrpa, 6);
+			nrpa.b[5] &= 0x3f;
+
+			/* The non-resolvable private address shall not be
+			 * equal to the public address.
+			 */
+			if (bacmp(&hdev->bdaddr, &nrpa))
+				break;
+		}
+
+		*own_addr_type = ADDR_LE_DEV_RANDOM;
+		set_random_addr(req, &nrpa);
+		return 0;
+	}
+
+	/* If forcing static address is in use or there is no public
+	 * address use the static address as random address (but skip
+	 * the HCI command if the current random address is already the
+	 * static one.
+	 *
+	 * In case BR/EDR has been disabled on a dual-mode controller
+	 * and a static address has been configured, then use that
+	 * address instead of the public BR/EDR address.
+	 */
+	if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+	    !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+	    (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+	     bacmp(&hdev->static_addr, BDADDR_ANY))) {
+		*own_addr_type = ADDR_LE_DEV_RANDOM;
+		if (bacmp(&hdev->static_addr, &hdev->random_addr))
+			hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6,
+				    &hdev->static_addr);
+		return 0;
+	}
+
+	/* Neither privacy nor static address is being used so use a
+	 * public address.
+	 */
+	*own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+	return 0;
+}
+
 /* Ensure to call hci_req_add_le_scan_disable() first to disable the
  * controller based address resolution to be able to reconfigure
  * resolving list.
@@ -859,79 +949,6 @@ static void interleave_scan_work(struct work_struct *work)
 				   &hdev->interleave_scan, timeout);
 }
 
-int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
-			   bool use_rpa, struct adv_info *adv_instance,
-			   u8 *own_addr_type, bdaddr_t *rand_addr)
-{
-	int err;
-
-	bacpy(rand_addr, BDADDR_ANY);
-
-	/* If privacy is enabled use a resolvable private address. If
-	 * current RPA has expired then generate a new one.
-	 */
-	if (use_rpa) {
-		/* If Controller supports LL Privacy use own address type is
-		 * 0x03
-		 */
-		if (use_ll_privacy(hdev))
-			*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
-		else
-			*own_addr_type = ADDR_LE_DEV_RANDOM;
-
-		if (adv_instance) {
-			if (adv_rpa_valid(adv_instance))
-				return 0;
-		} else {
-			if (rpa_valid(hdev))
-				return 0;
-		}
-
-		err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
-		if (err < 0) {
-			bt_dev_err(hdev, "failed to generate new RPA");
-			return err;
-		}
-
-		bacpy(rand_addr, &hdev->rpa);
-
-		return 0;
-	}
-
-	/* In case of required privacy without resolvable private address,
-	 * use an non-resolvable private address. This is useful for
-	 * non-connectable advertising.
-	 */
-	if (require_privacy) {
-		bdaddr_t nrpa;
-
-		while (true) {
-			/* The non-resolvable private address is generated
-			 * from random six bytes with the two most significant
-			 * bits cleared.
-			 */
-			get_random_bytes(&nrpa, 6);
-			nrpa.b[5] &= 0x3f;
-
-			/* The non-resolvable private address shall not be
-			 * equal to the public address.
-			 */
-			if (bacmp(&hdev->bdaddr, &nrpa))
-				break;
-		}
-
-		*own_addr_type = ADDR_LE_DEV_RANDOM;
-		bacpy(rand_addr, &nrpa);
-
-		return 0;
-	}
-
-	/* No privacy so use a public address. */
-	*own_addr_type = ADDR_LE_DEV_PUBLIC;
-
-	return 0;
-}
-
 static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 {
 	struct hci_dev *hdev = req->hdev;
@@ -956,96 +973,8 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 	hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
 }
 
-int hci_update_random_address(struct hci_request *req, bool require_privacy,
-			      bool use_rpa, u8 *own_addr_type)
-{
-	struct hci_dev *hdev = req->hdev;
-	int err;
-
-	/* If privacy is enabled use a resolvable private address. If
-	 * current RPA has expired or there is something else than
-	 * the current RPA in use, then generate a new one.
-	 */
-	if (use_rpa) {
-		/* If Controller supports LL Privacy use own address type is
-		 * 0x03
-		 */
-		if (use_ll_privacy(hdev))
-			*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
-		else
-			*own_addr_type = ADDR_LE_DEV_RANDOM;
-
-		if (rpa_valid(hdev))
-			return 0;
-
-		err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
-		if (err < 0) {
-			bt_dev_err(hdev, "failed to generate new RPA");
-			return err;
-		}
-
-		set_random_addr(req, &hdev->rpa);
-
-		return 0;
-	}
-
-	/* In case of required privacy without resolvable private address,
-	 * use an non-resolvable private address. This is useful for active
-	 * scanning and non-connectable advertising.
-	 */
-	if (require_privacy) {
-		bdaddr_t nrpa;
-
-		while (true) {
-			/* The non-resolvable private address is generated
-			 * from random six bytes with the two most significant
-			 * bits cleared.
-			 */
-			get_random_bytes(&nrpa, 6);
-			nrpa.b[5] &= 0x3f;
-
-			/* The non-resolvable private address shall not be
-			 * equal to the public address.
-			 */
-			if (bacmp(&hdev->bdaddr, &nrpa))
-				break;
-		}
-
-		*own_addr_type = ADDR_LE_DEV_RANDOM;
-		set_random_addr(req, &nrpa);
-		return 0;
-	}
-
-	/* If forcing static address is in use or there is no public
-	 * address use the static address as random address (but skip
-	 * the HCI command if the current random address is already the
-	 * static one.
-	 *
-	 * In case BR/EDR has been disabled on a dual-mode controller
-	 * and a static address has been configured, then use that
-	 * address instead of the public BR/EDR address.
-	 */
-	if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
-	    !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
-	    (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
-	     bacmp(&hdev->static_addr, BDADDR_ANY))) {
-		*own_addr_type = ADDR_LE_DEV_RANDOM;
-		if (bacmp(&hdev->static_addr, &hdev->random_addr))
-			hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6,
-				    &hdev->static_addr);
-		return 0;
-	}
-
-	/* Neither privacy nor static address is being used so use a
-	 * public address.
-	 */
-	*own_addr_type = ADDR_LE_DEV_PUBLIC;
-
-	return 0;
-}
-
-void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
-		      u8 reason)
+static void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
+			     u8 reason)
 {
 	switch (conn->state) {
 	case BT_CONNECTED:
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 55205cc9f281..c7185ad77c6e 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -74,16 +74,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req);
 void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);
 
 int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
-int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
-			   bool use_rpa, struct adv_info *adv_instance,
-			   u8 *own_addr_type, bdaddr_t *rand_addr);
-
-int hci_update_random_address(struct hci_request *req, bool require_privacy,
-			      bool use_rpa, u8 *own_addr_type);
 
 int hci_abort_conn(struct hci_conn *conn, u8 reason);
-void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
-		      u8 reason);
-
 void hci_request_setup(struct hci_dev *hdev);
 void hci_request_cancel_all(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index bc993dd2383d..46eb535ff466 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5992,3 +5992,76 @@ int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle)
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC,
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
 }
+
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+			   bool use_rpa, struct adv_info *adv_instance,
+			   u8 *own_addr_type, bdaddr_t *rand_addr)
+{
+	int err;
+
+	bacpy(rand_addr, BDADDR_ANY);
+
+	/* If privacy is enabled use a resolvable private address. If
+	 * current RPA has expired then generate a new one.
+	 */
+	if (use_rpa) {
+		/* If Controller supports LL Privacy use own address type is
+		 * 0x03
+		 */
+		if (use_ll_privacy(hdev))
+			*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+		else
+			*own_addr_type = ADDR_LE_DEV_RANDOM;
+
+		if (adv_instance) {
+			if (adv_rpa_valid(adv_instance))
+				return 0;
+		} else {
+			if (rpa_valid(hdev))
+				return 0;
+		}
+
+		err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+		if (err < 0) {
+			bt_dev_err(hdev, "failed to generate new RPA");
+			return err;
+		}
+
+		bacpy(rand_addr, &hdev->rpa);
+
+		return 0;
+	}
+
+	/* In case of required privacy without resolvable private address,
+	 * use an non-resolvable private address. This is useful for
+	 * non-connectable advertising.
+	 */
+	if (require_privacy) {
+		bdaddr_t nrpa;
+
+		while (true) {
+			/* The non-resolvable private address is generated
+			 * from random six bytes with the two most significant
+			 * bits cleared.
+			 */
+			get_random_bytes(&nrpa, 6);
+			nrpa.b[5] &= 0x3f;
+
+			/* The non-resolvable private address shall not be
+			 * equal to the public address.
+			 */
+			if (bacmp(&hdev->bdaddr, &nrpa))
+				break;
+		}
+
+		*own_addr_type = ADDR_LE_DEV_RANDOM;
+		bacpy(rand_addr, &nrpa);
+
+		return 0;
+	}
+
+	/* No privacy so use a public address. */
+	*own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+	return 0;
+}
-- 
cgit v1.2.3


From 651cd3d65b0f76a2198fcf3a80ce5d53dd267717 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Fri, 5 Aug 2022 16:42:35 -0700
Subject: Bluetooth: convert hci_update_adv_data to hci_sync

hci_update_adv_data() is called from hci_event and hci_core due to
events from the controller. The prior function used the deprecated
hci_request method, and the new one uses hci_sync.c

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sync.h |  1 +
 net/bluetooth/hci_core.c         |  2 +-
 net/bluetooth/hci_event.c        |  2 +-
 net/bluetooth/hci_request.c      | 64 ----------------------------------------
 net/bluetooth/hci_request.h      |  2 --
 net/bluetooth/hci_sync.c         | 20 +++++++++++++
 6 files changed, 23 insertions(+), 68 deletions(-)

diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index b6b975c2ed3e..17f5a4c32f36 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -61,6 +61,7 @@ int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy,
 
 int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance);
 int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance);
+int hci_update_adv_data(struct hci_dev *hdev, u8 instance);
 int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance,
 				   bool force);
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b3a5a3cc9372..9d2c33f6b065 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -714,7 +714,7 @@ static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan)
 		hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
 
 		if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
-			hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
+			hci_update_adv_data(hdev, hdev->cur_adv_instance);
 
 		mgmt_new_settings(hdev);
 	}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 485c814cf44a..395c6479456f 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2152,7 +2152,7 @@ static u8 hci_cc_set_ext_adv_param(struct hci_dev *hdev, void *data,
 			adv_instance->tx_power = rp->tx_power;
 	}
 	/* Update adv data as tx power is known now */
-	hci_req_update_adv_data(hdev, cp->handle);
+	hci_update_adv_data(hdev, cp->handle);
 
 	hci_dev_unlock(hdev);
 
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index b9875224ac7b..2e19a271d7a1 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -829,70 +829,6 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
 			   addr_resolv);
 }
 
-static void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
-{
-	struct hci_dev *hdev = req->hdev;
-	u8 len;
-
-	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
-		return;
-
-	if (ext_adv_capable(hdev)) {
-		struct {
-			struct hci_cp_le_set_ext_adv_data cp;
-			u8 data[HCI_MAX_EXT_AD_LENGTH];
-		} pdu;
-
-		memset(&pdu, 0, sizeof(pdu));
-
-		len = eir_create_adv_data(hdev, instance, pdu.data);
-
-		/* There's nothing to do if the data hasn't changed */
-		if (hdev->adv_data_len == len &&
-		    memcmp(pdu.data, hdev->adv_data, len) == 0)
-			return;
-
-		memcpy(hdev->adv_data, pdu.data, len);
-		hdev->adv_data_len = len;
-
-		pdu.cp.length = len;
-		pdu.cp.handle = instance;
-		pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
-		pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
-
-		hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA,
-			    sizeof(pdu.cp) + len, &pdu.cp);
-	} else {
-		struct hci_cp_le_set_adv_data cp;
-
-		memset(&cp, 0, sizeof(cp));
-
-		len = eir_create_adv_data(hdev, instance, cp.data);
-
-		/* There's nothing to do if the data hasn't changed */
-		if (hdev->adv_data_len == len &&
-		    memcmp(cp.data, hdev->adv_data, len) == 0)
-			return;
-
-		memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
-		hdev->adv_data_len = len;
-
-		cp.length = len;
-
-		hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
-	}
-}
-
-int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
-{
-	struct hci_request req;
-
-	hci_req_init(&req, hdev);
-	__hci_req_update_adv_data(&req, instance);
-
-	return hci_req_run(&req, NULL);
-}
-
 static int hci_req_add_le_interleaved_scan(struct hci_request *req,
 					   unsigned long opt)
 {
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index c7185ad77c6e..7e1de871fca4 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -73,8 +73,6 @@ void hci_req_add_le_passive_scan(struct hci_request *req);
 
 void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);
 
-int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
-
 int hci_abort_conn(struct hci_conn *conn, u8 reason);
 void hci_request_setup(struct hci_dev *hdev);
 void hci_request_cancel_all(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 46eb535ff466..6de2ad730995 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -6065,3 +6065,23 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
 
 	return 0;
 }
+
+static int _update_adv_data_sync(struct hci_dev *hdev, void *data)
+{
+	u8 instance = *(u8 *)data;
+
+	kfree(data);
+
+	return hci_update_adv_data_sync(hdev, instance);
+}
+
+int hci_update_adv_data(struct hci_dev *hdev, u8 instance)
+{
+	u8 *inst_ptr = kmalloc(1, GFP_KERNEL);
+
+	if (!inst_ptr)
+		return -ENOMEM;
+
+	*inst_ptr = instance;
+	return hci_cmd_sync_queue(hdev, _update_adv_data_sync, inst_ptr, NULL);
+}
-- 
cgit v1.2.3


From afcb3369f46ed5dc883a7b92f2dd1e264d79d388 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Sun, 7 Aug 2022 22:57:40 +0200
Subject: Bluetooth: hci_event: Fix vendor (unknown) opcode status handling

Commit c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to
handle Command Complete") was (presumably) meant to only refactor things
without any functional changes.

But it does have one undesirable side-effect, before *status would always
be set to skb->data[0] and it might be overridden by some of the opcode
specific handling. While now it always set by the opcode specific handlers.
This means that if the opcode is not known *status does not get set any
more at all!

This behavior change has broken bluetooth support for BCM4343A0 HCIs,
the hci_bcm.c code tries to configure UART attached HCIs at a higher
baudraute using vendor specific opcodes. The BCM4343A0 does not
support this and this used to simply fail:

[   25.646442] Bluetooth: hci0: BCM: failed to write clock (-56)
[   25.646481] Bluetooth: hci0: Failed to set baudrate

After which things would continue with the initial baudraute. But now
that hci_cmd_complete_evt() no longer sets status for unknown opcodes
*status is left at 0. This causes the hci_bcm.c code to think the baudraute
has been changed on the HCI side and to also adjust the UART baudrate,
after which communication with the HCI is broken, leading to:

[   28.579042] Bluetooth: hci0: command 0x0c03 tx timeout
[   36.961601] Bluetooth: hci0: BCM: Reset failed (-110)

And non working bluetooth. Fix this by restoring the previous
default "*status = skb->data[0]" handling for unknown opcodes.

Fixes: c8992cffbe74 ("Bluetooth: hci_event: Use of a function table to handle Command Complete")
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 395c6479456f..938abe6352bf 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4179,6 +4179,17 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data,
 		}
 	}
 
+	if (i == ARRAY_SIZE(hci_cc_table)) {
+		/* Unknown opcode, assume byte 0 contains the status, so
+		 * that e.g. __hci_cmd_sync() properly returns errors
+		 * for vendor specific commands send by HCI drivers.
+		 * If a vendor doesn't actually follow this convention we may
+		 * need to introduce a vendor CC table in order to properly set
+		 * the status.
+		 */
+		*status = skb->data[0];
+	}
+
 	handle_cmd_cnt_and_timer(hdev, ev->ncmd);
 
 	hci_req_cmd_complete(hdev, *opcode, *status, req_complete,
-- 
cgit v1.2.3


From 123f6d3ae773f769695830518690ac3e4a477e82 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 11 Aug 2022 14:20:46 -0700
Subject: Bluetooth: hci_sync: Fix suspend performance regression

This attempts to fix suspend performance when there is no connections by
not updating the event mask.

Fixes: ef61b6ea1544 ("Bluetooth: Always set event mask on suspend")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 6de2ad730995..74a0cd5d0b37 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5549,17 +5549,21 @@ int hci_suspend_sync(struct hci_dev *hdev)
 	/* Prevent disconnects from causing scanning to be re-enabled */
 	hci_pause_scan_sync(hdev);
 
-	/* Soft disconnect everything (power off) */
-	err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
-	if (err) {
-		/* Set state to BT_RUNNING so resume doesn't notify */
-		hdev->suspend_state = BT_RUNNING;
-		hci_resume_sync(hdev);
-		return err;
-	}
+	if (hci_conn_count(hdev)) {
+		/* Soft disconnect everything (power off) */
+		err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
+		if (err) {
+			/* Set state to BT_RUNNING so resume doesn't notify */
+			hdev->suspend_state = BT_RUNNING;
+			hci_resume_sync(hdev);
+			return err;
+		}
 
-	/* Update event mask so only the allowed event can wakeup the host */
-	hci_set_event_mask_sync(hdev);
+		/* Update event mask so only the allowed event can wakeup the
+		 * host.
+		 */
+		hci_set_event_mask_sync(hdev);
+	}
 
 	/* Only configure accept list if disconnect succeeded and wake
 	 * isn't being prevented.
-- 
cgit v1.2.3


From fc5ae5b44eb26db973a6d4cfa0f75fe0650a95c6 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 12 Aug 2022 15:33:57 -0700
Subject: Bluetooth: L2CAP: Fix build errors in some archs

This attempts to fix the follow errors:

In function 'memcmp',
    inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9,
    inlined from 'l2cap_global_chan_by_psm' at
    net/bluetooth/l2cap_core.c:2003:15:
./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp'
specified bound 6 exceeds source size 0 [-Werror=stringop-overread]
   44 | #define __underlying_memcmp     __builtin_memcmp
      |                                 ^
./include/linux/fortify-string.h:420:16: note: in expansion of macro
'__underlying_memcmp'
  420 |         return __underlying_memcmp(p, q, size);
      |                ^~~~~~~~~~~~~~~~~~~
In function 'memcmp',
    inlined from 'bacmp' at ./include/net/bluetooth/bluetooth.h:347:9,
    inlined from 'l2cap_global_chan_by_psm' at
    net/bluetooth/l2cap_core.c:2004:15:
./include/linux/fortify-string.h:44:33: error: '__builtin_memcmp'
specified bound 6 exceeds source size 0 [-Werror=stringop-overread]
   44 | #define __underlying_memcmp     __builtin_memcmp
      |                                 ^
./include/linux/fortify-string.h:420:16: note: in expansion of macro
'__underlying_memcmp'
  420 |         return __underlying_memcmp(p, q, size);
      |                ^~~~~~~~~~~~~~~~~~~

Fixes: 332f1795ca20 ("Bluetooth: L2CAP: Fix l2cap_global_chan_by_psm regression")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/l2cap_core.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index cbe0cae73434..2c9de67daadc 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1992,11 +1992,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
 			src_match = !bacmp(&c->src, src);
 			dst_match = !bacmp(&c->dst, dst);
 			if (src_match && dst_match) {
-				c = l2cap_chan_hold_unless_zero(c);
-				if (c) {
-					read_unlock(&chan_list_lock);
-					return c;
-				}
+				if (!l2cap_chan_hold_unless_zero(c))
+					continue;
+
+				read_unlock(&chan_list_lock);
+				return c;
 			}
 
 			/* Closest match */
-- 
cgit v1.2.3


From 529d4492aed7c005206383b84c38d515c31c1585 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 15 Aug 2022 16:14:32 -0700
Subject: Bluetooth: MGMT: Fix Get Device Flags

Get Device Flags don't check if device does actually use an RPA in which
case it shall only set HCI_CONN_FLAG_REMOTE_WAKEUP if LL Privacy is
enabled.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 71 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 29 deletions(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index df20e15a05da..1cd7d6572892 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4546,6 +4546,22 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev,
 			       MGMT_STATUS_NOT_SUPPORTED);
 }
 
+static u32 get_params_flags(struct hci_dev *hdev,
+			    struct hci_conn_params *params)
+{
+	u32 flags = hdev->conn_flags;
+
+	/* Devices using RPAs can only be programmed in the acceptlist if
+	 * LL Privacy has been enable otherwise they cannot mark
+	 * HCI_CONN_FLAG_REMOTE_WAKEUP.
+	 */
+	if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) &&
+	    hci_find_irk_by_addr(hdev, &params->addr, params->addr_type))
+		flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP;
+
+	return flags;
+}
+
 static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
 			    u16 data_len)
 {
@@ -4577,10 +4593,10 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
 	} else {
 		params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
 						le_addr_type(cp->addr.type));
-
 		if (!params)
 			goto done;
 
+		supported_flags = get_params_flags(hdev, params);
 		current_flags = params->flags;
 	}
 
@@ -4648,38 +4664,35 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
 			bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)",
 				    &cp->addr.bdaddr, cp->addr.type);
 		}
-	} else {
-		params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
-						le_addr_type(cp->addr.type));
-		if (params) {
-			/* Devices using RPAs can only be programmed in the
-			 * acceptlist LL Privacy has been enable otherwise they
-			 * cannot mark HCI_CONN_FLAG_REMOTE_WAKEUP.
-			 */
-			if ((current_flags & HCI_CONN_FLAG_REMOTE_WAKEUP) &&
-			    !use_ll_privacy(hdev) &&
-			    hci_find_irk_by_addr(hdev, &params->addr,
-						 params->addr_type)) {
-				bt_dev_warn(hdev,
-					    "Cannot set wakeable for RPA");
-				goto unlock;
-			}
 
-			params->flags = current_flags;
-			status = MGMT_STATUS_SUCCESS;
+		goto unlock;
+	}
 
-			/* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY
-			 * has been set.
-			 */
-			if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)
-				hci_update_passive_scan(hdev);
-		} else {
-			bt_dev_warn(hdev, "No such LE device %pMR (0x%x)",
-				    &cp->addr.bdaddr,
-				    le_addr_type(cp->addr.type));
-		}
+	params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+					le_addr_type(cp->addr.type));
+	if (!params) {
+		bt_dev_warn(hdev, "No such LE device %pMR (0x%x)",
+			    &cp->addr.bdaddr, le_addr_type(cp->addr.type));
+		goto unlock;
+	}
+
+	supported_flags = get_params_flags(hdev, params);
+
+	if ((supported_flags | current_flags) != supported_flags) {
+		bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
+			    current_flags, supported_flags);
+		goto unlock;
 	}
 
+	params->flags = current_flags;
+	status = MGMT_STATUS_SUCCESS;
+
+	/* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY
+	 * has been set.
+	 */
+	if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)
+		hci_update_passive_scan(hdev);
+
 unlock:
 	hci_dev_unlock(hdev);
 
-- 
cgit v1.2.3


From 029bde79fb7969dcd9a4b2940efc06e9404a9df1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Wed, 17 Aug 2022 20:14:36 +0900
Subject: Bluetooth: hci_sync: fix double mgmt_pending_free() in
 remove_adv_monitor()

syzbot is reporting double kfree() at remove_adv_monitor() [1], for
commit 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv
Monitor") forgot to remove duplicated mgmt_pending_remove() when
merging "if (err) {" path and "if (!pending) {" path.

Link: https://syzkaller.appspot.com/bug?extid=915a8416bf15895b8e07 [1]
Reported-by: syzbot <syzbot+915a8416bf15895b8e07@syzkaller.appspotmail.com>
Fixes: 7cf5c2978f23fdbb ("Bluetooth: hci_sync: Refactor remove Adv Monitor")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 1cd7d6572892..77e7aa63c0c0 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -5066,7 +5066,6 @@ static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
 		else
 			status = MGMT_STATUS_FAILED;
 
-		mgmt_pending_remove(cmd);
 		goto unlock;
 	}
 
-- 
cgit v1.2.3


From b5e1acea065fa1339f4049c2bd9782889dc68368 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 18 Aug 2022 14:31:42 -0700
Subject: Bluetooth: ISO: Fix not handling shutdown condition

In order to properly handle shutdown syscall the code shall not assume
that the how argument is always SHUT_RDWR resulting in SHUTDOWN_MASK as
that would result in poll to immediately report EPOLLHUP instead of
properly waiting for disconnect_cfm (Disconnect Complete) which is
rather important for the likes of BAP as the CIG may need to be
reprogrammed.

Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/iso.c | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index ced8ad4fed4f..613039ba5dbf 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1309,7 +1309,7 @@ static int iso_sock_shutdown(struct socket *sock, int how)
 	struct sock *sk = sock->sk;
 	int err = 0;
 
-	BT_DBG("sock %p, sk %p", sock, sk);
+	BT_DBG("sock %p, sk %p, how %d", sock, sk, how);
 
 	if (!sk)
 		return 0;
@@ -1317,17 +1317,32 @@ static int iso_sock_shutdown(struct socket *sock, int how)
 	sock_hold(sk);
 	lock_sock(sk);
 
-	if (!sk->sk_shutdown) {
-		sk->sk_shutdown = SHUTDOWN_MASK;
-		iso_sock_clear_timer(sk);
-		__iso_sock_close(sk);
-
-		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
-		    !(current->flags & PF_EXITING))
-			err = bt_sock_wait_state(sk, BT_CLOSED,
-						 sk->sk_lingertime);
+	switch (how) {
+	case SHUT_RD:
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			goto unlock;
+		sk->sk_shutdown |= RCV_SHUTDOWN;
+		break;
+	case SHUT_WR:
+		if (sk->sk_shutdown & SEND_SHUTDOWN)
+			goto unlock;
+		sk->sk_shutdown |= SEND_SHUTDOWN;
+		break;
+	case SHUT_RDWR:
+		if (sk->sk_shutdown & SHUTDOWN_MASK)
+			goto unlock;
+		sk->sk_shutdown |= SHUTDOWN_MASK;
+		break;
 	}
 
+	iso_sock_clear_timer(sk);
+	__iso_sock_close(sk);
+
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+	    !(current->flags & PF_EXITING))
+		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+
+unlock:
 	release_sock(sk);
 	sock_put(sk);
 
-- 
cgit v1.2.3


From 5356266552bbaaddb8d6b4b53450e290be2cd717 Mon Sep 17 00:00:00 2001
From: Archie Pusaka <apusaka@chromium.org>
Date: Tue, 23 Aug 2022 12:39:22 +0800
Subject: Bluetooth: hci_event: Fix checking conn for le_conn_complete_evt

To prevent multiple conn complete events, we shouldn't look up the
conn with hci_lookup_le_connect, since it requires the state to be
BT_CONNECT. By the time the duplicate event is processed, the state
might have changed, so we end up processing the new event anyway.

Change the lookup function to hci_conn_hash_lookup_ba.

Fixes: d5ebaa7c5f6f6 ("Bluetooth: hci_event: Ignore multiple conn complete events")
Signed-off-by: Archie Pusaka <apusaka@chromium.org>
Reviewed-by: Sonny Sasaka <sonnysasaka@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 938abe6352bf..1906822a061b 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -5801,7 +5801,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 	 */
 	hci_dev_clear_flag(hdev, HCI_LE_ADV);
 
-	conn = hci_lookup_le_connect(hdev);
+	conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr);
 	if (!conn) {
 		/* In case of error status and there is no connection pending
 		 * just unlock as there is nothing to cleanup.
-- 
cgit v1.2.3


From a112ff247a8c1e9352c8b23081da7a8f2aedeae7 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:02:07 +0200
Subject: Bluetooth: move from strlcpy with unused retval to strscpy

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hidp/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 5940744a8cd8..cc20e706c639 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -83,14 +83,14 @@ static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo
 		ci->product = session->input->id.product;
 		ci->version = session->input->id.version;
 		if (session->input->name)
-			strlcpy(ci->name, session->input->name, 128);
+			strscpy(ci->name, session->input->name, 128);
 		else
-			strlcpy(ci->name, "HID Boot Device", 128);
+			strscpy(ci->name, "HID Boot Device", 128);
 	} else if (session->hid) {
 		ci->vendor  = session->hid->vendor;
 		ci->product = session->hid->product;
 		ci->version = session->hid->version;
-		strlcpy(ci->name, session->hid->name, 128);
+		strscpy(ci->name, session->hid->name, 128);
 	}
 }
 
-- 
cgit v1.2.3


From b828854871f6851c75a5b19f1cd967bf4e7c85dd Mon Sep 17 00:00:00 2001
From: Zhengping Jiang <jiangzp@google.com>
Date: Tue, 23 Aug 2022 10:28:08 -0700
Subject: Bluetooth: hci_sync: hold hdev->lock when cleanup hci_conn

When disconnecting all devices, hci_conn_failed is used to cleanup
hci_conn object when the hci_conn object cannot be aborted.
The function hci_conn_failed requires the caller holds hdev->lock.

Fixes: 9b3628d79b46f ("Bluetooth: hci_sync: Cleanup hci_conn if it cannot be aborted")
Signed-off-by: Zhengping Jiang <jiangzp@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 74a0cd5d0b37..e08c0503027d 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5034,9 +5034,11 @@ int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
 		/* Cleanup hci_conn object if it cannot be cancelled as it
 		 * likelly means the controller and host stack are out of sync.
 		 */
-		if (err)
+		if (err) {
+			hci_dev_lock(hdev);
 			hci_conn_failed(conn, err);
-
+			hci_dev_unlock(hdev);
+		}
 		return err;
 	case BT_CONNECT2:
 		return hci_reject_conn_sync(hdev, conn, reason);
-- 
cgit v1.2.3


From 8c61eafd22d7207039bff85c6e1d386f15abd17e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 25 Aug 2022 09:51:57 +0200
Subject: x86/microcode: Remove ->request_microcode_user()

  181b6f40e9ea ("x86/microcode: Rip out the OLD_INTERFACE")

removed the old microcode loading interface but forgot to remove the
related ->request_microcode_user() functionality which it uses.

Rip it out now too.

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220825075445.28171-1-bp@alien8.de
---
 arch/x86/include/asm/microcode.h      |  3 ---
 arch/x86/kernel/cpu/microcode/amd.c   |  7 -------
 arch/x86/kernel/cpu/microcode/intel.c | 17 -----------------
 3 files changed, 27 deletions(-)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 0c3d3440fe27..7f7800e15ed0 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -32,9 +32,6 @@ enum ucode_state {
 };
 
 struct microcode_ops {
-	enum ucode_state (*request_microcode_user) (int cpu,
-				const void __user *buf, size_t size);
-
 	enum ucode_state (*request_microcode_fw) (int cpu, struct device *,
 						  bool refresh_fw);
 
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 8b2fcdfa6d31..5f38dd75cbc5 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -924,12 +924,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 	return ret;
 }
 
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
-	return UCODE_ERROR;
-}
-
 static void microcode_fini_cpu_amd(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -938,7 +932,6 @@ static void microcode_fini_cpu_amd(int cpu)
 }
 
 static struct microcode_ops microcode_amd_ops = {
-	.request_microcode_user           = request_microcode_user,
 	.request_microcode_fw             = request_microcode_amd,
 	.collect_cpu_info                 = collect_cpu_info_amd,
 	.apply_microcode                  = apply_microcode_amd,
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 025c8f0cd948..1fcbd671f1df 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -916,24 +916,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 	return ret;
 }
 
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
-	struct iov_iter iter;
-	struct iovec iov;
-
-	if (is_blacklisted(cpu))
-		return UCODE_NFOUND;
-
-	iov.iov_base = (void __user *)buf;
-	iov.iov_len = size;
-	iov_iter_init(&iter, WRITE, &iov, 1, size);
-
-	return generic_load_microcode(cpu, &iter);
-}
-
 static struct microcode_ops microcode_intel_ops = {
-	.request_microcode_user		  = request_microcode_user,
 	.request_microcode_fw             = request_microcode_fw,
 	.collect_cpu_info                 = collect_cpu_info,
 	.apply_microcode                  = apply_microcode_intel,
-- 
cgit v1.2.3


From e84d9f5214cb854fcd584aa78b5634794604d306 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 25 Aug 2022 15:32:37 -0400
Subject: audit: audit_context pid unused, context enum comment fix

The pid member of struct audit_context is never used.  Remove it.

The audit_reset_context() comment about unconditionally resetting
"ctx->state" should read "ctx->context".

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.h   | 2 +-
 kernel/auditsc.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/audit.h b/kernel/audit.h
index 58b66543b4d5..d6eb7b59c791 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -133,7 +133,7 @@ struct audit_context {
 	struct sockaddr_storage *sockaddr;
 	size_t sockaddr_len;
 				/* Save things to print about task_struct */
-	pid_t		    pid, ppid;
+	pid_t		    ppid;
 	kuid_t		    uid, euid, suid, fsuid;
 	kgid_t		    gid, egid, sgid, fsgid;
 	unsigned long	    personality;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9226746dcf0a..21e50e6d0fc0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -965,7 +965,7 @@ static void audit_reset_context(struct audit_context *ctx)
 	if (!ctx)
 		return;
 
-	/* if ctx is non-null, reset the "ctx->state" regardless */
+	/* if ctx is non-null, reset the "ctx->context" regardless */
 	ctx->context = AUDIT_CTX_UNUSED;
 	if (ctx->dummy)
 		return;
@@ -1002,7 +1002,7 @@ static void audit_reset_context(struct audit_context *ctx)
 	kfree(ctx->sockaddr);
 	ctx->sockaddr = NULL;
 	ctx->sockaddr_len = 0;
-	ctx->pid = ctx->ppid = 0;
+	ctx->ppid = 0;
 	ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
 	ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
 	ctx->personality = 0;
-- 
cgit v1.2.3


From 3ed66951f952ed8f1a5d03e171722bf2631e8d58 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 25 Aug 2022 15:32:38 -0400
Subject: audit: explicitly check audit_context->context enum value

Be explicit in checking the struct audit_context "context" member enum
value rather than assuming the order of context enum values.

Fixes: 12c5e81d3fd0 ("audit: prepare audit_context for use in calling contexts beyond syscalls")
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21e50e6d0fc0..d77c9805c6b1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2068,7 +2068,7 @@ void __audit_syscall_exit(int success, long return_code)
 	/* run through both filters to ensure we set the filterkey properly */
 	audit_filter_syscall(current, context);
 	audit_filter_inodes(current, context);
-	if (context->current_state < AUDIT_STATE_RECORD)
+	if (context->current_state != AUDIT_STATE_RECORD)
 		goto out;
 
 	audit_return_fixup(context, success, return_code);
-- 
cgit v1.2.3


From c3f3ea8af44d0c5fba79fe8b198087342d0c7e04 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 25 Aug 2022 15:32:39 -0400
Subject: audit: free audit_proctitle only on task exit

Since audit_proctitle is generated at syscall exit time, its value is
used immediately and cached for the next syscall.  Since this is the
case, then only clear it at task exit time.  Otherwise, there is no
point in caching the value OR bearing the overhead of regenerating it.

Fixes: 12c5e81d3fd0 ("audit: prepare audit_context for use in calling contexts beyond syscalls")
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d77c9805c6b1..280b4720c7a0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1016,7 +1016,6 @@ static void audit_reset_context(struct audit_context *ctx)
 	WARN_ON(!list_empty(&ctx->killed_trees));
 	audit_free_module(ctx);
 	ctx->fds[0] = -1;
-	audit_proctitle_free(ctx);
 	ctx->type = 0; /* reset last for audit_free_*() */
 }
 
@@ -1077,6 +1076,7 @@ static inline void audit_free_context(struct audit_context *context)
 {
 	/* resetting is extra work, but it is likely just noise */
 	audit_reset_context(context);
+	audit_proctitle_free(context);
 	free_tree_refs(context);
 	kfree(context->filterkey);
 	kfree(context);
-- 
cgit v1.2.3


From f9781bb18ed828e7b83b7bac4a4ad7cd497ee7d7 Mon Sep 17 00:00:00 2001
From: Jane Chu <jane.chu@oracle.com>
Date: Fri, 26 Aug 2022 17:38:51 -0600
Subject: x86/mce: Retrieve poison range from hardware

When memory poison consumption machine checks fire, MCE notifier
handlers like nfit_handle_mce() record the impacted physical address
range which is reported by the hardware in the MCi_MISC MSR. The error
information includes data about blast radius, i.e. how many cachelines
did the hardware determine are impacted. A recent change

  7917f9cdb503 ("acpi/nfit: rely on mce->misc to determine poison granularity")

updated nfit_handle_mce() to stop hard coding the blast radius value of
1 cacheline, and instead rely on the blast radius reported in 'struct
mce' which can be up to 4K (64 cachelines).

It turns out that apei_mce_report_mem_error() had a similar problem in
that it hard coded a blast radius of 4K rather than reading the blast
radius from the error information. Fix apei_mce_report_mem_error() to
convey the proper poison granularity.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/7ed50fd8-521e-cade-77b1-738b8bfb8502@oracle.com
Link: https://lore.kernel.org/r/20220826233851.1319100-1-jane.chu@oracle.com
---
 arch/x86/kernel/cpu/mce/apei.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 717192915f28..8ed341714686 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -29,15 +29,26 @@
 void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
+	int lsb;
 
 	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
+	/*
+	 * Even if the ->validation_bits are set for address mask,
+	 * to be extra safe, check and reject an error radius '0',
+	 * and fall back to the default page size.
+	 */
+	if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
+		lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT);
+	else
+		lsb = PAGE_SHIFT;
+
 	mce_setup(&m);
 	m.bank = -1;
 	/* Fake a memory read error with unknown channel */
 	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
-	m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
+	m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
 
 	if (severity >= GHES_SEV_RECOVERABLE)
 		m.status |= MCI_STATUS_UC;
-- 
cgit v1.2.3


From 26dbd66eab8080be51759e48280da04015221e22 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 25 Aug 2022 17:16:51 +0200
Subject: esp: choose the correct inner protocol for GSO on inter address
 family tunnels

Commit 23c7f8d7989e ("net: Fix esp GSO on inter address family
tunnels.") is incomplete. It passes to skb_eth_gso_segment the
protocol for the outer IP version, instead of the inner IP version, so
we end up calling inet_gso_segment on an inner IPv6 packet and
ipv6_gso_segment on an inner IPv4 packet and the packets are dropped.

This patch completes the fix by selecting the correct protocol based
on the inner mode's family.

Fixes: c35fe4106b92 ("xfrm: Add mode handlers for IPsec on layer 2")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4_offload.c | 5 ++++-
 net/ipv6/esp6_offload.c | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 935026f4c807..170152772d33 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -110,7 +110,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
 						struct sk_buff *skb,
 						netdev_features_t features)
 {
-	return skb_eth_gso_segment(skb, features, htons(ETH_P_IP));
+	__be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6)
+						       : htons(ETH_P_IP);
+
+	return skb_eth_gso_segment(skb, features, type);
 }
 
 static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 3a293838a91d..79d43548279c 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -145,7 +145,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x,
 						struct sk_buff *skb,
 						netdev_features_t features)
 {
-	return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6));
+	__be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP)
+						      : htons(ETH_P_IPV6);
+
+	return skb_eth_gso_segment(skb, features, type);
 }
 
 static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
-- 
cgit v1.2.3


From 5182a5d48c3d1992b2db8748f96914e07eee0956 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Fri, 26 Aug 2022 14:46:58 +0300
Subject: net: allow storing xfrm interface metadata in metadata_dst

XFRM interfaces provide the association of various XFRM transformations
to a netdevice using an 'if_id' identifier common to both the XFRM data
structures (polcies, states) and the interface. The if_id is configured by
the controlling entity (usually the IKE daemon) and can be used by the
administrator to define logical relations between different connections.

For example, different connections can share the if_id identifier so
that they pass through the same interface, . However, currently it is
not possible for connections using a different if_id to use the same
interface while retaining the logical separation between them, without
using additional criteria such as skb marks or different traffic
selectors.

When having a large number of connections, it is useful to have a the
logical separation offered by the if_id identifier but use a single
network interface. Similar to the way collect_md mode is used in IP
tunnels.

This patch attempts to enable different configuration mechanisms - such
as ebpf programs, LWT encapsulations, and TC - to attach metadata
to skbs which would carry the if_id. This way a single xfrm interface in
collect_md mode can demux traffic based on this configuration on tx and
provide this metadata on rx.

The XFRM metadata is somewhat similar to ip tunnel metadata in that it
has an "id", and shares similar configuration entities (bpf, tc, ...),
however, it does not necessarily represent an IP tunnel or use other
ip tunnel information, and also has an optional "link" property which
can be used for affecting underlying routing decisions.

Additional xfrm related criteria may also be added in the future.

Therefore, a new metadata type is introduced, to be used in subsequent
patches in the xfrm interface and configuration entities.

Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/dst_metadata.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index adab27ba1ecb..e4b059908cc7 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -9,6 +9,7 @@
 enum metadata_type {
 	METADATA_IP_TUNNEL,
 	METADATA_HW_PORT_MUX,
+	METADATA_XFRM,
 };
 
 struct hw_port_info {
@@ -16,12 +17,18 @@ struct hw_port_info {
 	u32 port_id;
 };
 
+struct xfrm_md_info {
+	u32 if_id;
+	int link;
+};
+
 struct metadata_dst {
 	struct dst_entry		dst;
 	enum metadata_type		type;
 	union {
 		struct ip_tunnel_info	tun_info;
 		struct hw_port_info	port_info;
+		struct xfrm_md_info	xfrm_info;
 	} u;
 };
 
@@ -53,6 +60,16 @@ skb_tunnel_info(const struct sk_buff *skb)
 	return NULL;
 }
 
+static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb)
+{
+	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+
+	if (md_dst && md_dst->type == METADATA_XFRM)
+		return &md_dst->u.xfrm_info;
+
+	return NULL;
+}
+
 static inline bool skb_valid_dst(const struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -82,6 +99,9 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
 		return memcmp(&a->u.tun_info, &b->u.tun_info,
 			      sizeof(a->u.tun_info) +
 					 a->u.tun_info.options_len);
+	case METADATA_XFRM:
+		return memcmp(&a->u.xfrm_info, &b->u.xfrm_info,
+			      sizeof(a->u.xfrm_info));
 	default:
 		return 1;
 	}
-- 
cgit v1.2.3


From abc340b38ba25cd6c7aa2c0bd9150d30738c82d0 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Fri, 26 Aug 2022 14:46:59 +0300
Subject: xfrm: interface: support collect metadata mode

This commit adds support for 'collect_md' mode on xfrm interfaces.

Each net can have one collect_md device, created by providing the
IFLA_XFRM_COLLECT_METADATA flag at creation. This device cannot be
altered and has no if_id or link device attributes.

On transmit to this device, the if_id is fetched from the attached dst
metadata on the skb. If exists, the link property is also fetched from
the metadata. The dst metadata type used is METADATA_XFRM which holds
these properties.

On the receive side, xfrmi_rcv_cb() populates a dst metadata for each
packet received and attaches it to the skb. The if_id used in this case is
fetched from the xfrm state, and the link is fetched from the incoming
device. This information can later be used by upper layers such as tc,
ebpf, and ip rules.

Because the skb is scrubed in xfrmi_rcv_cb(), the attachment of the dst
metadata is postponed until after scrubing. Similarly, xfrm_input() is
adapted to avoid dropping metadata dsts by only dropping 'valid'
(skb_valid_dst(skb) == true) dsts.

Policy matching on packets arriving from collect_md xfrmi devices is
done by using the xfrm state existing in the skb's sec_path.
The xfrm_if_cb.decode_cb() interface implemented by xfrmi_decode_session()
is changed to keep the details of the if_id extraction tucked away
in xfrm_interface.c.

Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h           |  11 +++-
 include/uapi/linux/if_link.h |   1 +
 net/xfrm/xfrm_input.c        |   7 ++-
 net/xfrm/xfrm_interface.c    | 121 ++++++++++++++++++++++++++++++++++++-------
 net/xfrm/xfrm_policy.c       |  10 ++--
 5 files changed, 121 insertions(+), 29 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 6e8fa98f786f..28b988577ed2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -312,9 +312,15 @@ struct km_event {
 	struct net *net;
 };
 
+struct xfrm_if_decode_session_result {
+	struct net *net;
+	u32 if_id;
+};
+
 struct xfrm_if_cb {
-	struct xfrm_if	*(*decode_session)(struct sk_buff *skb,
-					   unsigned short family);
+	bool (*decode_session)(struct sk_buff *skb,
+			       unsigned short family,
+			       struct xfrm_if_decode_session_result *res);
 };
 
 void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb);
@@ -985,6 +991,7 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
 struct xfrm_if_parms {
 	int link;		/* ifindex of underlying L2 interface */
 	u32 if_id;		/* interface identifyer */
+	bool collect_md;
 };
 
 struct xfrm_if {
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index e36d9d2c65a7..d96f13a42589 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -694,6 +694,7 @@ enum {
 	IFLA_XFRM_UNSPEC,
 	IFLA_XFRM_LINK,
 	IFLA_XFRM_IF_ID,
+	IFLA_XFRM_COLLECT_METADATA,
 	__IFLA_XFRM_MAX
 };
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 144238a50f3d..25e822fb5771 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -20,6 +20,7 @@
 #include <net/xfrm.h>
 #include <net/ip_tunnels.h>
 #include <net/ip6_tunnel.h>
+#include <net/dst_metadata.h>
 
 #include "xfrm_inout.h"
 
@@ -720,7 +721,8 @@ resume:
 		sp = skb_sec_path(skb);
 		if (sp)
 			sp->olen = 0;
-		skb_dst_drop(skb);
+		if (skb_valid_dst(skb))
+			skb_dst_drop(skb);
 		gro_cells_receive(&gro_cells, skb);
 		return 0;
 	} else {
@@ -738,7 +740,8 @@ resume:
 			sp = skb_sec_path(skb);
 			if (sp)
 				sp->olen = 0;
-			skb_dst_drop(skb);
+			if (skb_valid_dst(skb))
+				skb_dst_drop(skb);
 			gro_cells_receive(&gro_cells, skb);
 			return err;
 		}
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index 5113fa0fbcee..e9a355047468 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -41,6 +41,7 @@
 #include <net/addrconf.h>
 #include <net/xfrm.h>
 #include <net/net_namespace.h>
+#include <net/dst_metadata.h>
 #include <net/netns/generic.h>
 #include <linux/etherdevice.h>
 
@@ -56,6 +57,7 @@ static const struct net_device_ops xfrmi_netdev_ops;
 struct xfrmi_net {
 	/* lists for storing interfaces in use */
 	struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE];
+	struct xfrm_if __rcu *collect_md_xfrmi;
 };
 
 #define for_each_xfrmi_rcu(start, xi) \
@@ -77,17 +79,23 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
 			return xi;
 	}
 
+	xi = rcu_dereference(xfrmn->collect_md_xfrmi);
+	if (xi && (xi->dev->flags & IFF_UP))
+		return xi;
+
 	return NULL;
 }
 
-static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb,
-					    unsigned short family)
+static bool xfrmi_decode_session(struct sk_buff *skb,
+				 unsigned short family,
+				 struct xfrm_if_decode_session_result *res)
 {
 	struct net_device *dev;
+	struct xfrm_if *xi;
 	int ifindex = 0;
 
 	if (!secpath_exists(skb) || !skb->dev)
-		return NULL;
+		return false;
 
 	switch (family) {
 	case AF_INET6:
@@ -107,11 +115,18 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb,
 	}
 
 	if (!dev || !(dev->flags & IFF_UP))
-		return NULL;
+		return false;
 	if (dev->netdev_ops != &xfrmi_netdev_ops)
-		return NULL;
+		return false;
 
-	return netdev_priv(dev);
+	xi = netdev_priv(dev);
+	res->net = xi->net;
+
+	if (xi->p.collect_md)
+		res->if_id = xfrm_input_state(skb)->if_id;
+	else
+		res->if_id = xi->p.if_id;
+	return true;
 }
 
 static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
@@ -157,7 +172,10 @@ static int xfrmi_create(struct net_device *dev)
 	if (err < 0)
 		goto out;
 
-	xfrmi_link(xfrmn, xi);
+	if (xi->p.collect_md)
+		rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi);
+	else
+		xfrmi_link(xfrmn, xi);
 
 	return 0;
 
@@ -185,7 +203,10 @@ static void xfrmi_dev_uninit(struct net_device *dev)
 	struct xfrm_if *xi = netdev_priv(dev);
 	struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
 
-	xfrmi_unlink(xfrmn, xi);
+	if (xi->p.collect_md)
+		RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL);
+	else
+		xfrmi_unlink(xfrmn, xi);
 }
 
 static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
@@ -214,6 +235,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
 	struct xfrm_state *x;
 	struct xfrm_if *xi;
 	bool xnet;
+	int link;
 
 	if (err && !secpath_exists(skb))
 		return 0;
@@ -224,6 +246,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
 	if (!xi)
 		return 1;
 
+	link = skb->dev->ifindex;
 	dev = xi->dev;
 	skb->dev = dev;
 
@@ -254,6 +277,17 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
 	}
 
 	xfrmi_scrub_packet(skb, xnet);
+	if (xi->p.collect_md) {
+		struct metadata_dst *md_dst;
+
+		md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC);
+		if (!md_dst)
+			return -ENOMEM;
+
+		md_dst->u.xfrm_info.if_id = x->if_id;
+		md_dst->u.xfrm_info.link = link;
+		skb_dst_set(skb, (struct dst_entry *)md_dst);
+	}
 	dev_sw_netstats_rx_add(dev, skb->len);
 
 	return 0;
@@ -269,10 +303,23 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 	struct net_device *tdev;
 	struct xfrm_state *x;
 	int err = -1;
+	u32 if_id;
 	int mtu;
 
+	if (xi->p.collect_md) {
+		struct xfrm_md_info *md_info = skb_xfrm_md_info(skb);
+
+		if (unlikely(!md_info))
+			return -EINVAL;
+
+		if_id = md_info->if_id;
+		fl->flowi_oif = md_info->link;
+	} else {
+		if_id = xi->p.if_id;
+	}
+
 	dst_hold(dst);
-	dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id);
+	dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id);
 	if (IS_ERR(dst)) {
 		err = PTR_ERR(dst);
 		dst = NULL;
@@ -283,7 +330,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
 	if (!x)
 		goto tx_err_link_failure;
 
-	if (x->if_id != xi->p.if_id)
+	if (x->if_id != if_id)
 		goto tx_err_link_failure;
 
 	tdev = dst->dev;
@@ -633,6 +680,9 @@ static void xfrmi_netlink_parms(struct nlattr *data[],
 
 	if (data[IFLA_XFRM_IF_ID])
 		parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]);
+
+	if (data[IFLA_XFRM_COLLECT_METADATA])
+		parms->collect_md = true;
 }
 
 static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
@@ -645,14 +695,27 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
 	int err;
 
 	xfrmi_netlink_parms(data, &p);
-	if (!p.if_id) {
-		NL_SET_ERR_MSG(extack, "if_id must be non zero");
-		return -EINVAL;
-	}
+	if (p.collect_md) {
+		struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
 
-	xi = xfrmi_locate(net, &p);
-	if (xi)
-		return -EEXIST;
+		if (p.link || p.if_id) {
+			NL_SET_ERR_MSG(extack, "link and if_id must be zero");
+			return -EINVAL;
+		}
+
+		if (rtnl_dereference(xfrmn->collect_md_xfrmi))
+			return -EEXIST;
+
+	} else {
+		if (!p.if_id) {
+			NL_SET_ERR_MSG(extack, "if_id must be non zero");
+			return -EINVAL;
+		}
+
+		xi = xfrmi_locate(net, &p);
+		if (xi)
+			return -EEXIST;
+	}
 
 	xi = netdev_priv(dev);
 	xi->p = p;
@@ -682,12 +745,22 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
 		return -EINVAL;
 	}
 
+	if (p.collect_md) {
+		NL_SET_ERR_MSG(extack, "collect_md can't be changed");
+		return -EINVAL;
+	}
+
 	xi = xfrmi_locate(net, &p);
 	if (!xi) {
 		xi = netdev_priv(dev);
 	} else {
 		if (xi->dev != dev)
 			return -EEXIST;
+		if (xi->p.collect_md) {
+			NL_SET_ERR_MSG(extack,
+				       "device can't be changed to collect_md");
+			return -EINVAL;
+		}
 	}
 
 	return xfrmi_update(xi, &p);
@@ -700,6 +773,8 @@ static size_t xfrmi_get_size(const struct net_device *dev)
 		nla_total_size(4) +
 		/* IFLA_XFRM_IF_ID */
 		nla_total_size(4) +
+		/* IFLA_XFRM_COLLECT_METADATA */
+		nla_total_size(0) +
 		0;
 }
 
@@ -709,7 +784,8 @@ static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	struct xfrm_if_parms *parm = &xi->p;
 
 	if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) ||
-	    nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id))
+	    nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) ||
+	    (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA)))
 		goto nla_put_failure;
 	return 0;
 
@@ -725,8 +801,10 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev)
 }
 
 static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
-	[IFLA_XFRM_LINK]	= { .type = NLA_U32 },
-	[IFLA_XFRM_IF_ID]	= { .type = NLA_U32 },
+	[IFLA_XFRM_UNSPEC]		= { .strict_start_type = IFLA_XFRM_COLLECT_METADATA },
+	[IFLA_XFRM_LINK]		= { .type = NLA_U32 },
+	[IFLA_XFRM_IF_ID]		= { .type = NLA_U32 },
+	[IFLA_XFRM_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
@@ -762,6 +840,9 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list)
 			     xip = &xi->next)
 				unregister_netdevice_queue(xi->dev, &list);
 		}
+		xi = rtnl_dereference(xfrmn->collect_md_xfrmi);
+		if (xi)
+			unregister_netdevice_queue(xi->dev, &list);
 	}
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 6264680b1f08..3c65059a508a 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -3515,17 +3515,17 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 	int xerr_idx = -1;
 	const struct xfrm_if_cb *ifcb;
 	struct sec_path *sp;
-	struct xfrm_if *xi;
 	u32 if_id = 0;
 
 	rcu_read_lock();
 	ifcb = xfrm_if_get_cb();
 
 	if (ifcb) {
-		xi = ifcb->decode_session(skb, family);
-		if (xi) {
-			if_id = xi->p.if_id;
-			net = xi->net;
+		struct xfrm_if_decode_session_result r;
+
+		if (ifcb->decode_session(skb, family, &r)) {
+			if_id = r.if_id;
+			net = r.net;
 		}
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 2c2493b9da9166478fe072e3054f8a5741dadb02 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Fri, 26 Aug 2022 14:47:00 +0300
Subject: xfrm: lwtunnel: add lwtunnel support for xfrm interfaces in
 collect_md mode

Allow specifying the xfrm interface if_id and link as part of a route
metadata using the lwtunnel infrastructure.

This allows for example using a single xfrm interface in collect_md
mode as the target of multiple routes each specifying a different if_id.

With the appropriate changes to iproute2, considering an xfrm device
ipsec1 in collect_md mode one can for example add a route specifying
an if_id like so:

ip route add <SUBNET> dev ipsec1 encap xfrm if_id 1

In which case traffic routed to the device via this route would use
if_id in the xfrm interface policy lookup.

Or in the context of vrf, one can also specify the "link" property:

ip route add <SUBNET> dev ipsec1 encap xfrm if_id 1 link_dev eth15

Note: LWT_XFRM_LINK uses NLA_U32 similar to IFLA_XFRM_LINK even though
internally "link" is signed. This is consistent with other _LINK
attributes in other devices as well as in bpf and should not have an
effect as device indexes can't be negative.

Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/dst_metadata.h    | 11 ++++++
 include/uapi/linux/lwtunnel.h | 10 +++++
 net/core/lwtunnel.c           |  1 +
 net/xfrm/xfrm_interface.c     | 85 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 107 insertions(+)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index e4b059908cc7..57f75960fa28 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -60,13 +60,24 @@ skb_tunnel_info(const struct sk_buff *skb)
 	return NULL;
 }
 
+static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt)
+{
+	return (struct xfrm_md_info *)lwt->data;
+}
+
 static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb)
 {
 	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+	struct dst_entry *dst;
 
 	if (md_dst && md_dst->type == METADATA_XFRM)
 		return &md_dst->u.xfrm_info;
 
+	dst = skb_dst(skb);
+	if (dst && dst->lwtstate &&
+	    dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM)
+		return lwt_xfrm_info(dst->lwtstate);
+
 	return NULL;
 }
 
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 2e206919125c..229655ef792f 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -15,6 +15,7 @@ enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_SEG6_LOCAL,
 	LWTUNNEL_ENCAP_RPL,
 	LWTUNNEL_ENCAP_IOAM6,
+	LWTUNNEL_ENCAP_XFRM,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
@@ -111,4 +112,13 @@ enum {
 
 #define LWT_BPF_MAX_HEADROOM 256
 
+enum {
+	LWT_XFRM_UNSPEC,
+	LWT_XFRM_IF_ID,
+	LWT_XFRM_LINK,
+	__LWT_XFRM_MAX,
+};
+
+#define LWT_XFRM_MAX (__LWT_XFRM_MAX - 1)
+
 #endif /* _UAPI_LWTUNNEL_H_ */
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 9ccd64e8a666..6fac2f0ef074 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -50,6 +50,7 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
 		return "IOAM6";
 	case LWTUNNEL_ENCAP_IP6:
 	case LWTUNNEL_ENCAP_IP:
+	case LWTUNNEL_ENCAP_XFRM:
 	case LWTUNNEL_ENCAP_NONE:
 	case __LWTUNNEL_ENCAP_MAX:
 		/* should not have got here */
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
index e9a355047468..5a67b120c4db 100644
--- a/net/xfrm/xfrm_interface.c
+++ b/net/xfrm/xfrm_interface.c
@@ -60,6 +60,88 @@ struct xfrmi_net {
 	struct xfrm_if __rcu *collect_md_xfrmi;
 };
 
+static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = {
+	[LWT_XFRM_IF_ID]	= NLA_POLICY_MIN(NLA_U32, 1),
+	[LWT_XFRM_LINK]		= NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+static void xfrmi_destroy_state(struct lwtunnel_state *lwt)
+{
+}
+
+static int xfrmi_build_state(struct net *net, struct nlattr *nla,
+			     unsigned int family, const void *cfg,
+			     struct lwtunnel_state **ts,
+			     struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWT_XFRM_MAX + 1];
+	struct lwtunnel_state *new_state;
+	struct xfrm_md_info *info;
+	int ret;
+
+	ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[LWT_XFRM_IF_ID]) {
+		NL_SET_ERR_MSG(extack, "if_id must be set");
+		return -EINVAL;
+	}
+
+	new_state = lwtunnel_state_alloc(sizeof(*info));
+	if (!new_state) {
+		NL_SET_ERR_MSG(extack, "failed to create encap info");
+		return -ENOMEM;
+	}
+
+	new_state->type = LWTUNNEL_ENCAP_XFRM;
+
+	info = lwt_xfrm_info(new_state);
+
+	info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]);
+
+	if (tb[LWT_XFRM_LINK])
+		info->link = nla_get_u32(tb[LWT_XFRM_LINK]);
+
+	*ts = new_state;
+	return 0;
+}
+
+static int xfrmi_fill_encap_info(struct sk_buff *skb,
+				 struct lwtunnel_state *lwt)
+{
+	struct xfrm_md_info *info = lwt_xfrm_info(lwt);
+
+	if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) ||
+	    (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link)))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */
+		nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */
+}
+
+static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct xfrm_md_info *a_info = lwt_xfrm_info(a);
+	struct xfrm_md_info *b_info = lwt_xfrm_info(b);
+
+	return memcmp(a_info, b_info, sizeof(*a_info));
+}
+
+static const struct lwtunnel_encap_ops xfrmi_encap_ops = {
+	.build_state	= xfrmi_build_state,
+	.destroy_state	= xfrmi_destroy_state,
+	.fill_encap	= xfrmi_fill_encap_info,
+	.get_encap_size = xfrmi_encap_nlsize,
+	.cmp_encap	= xfrmi_encap_cmp,
+	.owner		= THIS_MODULE,
+};
+
 #define for_each_xfrmi_rcu(start, xi) \
 	for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))
 
@@ -1080,6 +1162,8 @@ static int __init xfrmi_init(void)
 	if (err < 0)
 		goto rtnl_link_failed;
 
+	lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
+
 	xfrm_if_register_cb(&xfrm_if_cb);
 
 	return err;
@@ -1098,6 +1182,7 @@ pernet_dev_failed:
 static void __exit xfrmi_fini(void)
 {
 	xfrm_if_unregister_cb();
+	lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM);
 	rtnl_link_unregister(&xfrmi_link_ops);
 	xfrmi4_fini();
 	xfrmi6_fini();
-- 
cgit v1.2.3


From bc12b70f7d216b36bd87701349374a13e486f8eb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 25 Aug 2022 13:36:40 +0200
Subject: x86/earlyprintk: Clean up pciserial

While working on a GRUB patch to support PCI-serial, a number of
cleanups were suggested that apply to the code I took inspiration from.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>   # pci_ids.h
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lkml.kernel.org/r/YwdeyCEtW+wa+QhH@worktop.programming.kicks-ass.net
---
 arch/x86/kernel/early_printk.c | 14 +++++++-------
 include/linux/pci_ids.h        |  3 +++
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 68b38925a74f..44f937015e1e 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -264,11 +264,11 @@ static __init void early_pci_serial_init(char *s)
 	bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
 
 	/*
-	 * Verify it is a UART type device
+	 * Verify it is a 16550-UART type device
 	 */
 	if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
 	     (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
-	   (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ {
+	    (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) {
 		if (!force)
 			return;
 	}
@@ -276,22 +276,22 @@ static __init void early_pci_serial_init(char *s)
 	/*
 	 * Determine if it is IO or memory mapped
 	 */
-	if (bar0 & 0x01) {
+	if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
 		/* it is IO mapped */
 		serial_in = io_serial_in;
 		serial_out = io_serial_out;
-		early_serial_base = bar0&0xfffffffc;
+		early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK;
 		write_pci_config(bus, slot, func, PCI_COMMAND,
-						cmdreg|PCI_COMMAND_IO);
+				 cmdreg|PCI_COMMAND_IO);
 	} else {
 		/* It is memory mapped - assume 32-bit alignment */
 		serial_in = mem32_serial_in;
 		serial_out = mem32_serial_out;
 		/* WARNING! assuming the address is always in the first 4G */
 		early_serial_base =
-			(unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+			(unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
 		write_pci_config(bus, slot, func, PCI_COMMAND,
-						cmdreg|PCI_COMMAND_MEMORY);
+				 cmdreg|PCI_COMMAND_MEMORY);
 	}
 
 	/*
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 6feade66efdb..41b3fffdbb8e 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -75,6 +75,9 @@
 #define PCI_CLASS_COMMUNICATION_MODEM	0x0703
 #define PCI_CLASS_COMMUNICATION_OTHER	0x0780
 
+/* Interface for SERIAL/MODEM */
+#define PCI_SERIAL_16550_COMPATIBLE	0x02
+
 #define PCI_BASE_CLASS_SYSTEM		0x08
 #define PCI_CLASS_SYSTEM_PIC		0x0800
 #define PCI_CLASS_SYSTEM_PIC_IOAPIC	0x080010
-- 
cgit v1.2.3


From 501e4bb102ee2f380ab94abbf3d35dff9d4719a2 Mon Sep 17 00:00:00 2001
From: wuchi <wuchi.zero@gmail.com>
Date: Tue, 30 Aug 2022 00:26:09 +0800
Subject: audit: use time_after to compare time

Using time_{*} macro to compare time is better

Signed-off-by: wuchi <wuchi.zero@gmail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index a75978ae38ad..9bc0b0301198 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -321,7 +321,6 @@ static inline int audit_rate_check(void)
 	static DEFINE_SPINLOCK(lock);
 	unsigned long		flags;
 	unsigned long		now;
-	unsigned long		elapsed;
 	int			retval	   = 0;
 
 	if (!audit_rate_limit) return 1;
@@ -330,9 +329,8 @@ static inline int audit_rate_check(void)
 	if (++messages < audit_rate_limit) {
 		retval = 1;
 	} else {
-		now     = jiffies;
-		elapsed = now - last_check;
-		if (elapsed > HZ) {
+		now = jiffies;
+		if (time_after(now, last_check + HZ)) {
 			last_check = now;
 			messages   = 0;
 			retval     = 1;
@@ -366,7 +364,7 @@ void audit_log_lost(const char *message)
 	if (!print) {
 		spin_lock_irqsave(&lock, flags);
 		now = jiffies;
-		if (now - last_msg > HZ) {
+		if (time_after(now, last_msg + HZ)) {
 			print = 1;
 			last_msg = now;
 		}
-- 
cgit v1.2.3


From 3359d52755e5bf1268d96ce7e9d6e8c1b4700225 Mon Sep 17 00:00:00 2001
From: Zhang Jianhua <chris.zjh@huawei.com>
Date: Tue, 30 Aug 2022 16:07:32 +0800
Subject: cpufreq: bmips-cpufreq: Use module_init and add module_exit

- Use module_init instead of device_initcall.
- Add a function for module_exit to unregister driver.

Signed-off-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/bmips-cpufreq.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/bmips-cpufreq.c b/drivers/cpufreq/bmips-cpufreq.c
index f7c23fa468f0..39221a9a187a 100644
--- a/drivers/cpufreq/bmips-cpufreq.c
+++ b/drivers/cpufreq/bmips-cpufreq.c
@@ -156,7 +156,7 @@ static struct cpufreq_driver bmips_cpufreq_driver = {
 	.name		= BMIPS_CPUFREQ_PREFIX,
 };
 
-static int __init bmips_cpufreq_probe(void)
+static int __init bmips_cpufreq_driver_init(void)
 {
 	struct cpufreq_compat *cc;
 	struct device_node *np;
@@ -176,7 +176,13 @@ static int __init bmips_cpufreq_probe(void)
 
 	return cpufreq_register_driver(&bmips_cpufreq_driver);
 }
-device_initcall(bmips_cpufreq_probe);
+module_init(bmips_cpufreq_driver_init);
+
+static void __exit bmips_cpufreq_driver_exit(void)
+{
+	cpufreq_unregister_driver(&bmips_cpufreq_driver);
+}
+module_exit(bmips_cpufreq_driver_exit);
 
 MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>");
 MODULE_DESCRIPTION("CPUfreq driver for Broadcom BMIPS SoCs");
-- 
cgit v1.2.3


From d5008ef5b5a21177d3042816542f1afd5ae36152 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 29 Aug 2022 17:21:43 +0200
Subject: ACPI: PM: Fix NULL argument handling in acpi_device_get/set_power()

In principle, it should be valid to pass NULL as the ACPI device
pointer to acpi_device_get_power() and acpi_device_set_power() and they
both are expected to return -EINVAL in that case, but that has been
broken recently by commit 62fcb99bdf10 ("ACPI: Drop parent field from
struct acpi_device") which has caused the ACPI device pointer to be
dereferenced in these functions before the NULL check.

Fix that and while at it make acpi_device_set_power() only use the
parent field if the target ACPI device object's ignore_parent flag
in not set.

Fixes: 62fcb99bdf10 ("ACPI: Drop parent field from struct acpi_device")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/device_pm.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 028d8d14cd44..3aca67cf9ce5 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -74,13 +74,15 @@ static int acpi_dev_pm_explicit_get(struct acpi_device *device, int *state)
  */
 int acpi_device_get_power(struct acpi_device *device, int *state)
 {
-	struct acpi_device *parent = acpi_dev_parent(device);
 	int result = ACPI_STATE_UNKNOWN;
+	struct acpi_device *parent;
 	int error;
 
 	if (!device || !state)
 		return -EINVAL;
 
+	parent = acpi_dev_parent(device);
+
 	if (!device->flags.power_manageable) {
 		/* TBD: Non-recursive algorithm for walking up hierarchy. */
 		*state = parent ? parent->power.state : ACPI_STATE_D0;
@@ -159,7 +161,6 @@ static int acpi_dev_pm_explicit_set(struct acpi_device *adev, int state)
  */
 int acpi_device_set_power(struct acpi_device *device, int state)
 {
-	struct acpi_device *parent = acpi_dev_parent(device);
 	int target_state = state;
 	int result = 0;
 
@@ -192,13 +193,17 @@ int acpi_device_set_power(struct acpi_device *device, int state)
 		return -ENODEV;
 	}
 
-	if (!device->power.flags.ignore_parent && parent &&
-	    state < parent->power.state) {
-		acpi_handle_debug(device->handle,
-				  "Cannot transition to %s for parent in %s\n",
-				  acpi_power_state_string(state),
-				  acpi_power_state_string(parent->power.state));
-		return -ENODEV;
+	if (!device->power.flags.ignore_parent) {
+		struct acpi_device *parent;
+
+		parent = acpi_dev_parent(device);
+		if (parent && state < parent->power.state) {
+			acpi_handle_debug(device->handle,
+					  "Cannot transition to %s for parent in %s\n",
+					  acpi_power_state_string(state),
+					  acpi_power_state_string(parent->power.state));
+			return -ENODEV;
+		}
 	}
 
 	/*
-- 
cgit v1.2.3


From e3b9b27865c45c771f95b5dcf70ee8e88b343c75 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 29 Aug 2022 17:53:19 +0200
Subject: ACPI: Drop redundant acpi_dev_parent() header

Because acpi_dev_parent() is defined as static inline, the extra
header of it in acpi_bus.h is redundant, so drop it.

Fixes: 62fcb99bdf10 ("ACPI: Drop parent field from struct acpi_device")
Reported-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
---
 include/acpi/acpi_bus.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 6289020fdab8..42f76f2c2d49 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -485,7 +485,6 @@ void acpi_initialize_hp_context(struct acpi_device *adev,
 /* acpi_device.dev.bus == &acpi_bus_type */
 extern struct bus_type acpi_bus_type;
 
-struct acpi_device *acpi_dev_parent(struct acpi_device *adev);
 int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data);
 int acpi_dev_for_each_child(struct acpi_device *adev,
 			    int (*fn)(struct acpi_device *, void *), void *data);
-- 
cgit v1.2.3


From 72c976fc3a4b3ff264127f4ced7c06f35cede858 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 30 Aug 2022 20:15:38 +0200
Subject: thermal: gov_user_space: Do not lock thermal zone mutex

Commit 670a5e356cb6 ("thermal/core: Move the thermal zone lock out of
the governors") moved thermal zone locking away from governors, but it
forgot about the user space one which deadlocks now.

Fix this by removing the thermal zone locking from the user space
governor.

Fixes: 670a5e356cb6 ("thermal/core: Move the thermal zone lock out of the governors")
Tested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/gov_user_space.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/thermal/gov_user_space.c b/drivers/thermal/gov_user_space.c
index a62a4e90bd3f..8bc1c22aaf03 100644
--- a/drivers/thermal/gov_user_space.c
+++ b/drivers/thermal/gov_user_space.c
@@ -34,7 +34,8 @@ static int notify_user_space(struct thermal_zone_device *tz, int trip)
 	char *thermal_prop[5];
 	int i;
 
-	mutex_lock(&tz->lock);
+	lockdep_assert_held(&tz->lock);
+
 	thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s", tz->type);
 	thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d", tz->temperature);
 	thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP=%d", trip);
@@ -43,7 +44,7 @@ static int notify_user_space(struct thermal_zone_device *tz, int trip)
 	kobject_uevent_env(&tz->device.kobj, KOBJ_CHANGE, thermal_prop);
 	for (i = 0; i < 4; ++i)
 		kfree(thermal_prop[i]);
-	mutex_unlock(&tz->lock);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From abec3d015fdfb7c63105c7e1c956188bf381aa55 Mon Sep 17 00:00:00 2001
From: Ondrej Mosnacek <omosnace@redhat.com>
Date: Fri, 8 Jul 2022 11:34:51 +0200
Subject: userfaultfd: open userfaultfds with O_RDONLY

Since userfaultfd doesn't implement a write operation, it is more
appropriate to open it read-only.

When userfaultfds are opened read-write like it is now, and such fd is
passed from one process to another, SELinux will check both read and
write permissions for the target process, even though it can't actually
do any write operation on the fd later.

Inspired by the following bug report, which has hit the SELinux scenario
described above:
https://bugzilla.redhat.com/show_bug.cgi?id=1974559

Reported-by: Robert O'Callahan <roc@ocallahan.org>
Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
Signed-off-by: Ondrej Mosnacek <omosnace@redhat.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 fs/userfaultfd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1c44bf75f916..e6ffe7bc59e3 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -991,7 +991,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
 	int fd;
 
 	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
-			O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
+			O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
 	if (fd < 0)
 		return fd;
 
@@ -2090,7 +2090,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
 	mmgrab(ctx->mm);
 
 	fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
-			O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
+			O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
 	if (fd < 0) {
 		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
-- 
cgit v1.2.3


From 5698f08169b905ed215fa7f45e89b42283ed0554 Mon Sep 17 00:00:00 2001
From: ye xingchen <ye.xingchen@zte.com.cn>
Date: Tue, 30 Aug 2022 08:41:24 +0000
Subject: selinux: remove an unneeded variable in sel_make_class_dir_entries()

Return the value sel_make_perm_files() directly instead of storing it
in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/selinuxfs.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 8fcdd494af27..66610eb3d317 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1917,7 +1917,6 @@ static int sel_make_class_dir_entries(struct selinux_policy *newpolicy,
 	struct selinux_fs_info *fsi = sb->s_fs_info;
 	struct dentry *dentry = NULL;
 	struct inode *inode = NULL;
-	int rc;
 
 	dentry = d_alloc_name(dir, "index");
 	if (!dentry)
@@ -1937,9 +1936,7 @@ static int sel_make_class_dir_entries(struct selinux_policy *newpolicy,
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	rc = sel_make_perm_files(newpolicy, classname, index, dentry);
-
-	return rc;
+	return sel_make_perm_files(newpolicy, classname, index, dentry);
 }
 
 static int sel_make_classes(struct selinux_policy *newpolicy,
-- 
cgit v1.2.3


From c3fae2b2e690ab92ee1306cc22126240ab70e371 Mon Sep 17 00:00:00 2001
From: Christian Göttsche <cgzones@googlemail.com>
Date: Tue, 30 Aug 2022 17:45:00 +0200
Subject: selinux: use int arrays for boolean values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not cast pointers of signed integers to pointers of unsigned integers
and vice versa.

It should currently not be an issue since they hold SELinux boolean
values which should only contain either 0's or 1's, which should have
the same representation.

Reported by sparse:

  .../selinuxfs.c:1485:30: warning: incorrect type in assignment
                                    (different signedness)
  .../selinuxfs.c:1485:30:    expected unsigned int *
  .../selinuxfs.c:1485:30:    got int *[addressable] values
  .../selinuxfs.c:1402:48: warning: incorrect type in argument 3
                                    (different signedness)
  .../selinuxfs.c:1402:48:    expected int *values
  .../selinuxfs.c:1402:48:    got unsigned int *bool_pending_values

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
[PM: minor whitespace fixes, sparse output cleanup]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/selinuxfs.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 66610eb3d317..a00d19139436 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -71,7 +71,7 @@ struct selinux_fs_info {
 	struct dentry *bool_dir;
 	unsigned int bool_num;
 	char **bool_pending_names;
-	unsigned int *bool_pending_values;
+	int *bool_pending_values;
 	struct dentry *class_dir;
 	unsigned long last_class_ino;
 	bool policy_opened;
@@ -356,7 +356,7 @@ static const struct file_operations sel_policyvers_ops = {
 /* declaration for sel_write_load */
 static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir,
 			  unsigned int *bool_num, char ***bool_pending_names,
-			  unsigned int **bool_pending_values);
+			  int **bool_pending_values);
 static int sel_make_classes(struct selinux_policy *newpolicy,
 			    struct dentry *class_dir,
 			    unsigned long *last_class_ino);
@@ -527,7 +527,7 @@ static const struct file_operations sel_policy_ops = {
 };
 
 static void sel_remove_old_bool_data(unsigned int bool_num, char **bool_names,
-				unsigned int *bool_values)
+				     int *bool_values)
 {
 	u32 i;
 
@@ -545,7 +545,7 @@ static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
 	struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir, *old_dentry;
 	unsigned int tmp_bool_num, old_bool_num;
 	char **tmp_bool_names, **old_bool_names;
-	unsigned int *tmp_bool_values, *old_bool_values;
+	int *tmp_bool_values, *old_bool_values;
 	unsigned long tmp_ino = fsi->last_ino; /* Don't increment last_ino in this function */
 
 	tmp_parent = sel_make_disconnected_dir(fsi->sb, &tmp_ino);
@@ -1423,7 +1423,7 @@ static void sel_remove_entries(struct dentry *de)
 
 static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir,
 			  unsigned int *bool_num, char ***bool_pending_names,
-			  unsigned int **bool_pending_values)
+			  int **bool_pending_values)
 {
 	int ret;
 	ssize_t len;
-- 
cgit v1.2.3


From 6354324d8a3e31108b53ac8ac6e9b67c9485d75a Mon Sep 17 00:00:00 2001
From: Christian Göttsche <cgzones@googlemail.com>
Date: Tue, 30 Aug 2022 17:52:49 +0200
Subject: selinux: declare read-only parameters const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Declare ebitmap, mls_level and mls_context parameters const where they
are only read from.  This allows callers to supply pointers to const
as arguments and increases readability.

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/ss/context.h   | 17 +++++++++--------
 security/selinux/ss/ebitmap.c   | 21 +++++++++++----------
 security/selinux/ss/ebitmap.h   | 18 +++++++++---------
 security/selinux/ss/mls_types.h |  4 ++--
 4 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/security/selinux/ss/context.h b/security/selinux/ss/context.h
index 62990aa1ec9e..eda32c3d4c0a 100644
--- a/security/selinux/ss/context.h
+++ b/security/selinux/ss/context.h
@@ -38,7 +38,7 @@ static inline void mls_context_init(struct context *c)
 	memset(&c->range, 0, sizeof(c->range));
 }
 
-static inline int mls_context_cpy(struct context *dst, struct context *src)
+static inline int mls_context_cpy(struct context *dst, const struct context *src)
 {
 	int rc;
 
@@ -58,7 +58,7 @@ out:
 /*
  * Sets both levels in the MLS range of 'dst' to the low level of 'src'.
  */
-static inline int mls_context_cpy_low(struct context *dst, struct context *src)
+static inline int mls_context_cpy_low(struct context *dst, const struct context *src)
 {
 	int rc;
 
@@ -78,7 +78,7 @@ out:
 /*
  * Sets both levels in the MLS range of 'dst' to the high level of 'src'.
  */
-static inline int mls_context_cpy_high(struct context *dst, struct context *src)
+static inline int mls_context_cpy_high(struct context *dst, const struct context *src)
 {
 	int rc;
 
@@ -97,9 +97,10 @@ out:
 
 
 static inline int mls_context_glblub(struct context *dst,
-				     struct context *c1, struct context *c2)
+				     const struct context *c1, const struct context *c2)
 {
-	struct mls_range *dr = &dst->range, *r1 = &c1->range, *r2 = &c2->range;
+	struct mls_range *dr = &dst->range;
+	const struct mls_range *r1 = &c1->range, *r2 = &c2->range;
 	int rc = 0;
 
 	if (r1->level[1].sens < r2->level[0].sens ||
@@ -127,7 +128,7 @@ out:
 	return rc;
 }
 
-static inline int mls_context_cmp(struct context *c1, struct context *c2)
+static inline int mls_context_cmp(const struct context *c1, const struct context *c2)
 {
 	return ((c1->range.level[0].sens == c2->range.level[0].sens) &&
 		ebitmap_cmp(&c1->range.level[0].cat, &c2->range.level[0].cat) &&
@@ -147,7 +148,7 @@ static inline void context_init(struct context *c)
 	memset(c, 0, sizeof(*c));
 }
 
-static inline int context_cpy(struct context *dst, struct context *src)
+static inline int context_cpy(struct context *dst, const struct context *src)
 {
 	int rc;
 
@@ -180,7 +181,7 @@ static inline void context_destroy(struct context *c)
 	mls_context_destroy(c);
 }
 
-static inline int context_cmp(struct context *c1, struct context *c2)
+static inline int context_cmp(const struct context *c1, const struct context *c2)
 {
 	if (c1->len && c2->len)
 		return (c1->len == c2->len && !strcmp(c1->str, c2->str));
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c
index abde349c8321..d31b87be9a1e 100644
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -28,9 +28,9 @@
 
 static struct kmem_cache *ebitmap_node_cachep __ro_after_init;
 
-int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
+int ebitmap_cmp(const struct ebitmap *e1, const struct ebitmap *e2)
 {
-	struct ebitmap_node *n1, *n2;
+	const struct ebitmap_node *n1, *n2;
 
 	if (e1->highbit != e2->highbit)
 		return 0;
@@ -50,9 +50,10 @@ int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
 	return 1;
 }
 
-int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src)
+int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src)
 {
-	struct ebitmap_node *n, *new, *prev;
+	struct ebitmap_node *new, *prev;
+	const struct ebitmap_node *n;
 
 	ebitmap_init(dst);
 	n = src->node;
@@ -78,7 +79,7 @@ int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src)
 	return 0;
 }
 
-int ebitmap_and(struct ebitmap *dst, struct ebitmap *e1, struct ebitmap *e2)
+int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1, const struct ebitmap *e2)
 {
 	struct ebitmap_node *n;
 	int bit, rc;
@@ -217,9 +218,9 @@ netlbl_import_failure:
  * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed
  * last_e2bit.
  */
-int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit)
+int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2, u32 last_e2bit)
 {
-	struct ebitmap_node *n1, *n2;
+	const struct ebitmap_node *n1, *n2;
 	int i;
 
 	if (e1->highbit < e2->highbit)
@@ -258,9 +259,9 @@ int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit)
 	return 1;
 }
 
-int ebitmap_get_bit(struct ebitmap *e, unsigned long bit)
+int ebitmap_get_bit(const struct ebitmap *e, unsigned long bit)
 {
-	struct ebitmap_node *n;
+	const struct ebitmap_node *n;
 
 	if (e->highbit < bit)
 		return 0;
@@ -467,7 +468,7 @@ bad:
 	goto out;
 }
 
-int ebitmap_write(struct ebitmap *e, void *fp)
+int ebitmap_write(const struct ebitmap *e, void *fp)
 {
 	struct ebitmap_node *n;
 	u32 count;
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h
index 58eb822f11ee..e5b57dc3fc53 100644
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -44,7 +44,7 @@ struct ebitmap {
 
 #define ebitmap_length(e) ((e)->highbit)
 
-static inline unsigned int ebitmap_start_positive(struct ebitmap *e,
+static inline unsigned int ebitmap_start_positive(const struct ebitmap *e,
 						  struct ebitmap_node **n)
 {
 	unsigned int ofs;
@@ -62,7 +62,7 @@ static inline void ebitmap_init(struct ebitmap *e)
 	memset(e, 0, sizeof(*e));
 }
 
-static inline unsigned int ebitmap_next_positive(struct ebitmap *e,
+static inline unsigned int ebitmap_next_positive(const struct ebitmap *e,
 						 struct ebitmap_node **n,
 						 unsigned int bit)
 {
@@ -85,7 +85,7 @@ static inline unsigned int ebitmap_next_positive(struct ebitmap *e,
 #define EBITMAP_NODE_OFFSET(node, bit)	\
 	(((bit) - (node)->startbit) % EBITMAP_UNIT_SIZE)
 
-static inline int ebitmap_node_get_bit(struct ebitmap_node *n,
+static inline int ebitmap_node_get_bit(const struct ebitmap_node *n,
 				       unsigned int bit)
 {
 	unsigned int index = EBITMAP_NODE_INDEX(n, bit);
@@ -122,15 +122,15 @@ static inline void ebitmap_node_clr_bit(struct ebitmap_node *n,
 	     (bit) < ebitmap_length(e);			\
 	     (bit) = ebitmap_next_positive(e, &(n), bit))	\
 
-int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2);
-int ebitmap_cpy(struct ebitmap *dst, struct ebitmap *src);
-int ebitmap_and(struct ebitmap *dst, struct ebitmap *e1, struct ebitmap *e2);
-int ebitmap_contains(struct ebitmap *e1, struct ebitmap *e2, u32 last_e2bit);
-int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
+int ebitmap_cmp(const struct ebitmap *e1, const struct ebitmap *e2);
+int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src);
+int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1, const struct ebitmap *e2);
+int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2, u32 last_e2bit);
+int ebitmap_get_bit(const struct ebitmap *e, unsigned long bit);
 int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
 void ebitmap_destroy(struct ebitmap *e);
 int ebitmap_read(struct ebitmap *e, void *fp);
-int ebitmap_write(struct ebitmap *e, void *fp);
+int ebitmap_write(const struct ebitmap *e, void *fp);
 u32 ebitmap_hash(const struct ebitmap *e, u32 hash);
 
 #ifdef CONFIG_NETLABEL
diff --git a/security/selinux/ss/mls_types.h b/security/selinux/ss/mls_types.h
index 068e0d7809db..7d48d5e52233 100644
--- a/security/selinux/ss/mls_types.h
+++ b/security/selinux/ss/mls_types.h
@@ -27,13 +27,13 @@ struct mls_range {
 	struct mls_level level[2]; /* low == level[0], high == level[1] */
 };
 
-static inline int mls_level_eq(struct mls_level *l1, struct mls_level *l2)
+static inline int mls_level_eq(const struct mls_level *l1, const struct mls_level *l2)
 {
 	return ((l1->sens == l2->sens) &&
 		ebitmap_cmp(&l1->cat, &l2->cat));
 }
 
-static inline int mls_level_dom(struct mls_level *l1, struct mls_level *l2)
+static inline int mls_level_dom(const struct mls_level *l1, const struct mls_level *l2)
 {
 	return ((l1->sens >= l2->sens) &&
 		ebitmap_contains(&l1->cat, &l2->cat, 0));
-- 
cgit v1.2.3


From a26aa12384158116c0d80d50e0bdc7b3323551e2 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Aug 2022 14:38:40 +0200
Subject: ntfs3: rework xattr handlers and switch to POSIX ACL VFS helpers

The xattr code in ntfs3 is currently a bit confused. For example, it
defines a POSIX ACL i_op->set_acl() method but instead of relying on the
generic POSIX ACL VFS helpers it defines its own set of xattr helpers
with the consequence that i_op->set_acl() is currently dead code.

Switch ntfs3 to rely on the VFS POSIX ACL xattr handlers. Also remove
i_op->{g,s}et_acl() methods from symlink inode operations. Symlinks
don't support xattrs.

This is a preliminary change for the following patches which move
handling idmapped mounts directly in posix_acl_xattr_set().

This survives POSIX ACL xfstests.

Fixes: be71b5cba2e6 ("fs/ntfs3: Add attrib operations")
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>>
---
 fs/ntfs3/inode.c |   2 --
 fs/ntfs3/xattr.c | 102 ++++---------------------------------------------------
 2 files changed, 6 insertions(+), 98 deletions(-)

diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 51363d4e8636..26a76ebfe58f 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -1927,8 +1927,6 @@ const struct inode_operations ntfs_link_inode_operations = {
 	.setattr	= ntfs3_setattr,
 	.listxattr	= ntfs_listxattr,
 	.permission	= ntfs_permission,
-	.get_acl	= ntfs_get_acl,
-	.set_acl	= ntfs_set_acl,
 };
 
 const struct address_space_operations ntfs_aops = {
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 6ae1f56b7358..7de8718c68a9 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -625,67 +625,6 @@ int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 	return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
 }
 
-static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns,
-			      struct inode *inode, int type, void *buffer,
-			      size_t size)
-{
-	struct posix_acl *acl;
-	int err;
-
-	if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
-		ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
-		return -EOPNOTSUPP;
-	}
-
-	acl = ntfs_get_acl(inode, type, false);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-
-	if (!acl)
-		return -ENODATA;
-
-	err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
-	posix_acl_release(acl);
-
-	return err;
-}
-
-static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns,
-			      struct inode *inode, int type, const void *value,
-			      size_t size)
-{
-	struct posix_acl *acl;
-	int err;
-
-	if (!(inode->i_sb->s_flags & SB_POSIXACL)) {
-		ntfs_inode_warn(inode, "add mount option \"acl\" to use acl");
-		return -EOPNOTSUPP;
-	}
-
-	if (!inode_owner_or_capable(mnt_userns, inode))
-		return -EPERM;
-
-	if (!value) {
-		acl = NULL;
-	} else {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-
-		if (acl) {
-			err = posix_acl_valid(&init_user_ns, acl);
-			if (err)
-				goto release_and_out;
-		}
-	}
-
-	err = ntfs_set_acl(mnt_userns, inode, acl, type);
-
-release_and_out:
-	posix_acl_release(acl);
-	return err;
-}
-
 /*
  * ntfs_init_acl - Initialize the ACLs of a new inode.
  *
@@ -852,23 +791,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
 		goto out;
 	}
 
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-	if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
-	     !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
-		     sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
-	    (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
-	     !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
-		     sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
-		/* TODO: init_user_ns? */
-		err = ntfs_xattr_get_acl(
-			&init_user_ns, inode,
-			name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
-				? ACL_TYPE_ACCESS
-				: ACL_TYPE_DEFAULT,
-			buffer, size);
-		goto out;
-	}
-#endif
 	/* Deal with NTFS extended attribute. */
 	err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
 
@@ -981,22 +903,6 @@ set_new_fa:
 		goto out;
 	}
 
-#ifdef CONFIG_NTFS3_FS_POSIX_ACL
-	if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 &&
-	     !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
-		     sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) ||
-	    (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 &&
-	     !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
-		     sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) {
-		err = ntfs_xattr_set_acl(
-			mnt_userns, inode,
-			name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1
-				? ACL_TYPE_ACCESS
-				: ACL_TYPE_DEFAULT,
-			value, size);
-		goto out;
-	}
-#endif
 	/* Deal with NTFS extended attribute. */
 	err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
 
@@ -1086,7 +992,7 @@ static bool ntfs_xattr_user_list(struct dentry *dentry)
 }
 
 // clang-format off
-static const struct xattr_handler ntfs_xattr_handler = {
+static const struct xattr_handler ntfs_other_xattr_handler = {
 	.prefix	= "",
 	.get	= ntfs_getxattr,
 	.set	= ntfs_setxattr,
@@ -1094,7 +1000,11 @@ static const struct xattr_handler ntfs_xattr_handler = {
 };
 
 const struct xattr_handler *ntfs_xattr_handlers[] = {
-	&ntfs_xattr_handler,
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&ntfs_other_xattr_handler,
 	NULL,
 };
 // clang-format on
-- 
cgit v1.2.3


From 985a6d0b3c800265a2d5312a52c549bf09254e55 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Aug 2022 14:38:41 +0200
Subject: acl: return EOPNOTSUPP in posix_acl_fix_xattr_common()

Return EOPNOTSUPP when the POSIX ACL version doesn't match and zero if
there are no entries. This will allow us to reuse the helper in
posix_acl_from_xattr(). This change will have no user visible effects.

Fixes: 0c5fd887d2bb ("acl: move idmapped mount fixup into vfs_{g,s}etxattr()")
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>>
---
 fs/posix_acl.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 5af33800743e..abe387700ba9 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -710,9 +710,9 @@ EXPORT_SYMBOL(posix_acl_update_mode);
 /*
  * Fix up the uids and gids in posix acl extended attributes in place.
  */
-static int posix_acl_fix_xattr_common(void *value, size_t size)
+static int posix_acl_fix_xattr_common(const void *value, size_t size)
 {
-	struct posix_acl_xattr_header *header = value;
+	const struct posix_acl_xattr_header *header = value;
 	int count;
 
 	if (!header)
@@ -720,13 +720,13 @@ static int posix_acl_fix_xattr_common(void *value, size_t size)
 	if (size < sizeof(struct posix_acl_xattr_header))
 		return -EINVAL;
 	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-		return -EINVAL;
+		return -EOPNOTSUPP;
 
 	count = posix_acl_xattr_count(size);
 	if (count < 0)
 		return -EINVAL;
 	if (count == 0)
-		return -EINVAL;
+		return 0;
 
 	return count;
 }
@@ -748,7 +748,7 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
 		return;
 
 	count = posix_acl_fix_xattr_common(value, size);
-	if (count < 0)
+	if (count <= 0)
 		return;
 
 	for (end = entry + count; entry != end; entry++) {
@@ -788,7 +788,7 @@ void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
 		return;
 
 	count = posix_acl_fix_xattr_common(value, size);
-	if (count < 0)
+	if (count <= 0)
 		return;
 
 	for (end = entry + count; entry != end; entry++) {
@@ -822,7 +822,7 @@ static void posix_acl_fix_xattr_userns(
 	kgid_t gid;
 
 	count = posix_acl_fix_xattr_common(value, size);
-	if (count < 0)
+	if (count <= 0)
 		return;
 
 	for (end = entry + count; entry != end; entry++) {
@@ -870,16 +870,9 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
 	struct posix_acl *acl;
 	struct posix_acl_entry *acl_e;
 
-	if (!value)
-		return NULL;
-	if (size < sizeof(struct posix_acl_xattr_header))
-		 return ERR_PTR(-EINVAL);
-	if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	count = posix_acl_xattr_count(size);
+	count = posix_acl_fix_xattr_common(value, size);
 	if (count < 0)
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(count);
 	if (count == 0)
 		return NULL;
 	
-- 
cgit v1.2.3


From 6b70fe0601adb1396ad0b85cdf05d217500b49e7 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Aug 2022 14:38:42 +0200
Subject: acl: add vfs_set_acl_prepare()

Various filesystems store POSIX ACLs on the backing store in their uapi
format. Such filesystems need to translate from the uapi POSIX ACL
format into the VFS format during i_op->get_acl(). The VFS provides the
posix_acl_from_xattr() helper for this task.

But the usage of posix_acl_from_xattr() is currently ambiguous. It is
intended to transform from a uapi POSIX ACL  to the VFS represenation.
For example, when retrieving POSIX ACLs for permission checking during
lookup or when calling getxattr() to retrieve system.posix_acl_{access,default}.

Calling posix_acl_from_xattr() during i_op->get_acl() will map the raw
{g,u}id values stored as ACL_{GROUP,USER} entries in the uapi POSIX ACL
format into k{g,u}id_t in the filesystem's idmapping and return a struct
posix_acl ready to be returned to the VFS for caching and to perform
permission checks on.

However, posix_acl_from_xattr() is also called during setxattr() for all
filesystems that rely on VFS provides posix_acl_{access,default}_xattr_handler.
The posix_acl_xattr_set() handler which is used for the ->set() method
of posix_acl_{access,default}_xattr_handler uses posix_acl_from_xattr()
to translate from the uapi POSIX ACL format to the VFS format so that it
can be passed to the i_op->set_acl() handler of the filesystem or for
direct caching in case no i_op->set_acl() handler is defined.

During setxattr() the {g,u}id values stored as ACL_{GROUP,USER} entries
in the uapi POSIX ACL format aren't raw {g,u}id values that need to be
mapped according to the filesystem's idmapping. Instead they are {g,u}id
values in the caller's idmapping which have been generated during
posix_acl_fix_xattr_from_user(). In other words, they are k{g,u}id_t
which are passed as raw {g,u}id values abusing the uapi POSIX ACL format
(Please note that this type safety violation has existed since the
introduction of k{g,u}id_t. Please see [1] for more details.).

So when posix_acl_from_xattr() is called in posix_acl_xattr_set() the
filesystem idmapping is completely irrelevant. Instead, we abuse the
initial idmapping to recover the k{g,u}id_t base on the value stored in
raw {g,u}id as ACL_{GROUP,USER} in the uapi POSIX ACL format.

We need to clearly distinguish betweeen these two operations as it is
really easy to confuse for filesystems as can be seen in ntfs3.

In order to do this we factor out make_posix_acl() which takes callbacks
allowing callers to pass dedicated methods to generate the correct
k{g,u}id_t. This is just an internal static helper which is not exposed
to any filesystems but it neatly encapsulates the basic logic of walking
through a uapi POSIX ACL and returning an allocated VFS POSIX ACL with
the correct k{g,u}id_t values.

The posix_acl_from_xattr() helper can then be implemented as a simple
call to make_posix_acl() with callbacks that generate the correct
k{g,u}id_t from the raw {g,u}id values in ACL_{GROUP,USER} entries in
the uapi POSIX ACL format as read from the backing store.

For setxattr() we add a new helper vfs_set_acl_prepare() which has
callbacks to map the POSIX ACLs from the uapi format with the k{g,u}id_t
values stored in raw {g,u}id format in ACL_{GROUP,USER} entries into the
correct k{g,u}id_t values in the filesystem idmapping. In contrast to
posix_acl_from_xattr() the vfs_set_acl_prepare() helper needs to take
the mount idmapping into account. The differences are explained in more
detail in the kernel doc for the new functions.

In follow up patches we will remove all abuses of posix_acl_from_xattr()
for setxattr() operations and replace it with calls to vfs_set_acl_prepare().

The new vfs_set_acl_prepare() helper allows us to deal with the
ambiguity in how the POSI ACL uapi struct stores {g,u}id values
depending on whether this is a getxattr() or setxattr() operation.

This also allows us to remove the posix_acl_setxattr_idmapped_mnt()
helper reducing the abuse of the POSIX ACL uapi format to pass values
that should be distinct types in {g,u}id values stored as
ACL_{GROUP,USER} entries.

The removal of posix_acl_setxattr_idmapped_mnt() in turn allows us to
re-constify the value parameter of vfs_setxattr() which in turn allows
us to avoid the nasty cast from a const void pointer to a non-const void
pointer on ovl_do_setxattr().

Ultimately, the plan is to get rid of the type violations completely and
never pass the values from k{g,u}id_t as raw {g,u}id in ACL_{GROUP,USER}
entries in uapi POSIX ACL format. But that's a longer way to go and this
is a preparatory step.

Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1]
Co-Developed-by: Seth Forshee <sforshee@digitalocean.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/posix_acl.c                  | 213 +++++++++++++++++++++++++++++++++++++---
 include/linux/posix_acl_xattr.h |   3 +
 2 files changed, 205 insertions(+), 11 deletions(-)

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index abe387700ba9..31eac28e6582 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -857,12 +857,32 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size)
 	posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
 }
 
-/*
- * Convert from extended attribute to in-memory representation.
+/**
+ * make_posix_acl - convert POSIX ACLs from uapi to VFS format using the
+ *                  provided callbacks to map ACL_{GROUP,USER} entries into the
+ *                  appropriate format
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @void
+ * @uid_cb: callback to use for mapping the uid stored in ACL_USER entries
+ * @gid_cb: callback to use for mapping the gid stored in ACL_GROUP entries
+ *
+ * The make_posix_acl() helper is an abstraction to translate from uapi format
+ * into the VFS format allowing the caller to specific callbacks to map
+ * ACL_{GROUP,USER} entries into the expected format. This is used in
+ * posix_acl_from_xattr() and vfs_set_acl_prepare() and avoids pointless code
+ * duplication.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ *         without actual POSIX ACL entries, or ERR_PTR() encoded error code.
  */
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *user_ns,
-		     const void *value, size_t size)
+static struct posix_acl *make_posix_acl(struct user_namespace *mnt_userns,
+	struct user_namespace *fs_userns, const void *value, size_t size,
+	kuid_t (*uid_cb)(struct user_namespace *, struct user_namespace *,
+			 const struct posix_acl_xattr_entry *),
+	kgid_t (*gid_cb)(struct user_namespace *, struct user_namespace *,
+			 const struct posix_acl_xattr_entry *))
 {
 	const struct posix_acl_xattr_header *header = value;
 	const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
@@ -893,16 +913,12 @@ posix_acl_from_xattr(struct user_namespace *user_ns,
 				break;
 
 			case ACL_USER:
-				acl_e->e_uid =
-					make_kuid(user_ns,
-						  le32_to_cpu(entry->e_id));
+				acl_e->e_uid = uid_cb(mnt_userns, fs_userns, entry);
 				if (!uid_valid(acl_e->e_uid))
 					goto fail;
 				break;
 			case ACL_GROUP:
-				acl_e->e_gid =
-					make_kgid(user_ns,
-						  le32_to_cpu(entry->e_id));
+				acl_e->e_gid = gid_cb(mnt_userns, fs_userns, entry);
 				if (!gid_valid(acl_e->e_gid))
 					goto fail;
 				break;
@@ -917,6 +933,181 @@ fail:
 	posix_acl_release(acl);
 	return ERR_PTR(-EINVAL);
 }
+
+/**
+ * vfs_set_acl_prepare_kuid - map ACL_USER uid according to mount- and
+ *                            filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @e: a ACL_USER entry in POSIX ACL uapi format
+ *
+ * The uid stored as ACL_USER entry in @e is a kuid_t stored as a raw {g,u}id
+ * value. The vfs_set_acl_prepare_kuid() will recover the kuid_t through
+ * KUIDT_INIT() and then map it according to the idmapped mount. The resulting
+ * kuid_t is the value which the filesystem can map up into a raw backing store
+ * id in the filesystem's idmapping.
+ *
+ * This is used in vfs_set_acl_prepare() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_USER entries during setxattr().
+ *
+ * Return: A kuid in @fs_userns for the uid stored in @e.
+ */
+static inline kuid_t
+vfs_set_acl_prepare_kuid(struct user_namespace *mnt_userns,
+			 struct user_namespace *fs_userns,
+			 const struct posix_acl_xattr_entry *e)
+{
+	kuid_t kuid = KUIDT_INIT(le32_to_cpu(e->e_id));
+	return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid));
+}
+
+/**
+ * vfs_set_acl_prepare_kgid - map ACL_GROUP gid according to mount- and
+ *                            filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @e: a ACL_GROUP entry in POSIX ACL uapi format
+ *
+ * The gid stored as ACL_GROUP entry in @e is a kgid_t stored as a raw {g,u}id
+ * value. The vfs_set_acl_prepare_kgid() will recover the kgid_t through
+ * KGIDT_INIT() and then map it according to the idmapped mount. The resulting
+ * kgid_t is the value which the filesystem can map up into a raw backing store
+ * id in the filesystem's idmapping.
+ *
+ * This is used in vfs_set_acl_prepare() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_GROUP entries during setxattr().
+ *
+ * Return: A kgid in @fs_userns for the gid stored in @e.
+ */
+static inline kgid_t
+vfs_set_acl_prepare_kgid(struct user_namespace *mnt_userns,
+			 struct user_namespace *fs_userns,
+			 const struct posix_acl_xattr_entry *e)
+{
+	kgid_t kgid = KGIDT_INIT(le32_to_cpu(e->e_id));
+	return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid));
+}
+
+/**
+ * vfs_set_acl_prepare - convert POSIX ACLs from uapi to VFS format taking
+ *                       mount and filesystem idmappings into account
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @void
+ *
+ * When setting POSIX ACLs with ACL_{GROUP,USER} entries they need to be
+ * mapped according to the relevant mount- and filesystem idmapping. It is
+ * important that the ACL_{GROUP,USER} entries in struct posix_acl will be
+ * mapped into k{g,u}id_t that are supposed to be mapped up in the filesystem
+ * idmapping. This is crucial since the resulting struct posix_acl might be
+ * cached filesystem wide. The vfs_set_acl_prepare() function will take care to
+ * perform all necessary idmappings.
+ *
+ * Note, that since basically forever the {g,u}id values encoded as
+ * ACL_{GROUP,USER} entries in the uapi POSIX ACLs passed via @value contain
+ * values that have been mapped according to the caller's idmapping. In other
+ * words, POSIX ACLs passed in uapi format as @value during setxattr() contain
+ * {g,u}id values in their ACL_{GROUP,USER} entries that should actually have
+ * been stored as k{g,u}id_t.
+ *
+ * This means, vfs_set_acl_prepare() needs to first recover the k{g,u}id_t by
+ * calling K{G,U}IDT_INIT(). Afterwards they can be interpreted as vfs{g,u}id_t
+ * through from_vfs{g,u}id() to account for any idmapped mounts. The
+ * vfs_set_acl_prepare_k{g,u}id() helpers will take care to generate the
+ * correct k{g,u}id_t.
+ *
+ * The filesystem will then receive the POSIX ACLs ready to be cached
+ * filesystem wide and ready to be written to the backing store taking the
+ * filesystem's idmapping into account.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ *         without actual POSIX ACL entries, or ERR_PTR() encoded error code.
+ */
+struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns,
+				      struct user_namespace *fs_userns,
+				      const void *value, size_t size)
+{
+	return make_posix_acl(mnt_userns, fs_userns, value, size,
+			      vfs_set_acl_prepare_kuid,
+			      vfs_set_acl_prepare_kgid);
+}
+EXPORT_SYMBOL(vfs_set_acl_prepare);
+
+/**
+ * posix_acl_from_xattr_kuid - map ACL_USER uid into filesystem idmapping
+ * @mnt_userns: unused
+ * @fs_userns: the filesystem's idmapping
+ * @e: a ACL_USER entry in POSIX ACL uapi format
+ *
+ * Map the uid stored as ACL_USER entry in @e into the filesystem's idmapping.
+ * This is used in posix_acl_from_xattr() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_USER entries.
+ *
+ * Return: A kuid in @fs_userns for the uid stored in @e.
+ */
+static inline kuid_t
+posix_acl_from_xattr_kuid(struct user_namespace *mnt_userns,
+			  struct user_namespace *fs_userns,
+			  const struct posix_acl_xattr_entry *e)
+{
+	return make_kuid(fs_userns, le32_to_cpu(e->e_id));
+}
+
+/**
+ * posix_acl_from_xattr_kgid - map ACL_GROUP gid into filesystem idmapping
+ * @mnt_userns: unused
+ * @fs_userns: the filesystem's idmapping
+ * @e: a ACL_GROUP entry in POSIX ACL uapi format
+ *
+ * Map the gid stored as ACL_GROUP entry in @e into the filesystem's idmapping.
+ * This is used in posix_acl_from_xattr() to generate the proper VFS
+ * representation of POSIX ACLs with ACL_GROUP entries.
+ *
+ * Return: A kgid in @fs_userns for the gid stored in @e.
+ */
+static inline kgid_t
+posix_acl_from_xattr_kgid(struct user_namespace *mnt_userns,
+			  struct user_namespace *fs_userns,
+			  const struct posix_acl_xattr_entry *e)
+{
+	return make_kgid(fs_userns, le32_to_cpu(e->e_id));
+}
+
+/**
+ * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
+ * @fs_userns: the filesystem's idmapping
+ * @value: the uapi representation of POSIX ACLs
+ * @size: the size of @void
+ *
+ * Filesystems that store POSIX ACLs in the unaltered uapi format should use
+ * posix_acl_from_xattr() when reading them from the backing store and
+ * converting them into the struct posix_acl VFS format. The helper is
+ * specifically intended to be called from the ->get_acl() inode operation.
+ *
+ * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
+ * in ACL_{GROUP,USER} entries into the filesystem idmapping in @fs_userns. The
+ * posix_acl_from_xattr_k{g,u}id() helpers will take care to generate the
+ * correct k{g,u}id_t. The returned struct posix_acl can be cached.
+ *
+ * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
+ * If it did it calling is from the ->get_acl() inode operation would return
+ * POSIX ACLs mapped according to an idmapped mount which would mean that the
+ * value couldn't be cached for the filesystem. Idmapped mounts are taken into
+ * account on the fly during permission checking or right at the VFS -
+ * userspace boundary before reporting them to the user.
+ *
+ * Return: Allocated struct posix_acl on success, NULL for a valid header but
+ *         without actual POSIX ACL entries, or ERR_PTR() encoded error code.
+ */
+struct posix_acl *
+posix_acl_from_xattr(struct user_namespace *fs_userns,
+		     const void *value, size_t size)
+{
+	return make_posix_acl(&init_user_ns, fs_userns, value, size,
+			      posix_acl_from_xattr_kuid,
+			      posix_acl_from_xattr_kgid);
+}
 EXPORT_SYMBOL (posix_acl_from_xattr);
 
 /*
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index b6bd3eac2bcc..47eca15fd842 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -66,6 +66,9 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns,
 				       const void *value, size_t size);
 int posix_acl_to_xattr(struct user_namespace *user_ns,
 		       const struct posix_acl *acl, void *buffer, size_t size);
+struct posix_acl *vfs_set_acl_prepare(struct user_namespace *mnt_userns,
+				      struct user_namespace *fs_userns,
+				      const void *value, size_t size);
 
 extern const struct xattr_handler posix_acl_access_xattr_handler;
 extern const struct xattr_handler posix_acl_default_xattr_handler;
-- 
cgit v1.2.3


From 150f7b11cb59670a147e88580a604ce3c6845a88 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Mon, 22 Aug 2022 15:17:54 +0200
Subject: media: uvcvideo: Fix InterfaceProtocol for Quanta camera

The Quanta 0408:4034 camera implements UVC 1.5, and thus sets
bInterfaceProtocol to UVC_PC_PROTOCOL_15. Commit 95f03d973478 ("media:
uvcvideo: Limit power line control for Quanta cameras") added a quirk
for the device that incorrectly specified the UVC 1.0 protocol,
rendering the quirk inoperative. Fix it.

Link: https://lore.kernel.org/linux-media/20220822131754.102393-1-ribalda@chromium.org
Fixes: 95f03d973478 ("media: uvcvideo: Limit power line control for Quanta cameras")
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/usb/uvc/uvc_driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index 9c05776f11d1..d509a4a2f08e 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -2740,7 +2740,7 @@ static const struct usb_device_id uvc_ids[] = {
 	  .idProduct		= 0x4034,
 	  .bInterfaceClass	= USB_CLASS_VIDEO,
 	  .bInterfaceSubClass	= 1,
-	  .bInterfaceProtocol	= 0,
+	  .bInterfaceProtocol	= UVC_PC_PROTOCOL_15,
 	  .driver_info		= (kernel_ulong_t)&uvc_ctrl_power_line_limited },
 	/* LogiLink Wireless Webcam */
 	{ .match_flags		= USB_DEVICE_ID_MATCH_DEVICE
-- 
cgit v1.2.3


From 3bec6c3e83b5c125ff35e3dae3127c8d62046a1d Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 30 Aug 2022 16:23:07 +0200
Subject: xfrm: propagate extack to all netlink doit handlers

xfrm_user_rcv_msg() already handles extack, we just need to pass it down.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 56 +++++++++++++++++++++++++++++++---------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 2ff017117730..cfa35d76fb7e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -678,7 +678,7 @@ error_no_put:
 }
 
 static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+		       struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_usersa_info *p = nlmsg_data(nlh);
@@ -757,7 +757,7 @@ static struct xfrm_state *xfrm_user_state_lookup(struct net *net,
 }
 
 static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+		       struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_state *x;
@@ -1254,7 +1254,8 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net,
 }
 
 static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
-			    struct nlattr **attrs)
+			    struct nlattr **attrs,
+			    struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrmu_spdhthresh *thresh4 = NULL;
@@ -1299,7 +1300,8 @@ static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			    struct nlattr **attrs,
+			    struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct sk_buff *r_skb;
@@ -1358,7 +1360,8 @@ static int build_sadinfo(struct sk_buff *skb, struct net *net,
 }
 
 static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			    struct nlattr **attrs,
+			    struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct sk_buff *r_skb;
@@ -1378,7 +1381,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+		       struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_usersa_id *p = nlmsg_data(nlh);
@@ -1402,7 +1405,8 @@ out_noput:
 }
 
 static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			      struct nlattr **attrs,
+			      struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_state *x;
@@ -1754,7 +1758,8 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
 }
 
 static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			   struct nlattr **attrs,
+			   struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_userpolicy_info *p = nlmsg_data(nlh);
@@ -2015,7 +2020,7 @@ static bool xfrm_userpolicy_is_valid(__u8 policy)
 }
 
 static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh,
-			    struct nlattr **attrs)
+			    struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_userpolicy_default *up = nlmsg_data(nlh);
@@ -2036,7 +2041,7 @@ static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh,
-			    struct nlattr **attrs)
+			    struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct sk_buff *r_skb;
 	struct nlmsghdr *r_nlh;
@@ -2066,7 +2071,8 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			   struct nlattr **attrs,
+			   struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_policy *xp;
@@ -2149,7 +2155,8 @@ out:
 }
 
 static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			 struct nlattr **attrs,
+			 struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct km_event c;
@@ -2249,7 +2256,7 @@ out_cancel:
 }
 
 static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+		       struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_state *x;
@@ -2293,7 +2300,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+		       struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_state *x;
@@ -2344,7 +2351,8 @@ out:
 }
 
 static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			     struct nlattr **attrs,
+			     struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct km_event c;
@@ -2372,7 +2380,8 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 
 static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			       struct nlattr **attrs,
+			       struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_policy *xp;
@@ -2438,7 +2447,8 @@ out:
 }
 
 static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			      struct nlattr **attrs,
+			      struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_state *x;
@@ -2472,7 +2482,8 @@ out:
 }
 
 static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
-		struct nlattr **attrs)
+			    struct nlattr **attrs,
+			    struct netlink_ext_ack *extack)
 {
 	struct net *net = sock_net(skb->sk);
 	struct xfrm_policy *xp;
@@ -2577,7 +2588,7 @@ static int copy_from_user_migrate(struct xfrm_migrate *ma,
 }
 
 static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
-			   struct nlattr **attrs)
+			   struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct xfrm_userpolicy_id *pi = nlmsg_data(nlh);
 	struct xfrm_migrate m[XFRM_MAX_DEPTH];
@@ -2623,7 +2634,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
 }
 #else
 static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
-			   struct nlattr **attrs)
+			   struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	return -ENOPROTOOPT;
 }
@@ -2819,7 +2830,8 @@ static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
 };
 
 static const struct xfrm_link {
-	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
+	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **,
+		    struct netlink_ext_ack *);
 	int (*start)(struct netlink_callback *);
 	int (*dump)(struct sk_buff *, struct netlink_callback *);
 	int (*done)(struct netlink_callback *);
@@ -2921,7 +2933,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto err;
 	}
 
-	err = link->doit(skb, nlh, attrs);
+	err = link->doit(skb, nlh, attrs, extack);
 
 	/* We need to free skb allocated in xfrm_alloc_compat() before
 	 * returning from this function, because consume_skb() won't take
-- 
cgit v1.2.3


From ec2b4f01536dcd9ecaae91e99334b5fdb510c0e2 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 30 Aug 2022 16:23:08 +0200
Subject: xfrm: add extack support to verify_newpolicy_info

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index cfa35d76fb7e..fa6024b2c88b 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1512,7 +1512,8 @@ static int verify_policy_type(u8 type)
 	return 0;
 }
 
-static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
+static int verify_newpolicy_info(struct xfrm_userpolicy_info *p,
+				 struct netlink_ext_ack *extack)
 {
 	int ret;
 
@@ -1524,6 +1525,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
 		break;
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid policy share");
 		return -EINVAL;
 	}
 
@@ -1533,35 +1535,44 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p)
 		break;
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid policy action");
 		return -EINVAL;
 	}
 
 	switch (p->sel.family) {
 	case AF_INET:
-		if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
+		if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) {
+			NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)");
 			return -EINVAL;
+		}
 
 		break;
 
 	case AF_INET6:
 #if IS_ENABLED(CONFIG_IPV6)
-		if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128)
+		if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) {
+			NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)");
 			return -EINVAL;
+		}
 
 		break;
 #else
+		NL_SET_ERR_MSG(extack, "IPv6 support disabled");
 		return  -EAFNOSUPPORT;
 #endif
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid selector family");
 		return -EINVAL;
 	}
 
 	ret = verify_policy_dir(p->dir);
 	if (ret)
 		return ret;
-	if (p->index && (xfrm_policy_id2dir(p->index) != p->dir))
+	if (p->index && (xfrm_policy_id2dir(p->index) != p->dir)) {
+		NL_SET_ERR_MSG(extack, "Policy index doesn't match direction");
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -1768,7 +1779,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err;
 	int excl;
 
-	err = verify_newpolicy_info(p);
+	err = verify_newpolicy_info(p, extack);
 	if (err)
 		return err;
 	err = verify_sec_ctx_len(attrs);
@@ -2501,7 +2512,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	xfrm_mark_get(attrs, &mark);
 
-	err = verify_newpolicy_info(&ua->policy);
+	err = verify_newpolicy_info(&ua->policy, extack);
 	if (err)
 		goto free_state;
 	err = verify_sec_ctx_len(attrs);
@@ -3284,7 +3295,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
 	*dir = -EINVAL;
 
 	if (len < sizeof(*p) ||
-	    verify_newpolicy_info(p))
+	    verify_newpolicy_info(p, NULL))
 		return NULL;
 
 	nr = ((len - sizeof(*p)) / sizeof(*ut));
-- 
cgit v1.2.3


From 24fc544fb525a2d1cabe4e77e673938bf2e45aed Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 30 Aug 2022 16:23:09 +0200
Subject: xfrm: add extack to verify_policy_dir

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index fa6024b2c88b..0042b77337bd 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1481,7 +1481,7 @@ out_noput:
 	return err;
 }
 
-static int verify_policy_dir(u8 dir)
+static int verify_policy_dir(u8 dir, struct netlink_ext_ack *extack)
 {
 	switch (dir) {
 	case XFRM_POLICY_IN:
@@ -1490,6 +1490,7 @@ static int verify_policy_dir(u8 dir)
 		break;
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid policy direction");
 		return -EINVAL;
 	}
 
@@ -1566,7 +1567,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p,
 		return -EINVAL;
 	}
 
-	ret = verify_policy_dir(p->dir);
+	ret = verify_policy_dir(p->dir, extack);
 	if (ret)
 		return ret;
 	if (p->index && (xfrm_policy_id2dir(p->index) != p->dir)) {
@@ -2102,7 +2103,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
-	err = verify_policy_dir(p->dir);
+	err = verify_policy_dir(p->dir, extack);
 	if (err)
 		return err;
 
@@ -2407,7 +2408,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
-	err = verify_policy_dir(p->dir);
+	err = verify_policy_dir(p->dir, extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From fb7deaba40cfc6b5eb91b7431102520c4b156513 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 30 Aug 2022 16:23:10 +0200
Subject: xfrm: add extack to verify_policy_type

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0042b77337bd..0f2a2aa1e289 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1497,7 +1497,7 @@ static int verify_policy_dir(u8 dir, struct netlink_ext_ack *extack)
 	return 0;
 }
 
-static int verify_policy_type(u8 type)
+static int verify_policy_type(u8 type, struct netlink_ext_ack *extack)
 {
 	switch (type) {
 	case XFRM_POLICY_TYPE_MAIN:
@@ -1507,6 +1507,7 @@ static int verify_policy_type(u8 type)
 		break;
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid policy type");
 		return -EINVAL;
 	}
 
@@ -1688,7 +1689,8 @@ static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs)
 	return 0;
 }
 
-static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs)
+static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs,
+				      struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_POLICY_TYPE];
 	struct xfrm_userpolicy_type *upt;
@@ -1700,7 +1702,7 @@ static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs)
 		type = upt->type;
 	}
 
-	err = verify_policy_type(type);
+	err = verify_policy_type(type, extack);
 	if (err)
 		return err;
 
@@ -1735,7 +1737,11 @@ static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_i
 	p->share = XFRM_SHARE_ANY; /* XXX xp->share */
 }
 
-static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp)
+static struct xfrm_policy *xfrm_policy_construct(struct net *net,
+						 struct xfrm_userpolicy_info *p,
+						 struct nlattr **attrs,
+						 int *errp,
+						 struct netlink_ext_ack *extack)
 {
 	struct xfrm_policy *xp = xfrm_policy_alloc(net, GFP_KERNEL);
 	int err;
@@ -1747,7 +1753,7 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
 
 	copy_from_user_policy(xp, p);
 
-	err = copy_from_user_policy_type(&xp->type, attrs);
+	err = copy_from_user_policy_type(&xp->type, attrs, extack);
 	if (err)
 		goto error;
 
@@ -1787,7 +1793,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
-	xp = xfrm_policy_construct(net, p, attrs, &err);
+	xp = xfrm_policy_construct(net, p, attrs, &err, extack);
 	if (!xp)
 		return err;
 
@@ -2099,7 +2105,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	p = nlmsg_data(nlh);
 	delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
 
-	err = copy_from_user_policy_type(&type, attrs);
+	err = copy_from_user_policy_type(&type, attrs, extack);
 	if (err)
 		return err;
 
@@ -2371,7 +2377,7 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	u8 type = XFRM_POLICY_TYPE_MAIN;
 	int err;
 
-	err = copy_from_user_policy_type(&type, attrs);
+	err = copy_from_user_policy_type(&type, attrs, extack);
 	if (err)
 		return err;
 
@@ -2404,7 +2410,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct xfrm_mark m;
 	u32 if_id = 0;
 
-	err = copy_from_user_policy_type(&type, attrs);
+	err = copy_from_user_policy_type(&type, attrs, extack);
 	if (err)
 		return err;
 
@@ -2521,7 +2527,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto free_state;
 
 	/*   build an XP */
-	xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
+	xp = xfrm_policy_construct(net, &ua->policy, attrs, &err, extack);
 	if (!xp)
 		goto free_state;
 
@@ -2617,7 +2623,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL;
 
-	err = copy_from_user_policy_type(&type, attrs);
+	err = copy_from_user_policy_type(&type, attrs, extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From d37bed89f082cb84cbdf1f38114cde1defc1724c Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 30 Aug 2022 16:23:11 +0200
Subject: xfrm: add extack to validate_tmpl

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 0f2a2aa1e289..9fd30914f1ff 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1616,13 +1616,16 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
 	}
 }
 
-static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
+static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family,
+			 struct netlink_ext_ack *extack)
 {
 	u16 prev_family;
 	int i;
 
-	if (nr > XFRM_MAX_DEPTH)
+	if (nr > XFRM_MAX_DEPTH) {
+		NL_SET_ERR_MSG(extack, "Template count must be <= XFRM_MAX_DEPTH (" __stringify(XFRM_MAX_DEPTH) ")");
 		return -EINVAL;
+	}
 
 	prev_family = family;
 
@@ -1642,12 +1645,16 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 		case XFRM_MODE_BEET:
 			break;
 		default:
-			if (ut[i].family != prev_family)
+			if (ut[i].family != prev_family) {
+				NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change");
 				return -EINVAL;
+			}
 			break;
 		}
-		if (ut[i].mode >= XFRM_MODE_MAX)
+		if (ut[i].mode >= XFRM_MODE_MAX) {
+			NL_SET_ERR_MSG(extack, "Mode in template must be < XFRM_MODE_MAX (" __stringify(XFRM_MODE_MAX) ")");
 			return -EINVAL;
+		}
 
 		prev_family = ut[i].family;
 
@@ -1659,17 +1666,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
 			break;
 #endif
 		default:
+			NL_SET_ERR_MSG(extack, "Invalid family in template");
 			return -EINVAL;
 		}
 
-		if (!xfrm_id_proto_valid(ut[i].id.proto))
+		if (!xfrm_id_proto_valid(ut[i].id.proto)) {
+			NL_SET_ERR_MSG(extack, "Invalid XFRM protocol in template");
 			return -EINVAL;
+		}
 	}
 
 	return 0;
 }
 
-static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs)
+static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs,
+			       struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_TMPL];
 
@@ -1680,7 +1691,7 @@ static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs)
 		int nr = nla_len(rt) / sizeof(*utmpl);
 		int err;
 
-		err = validate_tmpl(nr, utmpl, pol->family);
+		err = validate_tmpl(nr, utmpl, pol->family, extack);
 		if (err)
 			return err;
 
@@ -1757,7 +1768,7 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net,
 	if (err)
 		goto error;
 
-	if (!(err = copy_from_user_tmpl(xp, attrs)))
+	if (!(err = copy_from_user_tmpl(xp, attrs, extack)))
 		err = copy_from_user_sec_ctx(xp, attrs);
 	if (err)
 		goto error;
@@ -3306,7 +3317,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
 		return NULL;
 
 	nr = ((len - sizeof(*p)) / sizeof(*ut));
-	if (validate_tmpl(nr, ut, p->sel.family))
+	if (validate_tmpl(nr, ut, p->sel.family, NULL))
 		return NULL;
 
 	if (p->dir > XFRM_POLICY_OUT)
-- 
cgit v1.2.3


From 08a717e4803798e066aa6b69ebf69da9fc8e1758 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 30 Aug 2022 16:23:12 +0200
Subject: xfrm: add extack to verify_sec_ctx_len

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 9fd30914f1ff..772a051feedb 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -102,7 +102,7 @@ static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type,
 		*addrp = nla_data(rt);
 }
 
-static inline int verify_sec_ctx_len(struct nlattr **attrs)
+static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 	struct xfrm_user_sec_ctx *uctx;
@@ -112,8 +112,10 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs)
 
 	uctx = nla_data(rt);
 	if (uctx->len > nla_len(rt) ||
-	    uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
+	    uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) {
+		NL_SET_ERR_MSG(extack, "Invalid security context length");
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -264,7 +266,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP)))
 		goto out;
-	if ((err = verify_sec_ctx_len(attrs)))
+	if ((err = verify_sec_ctx_len(attrs, NULL)))
 		goto out;
 	if ((err = verify_replay(p, attrs)))
 		goto out;
@@ -1800,7 +1802,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = verify_newpolicy_info(p, extack);
 	if (err)
 		return err;
-	err = verify_sec_ctx_len(attrs);
+	err = verify_sec_ctx_len(attrs, extack);
 	if (err)
 		return err;
 
@@ -2136,7 +2138,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 		struct xfrm_sec_ctx *ctx;
 
-		err = verify_sec_ctx_len(attrs);
+		err = verify_sec_ctx_len(attrs, extack);
 		if (err)
 			return err;
 
@@ -2441,7 +2443,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
 		struct nlattr *rt = attrs[XFRMA_SEC_CTX];
 		struct xfrm_sec_ctx *ctx;
 
-		err = verify_sec_ctx_len(attrs);
+		err = verify_sec_ctx_len(attrs, extack);
 		if (err)
 			return err;
 
@@ -2533,7 +2535,7 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	err = verify_newpolicy_info(&ua->policy, extack);
 	if (err)
 		goto free_state;
-	err = verify_sec_ctx_len(attrs);
+	err = verify_sec_ctx_len(attrs, extack);
 	if (err)
 		goto free_state;
 
-- 
cgit v1.2.3


From 4d2f862b5b49141a1762baa67c846c598b54be27 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 5 Jul 2022 12:15:35 -0700
Subject: doc: Emphasize the need for explicit RCU read-side markers

This commit updates checklist.rst to emphasize the need for explicit
markers for RCU read-side critical sections.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/checklist.rst | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 42cc5d891bd2..5eedef027d92 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -66,8 +66,13 @@ over a rather long period of time, but improvements are always welcome!
 	As a rough rule of thumb, any dereference of an RCU-protected
 	pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
 	rcu_read_lock_sched(), or by the appropriate update-side lock.
-	Disabling of preemption can serve as rcu_read_lock_sched(), but
-	is less readable and prevents lockdep from detecting locking issues.
+	Explicit disabling of preemption (preempt_disable(), for example)
+	can serve as rcu_read_lock_sched(), but is less readable and
+	prevents lockdep from detecting locking issues.
+
+	Please not that you *cannot* rely on code known to be built
+	only in non-preemptible kernels.  Such code can and will break,
+	especially in kernels built with CONFIG_PREEMPT_COUNT=y.
 
 	Letting RCU-protected pointers "leak" out of an RCU read-side
 	critical section is every bit as bad as letting them leak out
-- 
cgit v1.2.3


From ed336c6fe4c0110a56f6f237865ff9660de8c598 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 Jul 2022 12:00:50 -0700
Subject: doc: Call out queue_rcu_work() for blocking RCU callbacks

The current checklist.rst file correctly notes that RCU callbacks execute
in BH context, and cannot block.  This commit adds words advising people
needing callbacks to block to use workqueues, for example, by replacing
call_rcu() with queue_rcu_work().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/checklist.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 5eedef027d92..2db206fc3465 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -190,6 +190,9 @@ over a rather long period of time, but improvements are always welcome!
 
 5.	If call_rcu() or call_srcu() is used, the callback function will
 	be called from softirq context.  In particular, it cannot block.
+	If you need the callback to block, run that code in a workqueue
+	handler scheduled from the callback.  The queue_rcu_work()
+	function does this for you in the case of call_rcu().
 
 6.	Since synchronize_rcu() can block, it cannot be called
 	from any sort of irq context.  The same rule applies
-- 
cgit v1.2.3


From 1c1c1d983e75f53b3e307804a39d17d24310c5f2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 Jul 2022 12:04:00 -0700
Subject: doc: Use rcu_barrier() to rate-limit RCU callbacks

The checklist.rst document advises periodic synchronize_rcu() invocations
to prevent callback flooding.  However, rcu_barrier() is often a better
choice.  This commit therefore adds words to this effect.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/checklist.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 2db206fc3465..178ca7547b98 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -305,7 +305,8 @@ over a rather long period of time, but improvements are always welcome!
 		the machine.
 
 	d.	Periodically invoke synchronize_rcu(), permitting a limited
-		number of updates per grace period.
+		number of updates per grace period.  Better yet, periodically
+		invoke rcu_barrier() to wait for all outstanding callbacks.
 
 	The same cautions apply to call_srcu() and kfree_rcu().
 
-- 
cgit v1.2.3


From ca8a439e4611765b0e4bf5d4fd3fab54bab5ffe2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Aug 2022 10:23:42 -0700
Subject: doc: Fix list: rcu_access_pointer() is not lockdep-checked

The rcu_access_pointer() macro does not consult lockdep by design because
it is intended to be used outside of RCU read-side critical sections.
This commit therefore makes a separate list for it in whatisRCU.rst.

Similarly, RCU_LOCKDEP_WARN(), rcu_sleep_check(), and RCU_NONIDLE()
do not do anything with pointer access.  This commit therefore creates
a separate utility-API list for them.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/whatisRCU.rst | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 77ea260efd12..6940e0fe8599 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -1057,14 +1057,20 @@ SRCU: Initialization/cleanup::
 	init_srcu_struct
 	cleanup_srcu_struct
 
-All: lockdep-checked RCU-protected pointer access::
+All: lockdep-checked RCU utility APIs::
 
-	rcu_access_pointer
-	rcu_dereference_raw
 	RCU_LOCKDEP_WARN
 	rcu_sleep_check
 	RCU_NONIDLE
 
+All: Unchecked RCU-protected pointer access::
+
+	rcu_dereference_raw
+
+All: Unchecked RCU-protected pointer access with dereferencing prohibited::
+
+	rcu_access_pointer
+
 See the comment headers in the source code (or the docbook generated
 from them) for more information.
 
-- 
cgit v1.2.3


From 022d1b356c8a687a58a74cbc90a92d56d3aa207d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 11:23:19 -0700
Subject: doc: Update rcu_access_pointer() advice in rcu_dereference.rst

This commit updates the rcu_access_pointer() advice, noting that its
return value should not be assigned to a local variable, and also noting
that there is little point in using rcu_access_pointer() within an RCU
read-side critical section.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/rcu_dereference.rst | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst
index 0b418a5b243c..81e828c8313b 100644
--- a/Documentation/RCU/rcu_dereference.rst
+++ b/Documentation/RCU/rcu_dereference.rst
@@ -128,10 +128,16 @@ Follow these rules to keep your RCU code working properly:
 		This sort of comparison occurs frequently when scanning
 		RCU-protected circular linked lists.
 
-		Note that if checks for being within an RCU read-side
-		critical section are not required and the pointer is never
-		dereferenced, rcu_access_pointer() should be used in place
-		of rcu_dereference().
+		Note that if the pointer comparison is done outside
+		of an RCU read-side critical section, and the pointer
+		is never dereferenced, rcu_access_pointer() should be
+		used in place of rcu_dereference().  In most cases,
+		it is best to avoid accidental dereferences by testing
+		the rcu_access_pointer() return value directly, without
+		assigning it to a variable.
+
+		Within an RCU read-side critical section, there is little
+		reason to use rcu_access_pointer().
 
 	-	The comparison is against a pointer that references memory
 		that was initialized "a long time ago."  The reason
-- 
cgit v1.2.3


From 99cf092058e13f133ff9373fcbd25ae38509df03 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 9 Aug 2022 10:24:36 -0700
Subject: doc: SLAB_TYPESAFE_BY_RCU uses cannot rely on spinlocks

Because the SLAB_TYPESAFE_BY_RCU code does not zero pages that are
to be broken up into slabs, the memory returned by kmem_cache_alloc()
must be fully initialized, including any spinlocks included in the newly
allocated structure.  This means that readers attempting to look up an
SLAB_TYPESAFE_BY_RCU object must use a reference-counting approach.
A spinlock may be acquired only after a reference is obtained, which
prevents that object from being passed to kmem_struct_free(), but only
while that reference continues to be held.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/whatisRCU.rst | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 6940e0fe8599..97f2d0fa84df 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -915,13 +915,18 @@ which an RCU reference is held include:
 The understanding that RCU provides a reference that only prevents a
 change of type is particularly visible with objects allocated from a
 slab cache marked ``SLAB_TYPESAFE_BY_RCU``.  RCU operations may yield a
-reference to an object from such a cache that has been concurrently
-freed and the memory reallocated to a completely different object,
-though of the same type.  In this case RCU doesn't even protect the
-identity of the object from changing, only its type.  So the object
-found may not be the one expected, but it will be one where it is safe
-to take a reference or spinlock and then confirm that the identity
-matches the expectations.
+reference to an object from such a cache that has been concurrently freed
+and the memory reallocated to a completely different object, though of
+the same type.  In this case RCU doesn't even protect the identity of the
+object from changing, only its type.  So the object found may not be the
+one expected, but it will be one where it is safe to take a reference
+(and then potentially acquiring a spinlock), allowing subsequent code
+to check whether the identity matches expectations.  It is tempting
+to simply acquire the spinlock without first taking the reference, but
+unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
+initialized after each and every call to kmem_cache_alloc(), which renders
+reference-free spinlock acquisition completely unsafe.  Therefore, when
+using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
 
 With traditional reference counting -- such as that implemented by the
 kref library in Linux -- there is typically code that runs when the last
-- 
cgit v1.2.3


From ef32aee9aa062bfbaab2cab36ee837be6060d0a1 Mon Sep 17 00:00:00 2001
From: Shao-Tse Hung <ccs100203@gmail.com>
Date: Sat, 20 Aug 2022 16:32:44 +0800
Subject: doc/rcu: Update LWN article URLs and add 2019 article

This patch adds LWN articles about RCU APIs which were released in 2019.
Also, HTTP URLs are replaced by HTTPS.

Signed-off-by: Shao-Tse Hung <ccs100203@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/whatisRCU.rst | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 97f2d0fa84df..1c747ac3f2c8 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -6,13 +6,15 @@ What is RCU?  --  "Read, Copy, Update"
 Please note that the "What is RCU?" LWN series is an excellent place
 to start learning about RCU:
 
-| 1.	What is RCU, Fundamentally?  http://lwn.net/Articles/262464/
-| 2.	What is RCU? Part 2: Usage   http://lwn.net/Articles/263130/
-| 3.	RCU part 3: the RCU API      http://lwn.net/Articles/264090/
-| 4.	The RCU API, 2010 Edition    http://lwn.net/Articles/418853/
-| 	2010 Big API Table           http://lwn.net/Articles/419086/
-| 5.	The RCU API, 2014 Edition    http://lwn.net/Articles/609904/
-|	2014 Big API Table           http://lwn.net/Articles/609973/
+| 1.	What is RCU, Fundamentally?  https://lwn.net/Articles/262464/
+| 2.	What is RCU? Part 2: Usage   https://lwn.net/Articles/263130/
+| 3.	RCU part 3: the RCU API      https://lwn.net/Articles/264090/
+| 4.	The RCU API, 2010 Edition    https://lwn.net/Articles/418853/
+| 	2010 Big API Table           https://lwn.net/Articles/419086/
+| 5.	The RCU API, 2014 Edition    https://lwn.net/Articles/609904/
+|	2014 Big API Table           https://lwn.net/Articles/609973/
+| 6.	The RCU API, 2019 Edition    https://lwn.net/Articles/777036/
+|	2019 Big API Table           https://lwn.net/Articles/777165/
 
 
 What is RCU?
-- 
cgit v1.2.3


From 6d60ea03ac2d3dcf6ddee6b45aa7213d8b0461c5 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Thu, 16 Jun 2022 21:53:47 +0800
Subject: rcu: Fix rcu_read_unlock_strict() strict QS reporting

Kernels built with CONFIG_PREEMPT=n and CONFIG_RCU_STRICT_GRACE_PERIOD=y
report the quiescent state directly from the outermost rcu_read_unlock().
However, the current CPU's rcu_data structure's ->cpu_no_qs.b.norm
might still be set, in which case rcu_report_qs_rdp() will exit early,
thus failing to report quiescent state.

This commit therefore causes rcu_read_unlock_strict() to clear
CPU's rcu_data structure's ->cpu_no_qs.b.norm field before invoking
rcu_report_qs_rdp().

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 438ecae6bd7e..86772c95ed0a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -824,6 +824,7 @@ void rcu_read_unlock_strict(void)
 	if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
 		return;
 	rdp = this_cpu_ptr(&rcu_data);
+	rdp->cpu_no_qs.b.norm = false;
 	rcu_report_qs_rdp(rdp);
 	udelay(rcu_unlock_delay);
 }
-- 
cgit v1.2.3


From bca4fa8cb0f4c096b515952f64e560fd784a0514 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Mon, 20 Jun 2022 14:42:24 +0800
Subject: rcu: Update rcu_preempt_deferred_qs() comments for !PREEMPT kernels

In non-premptible kernels, tasks never do context switches within
RCU read-side critical sections.  Therefore, in such kernels, each
leaf rcu_node structure's ->blkd_tasks list will always be empty.
The comment on the non-preemptible version of rcu_preempt_deferred_qs()
confuses this point, so this commit therefore fixes it.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 86772c95ed0a..4152816dd29f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -932,10 +932,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
 	return false;
 }
 
-// Except that we do need to respond to a request by an expedited grace
-// period for a quiescent state from this CPU.  Note that requests from
-// tasks are handled when removing the task from the blocked-tasks list
-// below.
+// Except that we do need to respond to a request by an expedited
+// grace period for a quiescent state from this CPU.  Note that in
+// non-preemptible kernels, there can be no context switches within RCU
+// read-side critical sections, which in turn means that the leaf rcu_node
+// structure's blocked-tasks list is always empty.  is therefore no need to
+// actually check it.  Instead, a quiescent state from this CPU suffices,
+// and this function is only called from such a quiescent state.
 notrace void rcu_preempt_deferred_qs(struct task_struct *t)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-- 
cgit v1.2.3


From fcb42c9a77d490ed0974e4d394519481aa06e585 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Tue, 5 Jul 2022 12:09:51 -0700
Subject: rcu: Add QS check in rcu_exp_handler() for non-preemptible kernels

Kernels built with CONFIG_PREEMPTION=n and CONFIG_PREEMPT_COUNT=y maintain
preempt_count() state.  Because such kernels map __rcu_read_lock()
and __rcu_read_unlock() to preempt_disable() and preempt_enable(),
respectively, this allows the expedited grace period's !CONFIG_PREEMPT_RCU
version of the rcu_exp_handler() IPI handler function to use
preempt_count() to detect quiescent states.

This preempt_count() usage might seem to risk failures due to
use of implicit RCU readers in portions of the kernel under #ifndef
CONFIG_PREEMPTION, except that rcu_core() already disallows such implicit
RCU readers.  The moral of this story is that you must use explicit
read-side markings such as rcu_read_lock() or preempt_disable() even if
the code knows that this kernel does not support preemption.

This commit therefore adds a preempt_count()-based check for a quiescent
state in the !CONFIG_PREEMPT_RCU version of the rcu_exp_handler()
function for kernels built with CONFIG_PREEMPT_COUNT=y, reporting an
immediate quiescent state when the interrupted code had both preemption
and softirqs enabled.

This change results in about a 2% reduction in expedited grace-period
latency in kernels built with both CONFIG_PREEMPT_RCU=n and
CONFIG_PREEMPT_COUNT=y.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Link: https://lore.kernel.org/all/20220622103549.2840087-1-qiang1.zhang@intel.com/
---
 kernel/rcu/tree_exp.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index be667583a554..b07998159d1f 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused)
 {
 	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 	struct rcu_node *rnp = rdp->mynode;
+	bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
 
 	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
 	    __this_cpu_read(rcu_data.cpu_no_qs.b.exp))
 		return;
-	if (rcu_is_cpu_rrupt_from_idle()) {
+	if (rcu_is_cpu_rrupt_from_idle() ||
+	    (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) {
 		rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
 		return;
 	}
-- 
cgit v1.2.3


From 6ca0292ccf968e0132e64d6a699d36ba3e92cb81 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Fri, 1 Jul 2022 10:44:04 +0800
Subject: rcu: Make tiny RCU support leak callbacks for debug-object errors

Currently, only Tree RCU leaks callbacks setting when it detects a
duplicate call_rcu().  This commit causes Tiny RCU to also leak
callbacks in this situation.

Because this is Tiny RCU, kernel size is important:

1. CONFIG_TINY_RCU=y and CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
   (Production kernel)

    Original:
    text      data      bss       dec       hex     filename
    26290663  20159823  15212544  61663030  3ace736 vmlinux

    With this commit:
    text      data      bss       dec       hex     filename
    26290663  20159823  15212544  61663030  3ace736 vmlinux

2. CONFIG_TINY_RCU=y and CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
   (Debugging kernel)

    Original:
    text      data      bss       dec       hex     filename
    26291319  20160143  15212544  61664006  3aceb06 vmlinux

    With this commit:
    text      data      bss       dec       hex     filename
    26291319  20160431  15212544  61664294  3acec26 vmlinux

These results show that the kernel size is unchanged for production
kernels, as desired.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tiny.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f0561ee16b9c..943d431b908f 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -158,6 +158,10 @@ void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
+static void tiny_rcu_leak_callback(struct rcu_head *rhp)
+{
+}
+
 /*
  * Post an RCU callback to be invoked after the end of an RCU grace
  * period.  But since we have but one CPU, that would be after any
@@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu);
  */
 void call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
+	static atomic_t doublefrees;
 	unsigned long flags;
 
-	debug_rcu_head_queue(head);
+	if (debug_rcu_head_queue(head)) {
+		if (atomic_inc_return(&doublefrees) < 4) {
+			pr_err("%s(): Double-freed CB %p->%pS()!!!  ", __func__, head, head->func);
+			mem_dump_obj(head);
+		}
+
+		if (!__is_kvfree_rcu_offset((unsigned long)head->func))
+			WRITE_ONCE(head->func, tiny_rcu_leak_callback);
+		return;
+	}
+
 	head->func = func;
 	head->next = NULL;
 
-- 
cgit v1.2.3


From 089254fd386eb6800dd7d7863f12a04ada0c35fa Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Aug 2022 08:48:12 -0700
Subject: rcu: Document reason for rcu_all_qs() call to preempt_disable()

Given that rcu_all_qs() is in non-preemptible kernels, why on earth should
it invoke preempt_disable()?  This commit adds the reason, which is to
work nicely with debugging enabled in CONFIG_PREEMPT_COUNT=y kernels.

Reported-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Reported-by: Boqun Feng <boqun.feng@gmail.com>
Reported-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 4152816dd29f..c46b3c74dad1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -870,7 +870,7 @@ void rcu_all_qs(void)
 
 	if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
 		return;
-	preempt_disable();
+	preempt_disable();  // For CONFIG_PREEMPT_COUNT=y kernels
 	/* Load rcu_urgent_qs before other flags. */
 	if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
 		preempt_enable();
-- 
cgit v1.2.3


From d8f3f5834febb74d18b5b2098f67d9db740f3e30 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Aug 2022 10:36:32 -0700
Subject: rcu: Update rcu_access_pointer() header for
 rcu_dereference_protected()

The rcu_access_pointer() docbook header correctly notes that it may be
used during post-grace-period teardown.  However, it is usually better to
use rcu_dereference_protected() for this purpose.  This commit therefore
calls out this preferred usage.

Reported-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f527f27e6438..61a1a85c720c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -496,13 +496,21 @@ do {									      \
  * against NULL.  Although rcu_access_pointer() may also be used in cases
  * where update-side locks prevent the value of the pointer from changing,
  * you should instead use rcu_dereference_protected() for this use case.
+ * Within an RCU read-side critical section, there is little reason to
+ * use rcu_access_pointer().
+ *
+ * It is usually best to test the rcu_access_pointer() return value
+ * directly in order to avoid accidental dereferences being introduced
+ * by later inattentive changes.  In other words, assigning the
+ * rcu_access_pointer() return value to a local variable results in an
+ * accident waiting to happen.
  *
  * It is also permissible to use rcu_access_pointer() when read-side
- * access to the pointer was removed at least one grace period ago, as
- * is the case in the context of the RCU callback that is freeing up
- * the data, or after a synchronize_rcu() returns.  This can be useful
- * when tearing down multi-linked structures after a grace period
- * has elapsed.
+ * access to the pointer was removed at least one grace period ago, as is
+ * the case in the context of the RCU callback that is freeing up the data,
+ * or after a synchronize_rcu() returns.  This can be useful when tearing
+ * down multi-linked structures after a grace period has elapsed.  However,
+ * rcu_dereference_protected() is normally preferred for this use case.
  */
 #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu)
 
-- 
cgit v1.2.3


From e73dfe30930b75c98746152e7a2f6a8ab6067b51 Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Thu, 4 Aug 2022 10:34:19 +0800
Subject: sched/debug: Try trigger_single_cpu_backtrace(cpu) in dump_cpu_task()

The trigger_all_cpu_backtrace() function attempts to send an NMI to the
target CPU, which usually provides much better stack traces than the
dump_cpu_task() function's approach of dumping that stack from some other
CPU.  So much so that most calls to dump_cpu_task() only happen after
a call to trigger_all_cpu_backtrace() has failed.  And the exception to
this rule really should attempt to use trigger_all_cpu_backtrace() first.

Therefore, move the trigger_all_cpu_backtrace() invocation into
dump_cpu_task().

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Valentin Schneider <vschneid@redhat.com>
---
 kernel/rcu/tree_stall.h | 5 ++---
 kernel/sched/core.c     | 3 +++
 kernel/smp.c            | 3 +--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index c3fbbcc09327..5653560573e2 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -368,7 +368,7 @@ static void rcu_dump_cpu_stacks(void)
 			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
 				if (cpu_is_offline(cpu))
 					pr_err("Offline CPU %d blocking current GP.\n", cpu);
-				else if (!trigger_single_cpu_backtrace(cpu))
+				else
 					dump_cpu_task(cpu);
 			}
 		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -511,8 +511,7 @@ static void rcu_check_gp_kthread_starvation(void)
 					pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu);
 				} else  {
 					pr_err("Stack dump where RCU GP kthread last ran:\n");
-					if (!trigger_single_cpu_backtrace(cpu))
-						dump_cpu_task(cpu);
+					dump_cpu_task(cpu);
 				}
 			}
 			wake_up_process(gpk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee28253c9ac0..e15b6a7f34f4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11183,6 +11183,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 
 void dump_cpu_task(int cpu)
 {
+	if (trigger_single_cpu_backtrace(cpu))
+		return;
+
 	pr_info("Task dump for CPU %d:\n", cpu);
 	sched_show_task(cpu_curr(cpu));
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 650810a6f29b..e8cdc025a046 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -370,8 +370,7 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
 	if (cpu >= 0) {
 		if (static_branch_unlikely(&csdlock_debug_extended))
 			csd_lock_print_extended(csd, cpu);
-		if (!trigger_single_cpu_backtrace(cpu))
-			dump_cpu_task(cpu);
+		dump_cpu_task(cpu);
 		if (!cpu_cur_csd) {
 			pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
 			arch_send_call_function_single_ipi(cpu);
-- 
cgit v1.2.3


From bc1cca97e6da6c7c34db7c5b864bb354ca5305ac Mon Sep 17 00:00:00 2001
From: Zhen Lei <thunder.leizhen@huawei.com>
Date: Thu, 4 Aug 2022 10:34:20 +0800
Subject: sched/debug: Show the registers of 'current' in dump_cpu_task()

The dump_cpu_task() function does not print registers on architectures
that do not support NMIs.  However, registers can be useful for
debugging.  Fortunately, in the case where dump_cpu_task() is invoked
from an interrupt handler and is dumping the current CPU's stack, the
get_irq_regs() function can be used to get the registers.

Therefore, this commit makes dump_cpu_task() check to see if it is being
asked to dump the current CPU's stack from within an interrupt handler,
and, if so, it uses the get_irq_regs() function to obtain the registers.
On systems that do support NMIs, this commit has the further advantage
of avoiding a self-NMI in this case.

This is an example of rcu self-detected stall on arm64, which does not
support NMIs:
[   27.501721] rcu: INFO: rcu_preempt self-detected stall on CPU
[   27.502238] rcu:     0-....: (1250 ticks this GP) idle=4f7/1/0x4000000000000000 softirq=2594/2594 fqs=619
[   27.502632]  (t=1251 jiffies g=2989 q=29 ncpus=4)
[   27.503845] CPU: 0 PID: 306 Comm: test0 Not tainted 5.19.0-rc7-00009-g1c1a6c29ff99-dirty #46
[   27.504732] Hardware name: linux,dummy-virt (DT)
[   27.504947] pstate: 20000005 (nzCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   27.504998] pc : arch_counter_read+0x18/0x24
[   27.505301] lr : arch_counter_read+0x18/0x24
[   27.505328] sp : ffff80000b29bdf0
[   27.505345] x29: ffff80000b29bdf0 x28: 0000000000000000 x27: 0000000000000000
[   27.505475] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000
[   27.505553] x23: 0000000000001f40 x22: ffff800009849c48 x21: 000000065f871ae0
[   27.505627] x20: 00000000000025ec x19: ffff80000a6eb300 x18: ffffffffffffffff
[   27.505654] x17: 0000000000000001 x16: 0000000000000000 x15: ffff80000a6d0296
[   27.505681] x14: ffffffffffffffff x13: ffff80000a29bc18 x12: 0000000000000426
[   27.505709] x11: 0000000000000162 x10: ffff80000a2f3c18 x9 : ffff80000a29bc18
[   27.505736] x8 : 00000000ffffefff x7 : ffff80000a2f3c18 x6 : 00000000759bd013
[   27.505761] x5 : 01ffffffffffffff x4 : 0002dc6c00000000 x3 : 0000000000000017
[   27.505787] x2 : 00000000000025ec x1 : ffff80000b29bdf0 x0 : 0000000075a30653
[   27.505937] Call trace:
[   27.506002]  arch_counter_read+0x18/0x24
[   27.506171]  ktime_get+0x48/0xa0
[   27.506207]  test_task+0x70/0xf0
[   27.506227]  kthread+0x10c/0x110
[   27.506243]  ret_from_fork+0x10/0x20

This is a marked improvement over the old output:
[   27.944550] rcu: INFO: rcu_preempt self-detected stall on CPU
[   27.944980] rcu:     0-....: (1249 ticks this GP) idle=cbb/1/0x4000000000000000 softirq=2610/2610 fqs=614
[   27.945407]  (t=1251 jiffies g=2681 q=28 ncpus=4)
[   27.945731] Task dump for CPU 0:
[   27.945844] task:test0           state:R  running task     stack:    0 pid:  306 ppid:     2 flags:0x0000000a
[   27.946073] Call trace:
[   27.946151]  dump_backtrace.part.0+0xc8/0xd4
[   27.946378]  show_stack+0x18/0x70
[   27.946405]  sched_show_task+0x150/0x180
[   27.946427]  dump_cpu_task+0x44/0x54
[   27.947193]  rcu_dump_cpu_stacks+0xec/0x130
[   27.947212]  rcu_sched_clock_irq+0xb18/0xef0
[   27.947231]  update_process_times+0x68/0xac
[   27.947248]  tick_sched_handle+0x34/0x60
[   27.947266]  tick_sched_timer+0x4c/0xa4
[   27.947281]  __hrtimer_run_queues+0x178/0x360
[   27.947295]  hrtimer_interrupt+0xe8/0x244
[   27.947309]  arch_timer_handler_virt+0x38/0x4c
[   27.947326]  handle_percpu_devid_irq+0x88/0x230
[   27.947342]  generic_handle_domain_irq+0x2c/0x44
[   27.947357]  gic_handle_irq+0x44/0xc4
[   27.947376]  call_on_irq_stack+0x2c/0x54
[   27.947415]  do_interrupt_handler+0x80/0x94
[   27.947431]  el1_interrupt+0x34/0x70
[   27.947447]  el1h_64_irq_handler+0x18/0x24
[   27.947462]  el1h_64_irq+0x64/0x68                       <--- the above backtrace is worthless
[   27.947474]  arch_counter_read+0x18/0x24
[   27.947487]  ktime_get+0x48/0xa0
[   27.947501]  test_task+0x70/0xf0
[   27.947520]  kthread+0x10c/0x110
[   27.947538]  ret_from_fork+0x10/0x20

Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ben Segall <bsegall@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Valentin Schneider <vschneid@redhat.com>
---
 kernel/sched/core.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e15b6a7f34f4..60fdc0faf1c9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
 
 #include <uapi/linux/sched/types.h>
 
+#include <asm/irq_regs.h>
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
 
@@ -11183,6 +11184,16 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 
 void dump_cpu_task(int cpu)
 {
+	if (cpu == smp_processor_id() && in_hardirq()) {
+		struct pt_regs *regs;
+
+		regs = get_irq_regs();
+		if (regs) {
+			show_regs(regs);
+			return;
+		}
+	}
+
 	if (trigger_single_cpu_backtrace(cpu))
 		return;
 
-- 
cgit v1.2.3


From 621189a1fe93cb2b34d62c5cdb9e258bca044813 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Mon, 8 Aug 2022 10:26:26 +0800
Subject: rcu: Avoid triggering strict-GP irq-work when RCU is idle

Kernels built with PREEMPT_RCU=y and RCU_STRICT_GRACE_PERIOD=y trigger
irq-work from rcu_read_unlock(), and the resulting irq-work handler
invokes rcu_preempt_deferred_qs_handle().  The point of this triggering
is to force grace periods to end quickly in order to give tools like KASAN
a better chance of detecting RCU usage bugs such as leaking RCU-protected
pointers out of an RCU read-side critical section.

However, this irq-work triggering is unconditional.  This works, but
there is no point in doing this irq-work unless the current grace period
is waiting on the running CPU or task, which is not the common case.
After all, in the common case there are many rcu_read_unlock() calls
per CPU per grace period.

This commit therefore triggers the irq-work only when the current grace
period is waiting on the running CPU or task.

This change was tested as follows on a four-CPU system:

	echo rcu_preempt_deferred_qs_handler > /sys/kernel/debug/tracing/set_ftrace_filter
	echo 1 > /sys/kernel/debug/tracing/function_profile_enabled
	insmod rcutorture.ko
	sleep 20
	rmmod rcutorture.ko
	echo 0 > /sys/kernel/debug/tracing/function_profile_enabled
	echo > /sys/kernel/debug/tracing/set_ftrace_filter

This procedure produces results in this per-CPU set of files:

	/sys/kernel/debug/tracing/trace_stat/function*

Sample output from one of these files is as follows:

  Function                               Hit    Time            Avg             s^2
  --------                               ---    ----            ---             ---
  rcu_preempt_deferred_qs_handle      838746    182650.3 us     0.217 us        0.004 us

The baseline sum of the "Hit" values (the number of calls to this
function) was 3,319,015.  With this commit, that sum was 1,140,359,
for a 2.9x reduction.  The worst-case variance across the CPUs was less
than 25%, so this large effect size is statistically significant.

The raw data is available in the Link: URL.

Link: https://lore.kernel.org/all/20220808022626.12825-1-qiang1.zhang@intel.com/
Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c46b3c74dad1..207617f69aa5 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -641,7 +641,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
 
 		expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
 			   (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
-			   IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ||
+			   (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
+			   ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
 			   (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
 			    t->rcu_blocked_node);
 		// Need to defer quiescent state until everything is enabled.
-- 
cgit v1.2.3


From 7634b1eaa0cd135d5eedadb04ad3c91b1ecf28a9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 24 Aug 2022 14:46:56 -0700
Subject: rcu: Exclude outgoing CPU when it is the last to leave

The rcu_boost_kthread_setaffinity() function removes the outgoing CPU
from the set_cpus_allowed() mask for the corresponding leaf rcu_node
structure's rcub priority-boosting kthread.  Except that if the outgoing
CPU will leave that structure without any online CPUs, the mask is set
to the housekeeping CPU mask from housekeeping_cpumask().  Which is fine
unless the outgoing CPU happens to be a housekeeping CPU.

This commit therefore removes the outgoing CPU from the housekeeping mask.
This would of course be problematic if the outgoing CPU was the last
online housekeeping CPU, but in that case you are in a world of hurt
anyway.  If someone comes up with a valid use case for a system needing
all the housekeeping CPUs to be offline, further adjustments can be made.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 207617f69aa5..32b424b571bd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1243,8 +1243,11 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 		    cpu != outgoingcpu)
 			cpumask_set_cpu(cpu, cm);
 	cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
-	if (cpumask_empty(cm))
+	if (cpumask_empty(cm)) {
 		cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
+		if (outgoingcpu >= 0)
+			cpumask_clear_cpu(outgoingcpu, cm);
+	}
 	set_cpus_allowed_ptr(t, cm);
 	mutex_unlock(&rnp->boost_kthread_mutex);
 	free_cpumask_var(cm);
-- 
cgit v1.2.3


From 093590c16b447f53e66771c8579ae66c96f6ef61 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Wed, 22 Jun 2022 13:47:11 +0200
Subject: rcu: Back off upon fill_page_cache_func() allocation failure

The fill_page_cache_func() function allocates couple of pages to store
kvfree_rcu_bulk_data structures. This is a lightweight (GFP_NORETRY)
allocation which can fail under memory pressure. The function will,
however keep retrying even when the previous attempt has failed.

This retrying is in theory correct, but in practice the allocation is
invoked from workqueue context, which means that if the memory reclaim
gets stuck, these retries can hog the worker for quite some time.
Although the workqueues subsystem automatically adjusts concurrency, such
adjustment is not guaranteed to happen until the worker context sleeps.
And the fill_page_cache_func() function's retry loop is not guaranteed
to sleep (see the should_reclaim_retry() function).

And we have seen this function cause workqueue lockups:

kernel: BUG: workqueue lockup - pool cpus=93 node=1 flags=0x1 nice=0 stuck for 32s!
[...]
kernel: pool 74: cpus=37 node=0 flags=0x1 nice=0 hung=32s workers=2 manager: 2146
kernel:   pwq 498: cpus=249 node=1 flags=0x1 nice=0 active=4/256 refcnt=5
kernel:     in-flight: 1917:fill_page_cache_func
kernel:     pending: dbs_work_handler, free_work, kfree_rcu_monitor

Originally, we thought that the root cause of this lockup was several
retries with direct reclaim, but this is not yet confirmed.  Furthermore,
we have seen similar lockups without any heavy memory pressure.  This
suggests that there are other factors contributing to these lockups.
However, it is not really clear that endless retries are desireable.

So let's make the fill_page_cache_func() function back off after
allocation failure.

Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79aea7df4345..eb435941e92f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3183,15 +3183,16 @@ static void fill_page_cache_func(struct work_struct *work)
 		bnode = (struct kvfree_rcu_bulk_data *)
 			__get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 
-		if (bnode) {
-			raw_spin_lock_irqsave(&krcp->lock, flags);
-			pushed = put_cached_bnode(krcp, bnode);
-			raw_spin_unlock_irqrestore(&krcp->lock, flags);
+		if (!bnode)
+			break;
 
-			if (!pushed) {
-				free_page((unsigned long) bnode);
-				break;
-			}
+		raw_spin_lock_irqsave(&krcp->lock, flags);
+		pushed = put_cached_bnode(krcp, bnode);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+		if (!pushed) {
+			free_page((unsigned long) bnode);
+			break;
 		}
 	}
 
-- 
cgit v1.2.3


From 38269096351806bf7315f971c53205b676ada259 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Wed, 22 Jun 2022 22:51:02 +0000
Subject: rcu/kfree: Fix kfree_rcu_shrink_count() return value

As per the comments in include/linux/shrinker.h, .count_objects callback
should return the number of freeable items, but if there are no objects
to free, SHRINK_EMPTY should be returned. The only time 0 is returned
should be when we are unable to determine the number of objects, or the
cache should be skipped for another reason.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index eb435941e92f..3d234d536d4c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3372,7 +3372,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 		atomic_set(&krcp->backoff_page_cache_fill, 1);
 	}
 
-	return count;
+	return count == 0 ? SHRINK_EMPTY : count;
 }
 
 static unsigned long
-- 
cgit v1.2.3


From 51824b780b719c53113dc39e027fbf670dc66028 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Thu, 30 Jun 2022 18:33:35 +0200
Subject: rcu/kvfree: Update KFREE_DRAIN_JIFFIES interval

Currently the monitor work is scheduled with a fixed interval of HZ/20,
which is roughly 50 milliseconds. The drawback of this approach is
low utilization of the 512 page slots in scenarios with infrequence
kvfree_rcu() calls.  For example on an Android system:

<snip>
  kworker/3:3-507     [003] ....   470.286305: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000d0f0dde5 nr_records=6
  kworker/6:1-76      [006] ....   470.416613: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000ea0d6556 nr_records=1
  kworker/6:1-76      [006] ....   470.416625: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000003e025849 nr_records=9
  kworker/3:3-507     [003] ....   471.390000: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000815a8713 nr_records=48
  kworker/1:1-73      [001] ....   471.725785: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000fda9bf20 nr_records=3
  kworker/1:1-73      [001] ....   471.725833: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000a425b67b nr_records=76
  kworker/0:4-1411    [000] ....   472.085673: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000007996be9d nr_records=1
  kworker/0:4-1411    [000] ....   472.085728: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000d0f0dde5 nr_records=5
  kworker/6:1-76      [006] ....   472.260340: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x0000000065630ee4 nr_records=102
<snip>

In many cases, out of 512 slots, fewer than 10 were actually used.
In order to improve batching and make utilization more efficient this
commit sets a drain interval to a fixed 5-seconds interval. Floods are
detected when a page fills quickly, and in that case, the reclaim work
is re-scheduled for the next scheduling-clock tick (jiffy).

After this change:

<snip>
  kworker/7:1-371     [007] ....  5630.725708: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000005ab0ffb3 nr_records=121
  kworker/7:1-371     [007] ....  5630.989702: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x0000000060c84761 nr_records=47
  kworker/7:1-371     [007] ....  5630.989714: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000000babf308 nr_records=510
  kworker/7:1-371     [007] ....  5631.553790: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000bb7bd0ef nr_records=169
  kworker/7:1-371     [007] ....  5631.553808: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x0000000044c78753 nr_records=510
  kworker/5:6-9428    [005] ....  5631.746102: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000d98519aa nr_records=123
  kworker/4:7-9434    [004] ....  5632.001758: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000526c9d44 nr_records=322
  kworker/4:7-9434    [004] ....  5632.002073: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000002c6a8afa nr_records=185
  kworker/7:1-371     [007] ....  5632.277515: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000007f4a962f nr_records=510
<snip>

Here, all but one of the cases, more than one hundreds slots were used,
representing an order-of-magnitude improvement.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3d234d536d4c..7b90478b752e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2832,7 +2832,7 @@ EXPORT_SYMBOL_GPL(call_rcu);
 
 
 /* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_DRAIN_JIFFIES (5 * HZ)
 #define KFREE_N_BATCHES 2
 #define FREE_N_CHANNELS 2
 
@@ -3093,6 +3093,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
 	return !!krcp->head;
 }
 
+static void
+schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
+{
+	long delay, delay_left;
+
+	delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
+	if (delayed_work_pending(&krcp->monitor_work)) {
+		delay_left = krcp->monitor_work.timer.expires - jiffies;
+		if (delay < delay_left)
+			mod_delayed_work(system_wq, &krcp->monitor_work, delay);
+		return;
+	}
+	queue_delayed_work(system_wq, &krcp->monitor_work, delay);
+}
+
 /*
  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  */
@@ -3150,7 +3165,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
 	// work to repeat an attempt. Because previous batches are
 	// still in progress.
 	if (need_offload_krc(krcp))
-		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+		schedule_delayed_monitor_work(krcp);
 
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
@@ -3339,7 +3354,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
 	// Set timer to drain after KFREE_DRAIN_JIFFIES.
 	if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
-		schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+		schedule_delayed_monitor_work(krcp);
 
 unlock_return:
 	krc_this_cpu_unlock(krcp, flags);
@@ -3415,7 +3430,7 @@ void __init kfree_rcu_scheduler_running(void)
 
 		raw_spin_lock_irqsave(&krcp->lock, flags);
 		if (need_offload_krc(krcp))
-			schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
+			schedule_delayed_monitor_work(krcp);
 		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 }
-- 
cgit v1.2.3


From 5334da2af25eceb6a88ae162e69d6586b5cb9abc Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Fri, 17 Jun 2022 22:15:19 +0800
Subject: rcu/nocb: Choose the right rcuog/rcuop kthreads to output

The show_rcu_nocb_gp_state() function is supposed to dump out the rcuog
kthread and the show_rcu_nocb_state() function is supposed to dump out
the rcuo[ps] kthread.  Currently, both do a mixture, which is not optimal
for debugging, even though it does not affect functionality.

This commit therefore adjusts these two functions to focus on their
respective kthreads.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index a8f574d8850d..f20aec4f4394 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1452,8 +1452,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
 		(long)rdp->nocb_gp_seq,
 		rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
 		rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
-		rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
-		show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+		rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+		show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread));
 }
 
 /* Dump out nocb kthread state for the specified rcu_data structure. */
@@ -1497,7 +1497,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
 		".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
 		rcu_segcblist_n_cbs(&rdp->cblist),
 		rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
-		rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+		rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
 		show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
 
 	/* It is OK for GP kthreads to have GP state. */
-- 
cgit v1.2.3


From 638dce227a41534bbc4fdd73280e73e0b3570048 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Aug 2022 17:39:28 -0700
Subject: rcu/nocb: Add CPU number to CPU-{,de}offload failure messages

Offline CPUs cannot be offloaded or deoffloaded.  Any attempt to offload
or deoffload an offline CPU causes a message to be printed on the console,
which is good, but this message does not contain the CPU number, which
is bad.  Such a CPU number can be helpful when debugging, as it gives a
clear indication that the CPU in question is in fact offline.  This commit
therefore adds the CPU number to the CPU-{,de}offload failure messages.

Cc: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_nocb.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index f20aec4f4394..0a5f0ef41484 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1111,7 +1111,7 @@ int rcu_nocb_cpu_deoffload(int cpu)
 			if (!ret)
 				cpumask_clear_cpu(cpu, rcu_nocb_mask);
 		} else {
-			pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+			pr_info("NOCB: Cannot CB-deoffload offline CPU %d\n", rdp->cpu);
 			ret = -EINVAL;
 		}
 	}
@@ -1196,7 +1196,7 @@ int rcu_nocb_cpu_offload(int cpu)
 			if (!ret)
 				cpumask_set_cpu(cpu, rcu_nocb_mask);
 		} else {
-			pr_info("NOCB: Can't CB-offload an offline CPU\n");
+			pr_info("NOCB: Cannot CB-offload offline CPU %d\n", rdp->cpu);
 			ret = -EINVAL;
 		}
 	}
-- 
cgit v1.2.3


From 91a967fd6934abc0c7e4b0d26728e38674278707 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 Jul 2022 15:37:05 -0700
Subject: rcu: Add full-sized polling for get_completed*() and poll_state*()

The get_completed_synchronize_rcu() and poll_state_synchronize_rcu()
APIs compress the combined expedited and normal grace-period states into a
single unsigned long, which conserves storage, but can miss grace periods
in certain cases involving overlapping normal and expedited grace periods.
Missing the occasional grace period is usually not a problem, but there
are use cases that care about each and every grace period.

This commit therefore adds the first members of the full-state RCU
grace-period polling API, namely the get_completed_synchronize_rcu_full()
and poll_state_synchronize_rcu_full() functions.  These use up to three
times the storage (rcu_gp_oldstate structure instead of unsigned long),
but which are guaranteed not to miss grace periods, at least in situations
where the single-CPU grace-period optimization does not apply.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h |  3 ++
 include/linux/rcutiny.h  |  9 ++++++
 include/linux/rcutree.h  |  8 +++++
 kernel/rcu/rcutorture.c  |  9 ++++++
 kernel/rcu/tiny.c        | 10 +++++++
 kernel/rcu/tree.c        | 76 +++++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index f527f27e6438..faaa174dfb27 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -42,7 +42,10 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
 void rcu_barrier_tasks(void);
 void rcu_barrier_tasks_rude(void);
 void synchronize_rcu(void);
+
+struct rcu_gp_oldstate;
 unsigned long get_completed_synchronize_rcu(void);
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 
 #ifdef CONFIG_PREEMPT_RCU
 
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 62815c0a2dce..1fbff8600d92 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,10 +14,19 @@
 
 #include <asm/param.h> /* for HZ */
 
+struct rcu_gp_oldstate {
+	unsigned long rgos_norm;
+};
+
 unsigned long get_state_synchronize_rcu(void);
 unsigned long start_poll_synchronize_rcu(void);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 
+static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	return poll_state_synchronize_rcu(rgosp->rgos_norm);
+}
+
 static inline void cond_synchronize_rcu(unsigned long oldstate)
 {
 	might_sleep();
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 47eaa4cb0df7..4ccbc3aa9dc2 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -40,11 +40,19 @@ bool rcu_eqs_special_set(int cpu);
 void rcu_momentary_dyntick_idle(void);
 void kfree_rcu_scheduler_running(void);
 bool rcu_gp_might_be_stalled(void);
+
+struct rcu_gp_oldstate {
+	unsigned long rgos_norm;
+	unsigned long rgos_exp;
+	unsigned long rgos_polled;
+};
+
 unsigned long start_poll_synchronize_rcu_expedited(void);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
 unsigned long get_state_synchronize_rcu(void);
 unsigned long start_poll_synchronize_rcu(void);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 void cond_synchronize_rcu(unsigned long oldstate);
 
 bool rcu_is_idle_cpu(int cpu);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d8e1b270a065..b31e6ed64d1b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -336,8 +336,10 @@ struct rcu_torture_ops {
 	void (*cond_sync_exp)(unsigned long oldstate);
 	unsigned long (*get_gp_state)(void);
 	unsigned long (*get_gp_completed)(void);
+	void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*start_gp_poll)(void);
 	bool (*poll_gp_state)(unsigned long oldstate);
+	bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
 	void (*cond_sync)(unsigned long oldstate);
 	call_rcu_func_t call;
 	void (*cb_barrier)(void);
@@ -503,8 +505,10 @@ static struct rcu_torture_ops rcu_ops = {
 	.exp_sync		= synchronize_rcu_expedited,
 	.get_gp_state		= get_state_synchronize_rcu,
 	.get_gp_completed	= get_completed_synchronize_rcu,
+	.get_gp_completed_full	= get_completed_synchronize_rcu_full,
 	.start_gp_poll		= start_poll_synchronize_rcu,
 	.poll_gp_state		= poll_state_synchronize_rcu,
+	.poll_gp_state_full	= poll_state_synchronize_rcu_full,
 	.cond_sync		= cond_synchronize_rcu,
 	.get_gp_state_exp	= get_state_synchronize_rcu,
 	.start_gp_poll_exp	= start_poll_synchronize_rcu_expedited,
@@ -1212,6 +1216,7 @@ rcu_torture_writer(void *arg)
 	bool boot_ended;
 	bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
 	unsigned long cookie;
+	struct rcu_gp_oldstate cookie_full;
 	int expediting = 0;
 	unsigned long gp_snap;
 	int i;
@@ -1277,6 +1282,10 @@ rcu_torture_writer(void *arg)
 				}
 				cur_ops->readunlock(idx);
 			}
+			if (cur_ops->get_gp_completed_full && cur_ops->poll_gp_state_full) {
+				cur_ops->get_gp_completed_full(&cookie_full);
+				WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+			}
 			switch (synctype[torture_random(&rand) % nsynctypes]) {
 			case RTWS_DEF_FREE:
 				rcu_torture_writer_state = RTWS_DEF_FREE;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f0561ee16b9c..435edc785412 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -183,6 +183,16 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+/*
+ * Store a grace-period-counter "cookie".  For more information,
+ * see the Tree RCU header comment.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
 /*
  * Return a grace-period-counter "cookie".  For more information,
  * see the Tree RCU header comment.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79aea7df4345..d47c9b6d8106 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3522,6 +3522,22 @@ void synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
+/**
+ * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie
+ * @rgosp: Place to put state cookie
+ *
+ * Stores into @rgosp a value that will always be treated by functions
+ * like poll_state_synchronize_rcu_full() as a cookie whose grace period
+ * has already completed.
+ */
+void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
+	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
+	rgosp->rgos_polled = RCU_GET_STATE_COMPLETED;
+}
+EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
+
 /**
  * get_state_synchronize_rcu - Snapshot current RCU state
  *
@@ -3580,7 +3596,7 @@ unsigned long start_poll_synchronize_rcu(void)
 EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 
 /**
- * poll_state_synchronize_rcu - Conditionally wait for an RCU grace period
+ * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
  *
  * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
  *
@@ -3595,9 +3611,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  * But counter wrap is harmless.  If the counter wraps, we have waited for
  * more than a billion grace periods (and way more on a 64-bit system!).
  * Those needing to keep oldstate values for very long time periods
- * (many hours even on 32-bit systems) should check them occasionally
- * and either refresh them or set a flag indicating that the grace period
- * has completed.
+ * (many hours even on 32-bit systems) should check them occasionally and
+ * either refresh them or set a flag indicating that the grace period has
+ * completed.  Alternatively, they can use get_completed_synchronize_rcu()
+ * to get a guaranteed-completed grace-period state.
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
@@ -3615,6 +3632,57 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 
+/**
+ * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed?
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * If a full RCU grace period has elapsed since the earlier call from
+ * which *rgosp was obtained, return @true, otherwise return @false.
+ * If @false is returned, it is the caller's responsibility to invoke this
+ * function later on until it does return @true.  Alternatively, the caller
+ * can explicitly wait for a grace period, for example, by passing @rgosp
+ * to cond_synchronize_rcu() or by directly invoking synchronize_rcu().
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited
+ * for more than a billion grace periods (and way more on a 64-bit
+ * system!).  Those needing to keep rcu_gp_oldstate values for very
+ * long time periods (many hours even on 32-bit systems) should check
+ * them occasionally and either refresh them or set a flag indicating
+ * that the grace period has completed.  Alternatively, they can use
+ * get_completed_synchronize_rcu_full() to get a guaranteed-completed
+ * grace-period state.
+ *
+ * This function provides the same memory-ordering guarantees that would
+ * be provided by a synchronize_rcu() that was invoked at the call to
+ * the function that provided @rgosp, and that returned at the end of this
+ * function.  And this guarantee requires that the root rcu_node structure's
+ * ->gp_seq field be checked instead of that of the rcu_state structure.
+ * The problem is that the just-ending grace-period's callbacks can be
+ * invoked between the time that the root rcu_node structure's ->gp_seq
+ * field is updated and the time that the rcu_state structure's ->gp_seq
+ * field is updated.  Therefore, if a single synchronize_rcu() is to
+ * cause a subsequent poll_state_synchronize_rcu_full() to return @true,
+ * then the root rcu_node structure is the one that needs to be polled.
+ */
+bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	smp_mb(); // Order against root rcu_node structure grace-period cleanup.
+	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
+	    rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp) ||
+	    rgosp->rgos_polled == RCU_GET_STATE_COMPLETED ||
+	    rcu_seq_done_exact(&rcu_state.gp_seq_polled, rgosp->rgos_polled)) {
+		smp_mb(); /* Ensure GP ends before subsequent accesses. */
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
+
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
-- 
cgit v1.2.3


From 3fdefca9b42c8bebe3beea5c1a067c9718ca0fc7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 Jul 2022 19:58:13 -0700
Subject: rcu: Add full-sized polling for get_state()

The get_state_synchronize_rcu() API compresses the combined expedited and
normal grace-period states into a single unsigned long, which conserves
storage, but can miss grace periods in certain cases involving overlapping
normal and expedited grace periods.  Missing the occasional grace period
is usually not a problem, but there are use cases that care about each
and every grace period.

This commit therefore adds the next member of the full-state RCU
grace-period polling API, namely the get_state_synchronize_rcu_full()
function.  This uses up to three times the storage (rcu_gp_oldstate
structure instead of unsigned long), but is guaranteed not to miss
grace periods.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  6 ++++++
 include/linux/rcutree.h |  1 +
 kernel/rcu/rcutorture.c | 10 ++++++----
 kernel/rcu/tree.c       | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 1fbff8600d92..6e299955c4e9 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -19,6 +19,12 @@ struct rcu_gp_oldstate {
 };
 
 unsigned long get_state_synchronize_rcu(void);
+
+static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	rgosp->rgos_norm = get_state_synchronize_rcu();
+}
+
 unsigned long start_poll_synchronize_rcu(void);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 4ccbc3aa9dc2..7b769f1b417a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -50,6 +50,7 @@ struct rcu_gp_oldstate {
 unsigned long start_poll_synchronize_rcu_expedited(void);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
 unsigned long get_state_synchronize_rcu(void);
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 unsigned long start_poll_synchronize_rcu(void);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b31e6ed64d1b..4f196ebce7f2 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -335,6 +335,7 @@ struct rcu_torture_ops {
 	bool (*poll_gp_state_exp)(unsigned long oldstate);
 	void (*cond_sync_exp)(unsigned long oldstate);
 	unsigned long (*get_gp_state)(void);
+	void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*get_gp_completed)(void);
 	void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*start_gp_poll)(void);
@@ -504,6 +505,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.sync			= synchronize_rcu,
 	.exp_sync		= synchronize_rcu_expedited,
 	.get_gp_state		= get_state_synchronize_rcu,
+	.get_gp_state_full	= get_state_synchronize_rcu_full,
 	.get_gp_completed	= get_completed_synchronize_rcu,
 	.get_gp_completed_full	= get_completed_synchronize_rcu_full,
 	.start_gp_poll		= start_poll_synchronize_rcu,
@@ -1293,12 +1295,12 @@ rcu_torture_writer(void *arg)
 				break;
 			case RTWS_EXP_SYNC:
 				rcu_torture_writer_state = RTWS_EXP_SYNC;
-				if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
-					cookie = cur_ops->get_gp_state();
+				if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+					cur_ops->get_gp_state_full(&cookie_full);
 				cur_ops->exp_sync();
 				cur_ops->exp_sync();
-				if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
-					WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+				if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+					WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
 				rcu_torture_pipe_update(old_rp);
 				break;
 			case RTWS_COND_GET:
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d47c9b6d8106..3fa79ee78b5b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1755,6 +1755,8 @@ static noinline void rcu_gp_cleanup(void)
 			dump_blkd_tasks(rnp, 10);
 		WARN_ON_ONCE(rnp->qsmask);
 		WRITE_ONCE(rnp->gp_seq, new_gp_seq);
+		if (!rnp->parent)
+			smp_mb(); // Order against failing poll_state_synchronize_rcu_full().
 		rdp = this_cpu_ptr(&rcu_data);
 		if (rnp == rdp->mynode)
 			needgp = __note_gp_changes(rnp, rdp) || needgp;
@@ -3556,6 +3558,37 @@ unsigned long get_state_synchronize_rcu(void)
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
+/**
+ * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited
+ * @rgosp: location to place combined normal/expedited grace-period state
+ *
+ * Places the normal and expedited grace-period states in @rgosp.  This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * The rcu_gp_oldstate structure takes up twice the memory of an unsigned
+ * long, but is guaranteed to see all grace periods.  In contrast, the
+ * combined state occupies less memory, but can sometimes fail to take
+ * grace periods into account.
+ *
+ * This does not guarantee that the needed grace period will actually
+ * start.
+ */
+void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	struct rcu_node *rnp = rcu_get_root();
+
+	/*
+	 * Any prior manipulation of RCU-protected data must happen
+	 * before the loads from ->gp_seq and ->expedited_sequence.
+	 */
+	smp_mb();  /* ^^^ */
+	rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+	rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
+	rgosp->rgos_polled = rcu_seq_snap(&rcu_state.gp_seq_polled);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
+
 /**
  * start_poll_synchronize_rcu - Snapshot and start RCU grace period
  *
-- 
cgit v1.2.3


From ccb42229fb34bc3e93f7aa081da3e78eac68cd27 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 29 Jul 2022 22:05:17 -0700
Subject: rcutorture: Abstract synchronous and polled API testing

This commit abstracts a do_rtws_sync() function that does synchronous
grace-period testing, but also testing the polled API 25% of the time
each for the normal and full-state variants of the polled API.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 48 ++++++++++++++++++++++++++++++++++++------------
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 4f196ebce7f2..c3c94e343eb2 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1207,6 +1207,40 @@ static void rcu_torture_write_types(void)
 	}
 }
 
+/*
+ * Do the specified rcu_torture_writer() synchronous grace period,
+ * while also testing out the polled APIs.  Note well that the single-CPU
+ * grace-period optimizations must be accounted for.
+ */
+static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
+{
+	unsigned long cookie;
+	struct rcu_gp_oldstate cookie_full;
+	bool dopoll;
+	bool dopoll_full;
+	unsigned long r = torture_random(trsp);
+
+	dopoll = cur_ops->get_gp_state && cur_ops->poll_gp_state && !(r & 0x300);
+	dopoll_full = cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full && !(r & 0xc00);
+	if (dopoll || dopoll_full)
+		cpus_read_lock();
+	if (dopoll)
+		cookie = cur_ops->get_gp_state();
+	if (dopoll_full)
+		cur_ops->get_gp_state_full(&cookie_full);
+	if (dopoll || (!IS_ENABLED(CONFIG_TINY_RCU) && dopoll_full && num_online_cpus() <= 1))
+		sync();
+	sync();
+	WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
+		  "%s: Cookie check 3 failed %pS() online %*pbl.",
+		  __func__, sync, cpumask_pr_args(cpu_online_mask));
+	WARN_ONCE(dopoll_full && !cur_ops->poll_gp_state_full(&cookie_full),
+		  "%s: Cookie check 4 failed %pS() online %*pbl",
+		  __func__, sync, cpumask_pr_args(cpu_online_mask));
+	if (dopoll || dopoll_full)
+		cpus_read_unlock();
+}
+
 /*
  * RCU torture writer kthread.  Repeatedly substitutes a new structure
  * for that pointed to by rcu_torture_current, freeing the old structure
@@ -1295,12 +1329,7 @@ rcu_torture_writer(void *arg)
 				break;
 			case RTWS_EXP_SYNC:
 				rcu_torture_writer_state = RTWS_EXP_SYNC;
-				if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
-					cur_ops->get_gp_state_full(&cookie_full);
-				cur_ops->exp_sync();
-				cur_ops->exp_sync();
-				if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
-					WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+				do_rtws_sync(&rand, cur_ops->exp_sync);
 				rcu_torture_pipe_update(old_rp);
 				break;
 			case RTWS_COND_GET:
@@ -1339,12 +1368,7 @@ rcu_torture_writer(void *arg)
 				break;
 			case RTWS_SYNC:
 				rcu_torture_writer_state = RTWS_SYNC;
-				if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
-					cookie = cur_ops->get_gp_state();
-				cur_ops->sync();
-				cur_ops->sync();
-				if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
-					WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie));
+				do_rtws_sync(&rand, cur_ops->sync);
 				rcu_torture_pipe_update(old_rp);
 				break;
 			default:
-- 
cgit v1.2.3


From ed7d2f1abee48a90fae5fdf5c60d71803a5a1d10 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Aug 2022 16:57:17 -0700
Subject: rcutorture: Allow per-RCU-flavor polled double-GP check

Only vanilla RCU needs a double grace period for its compressed
polled grace-period old-state cookie.  This commit therefore adds an
rcu_torture_ops per-flavor function ->poll_need_2gp to allow this check
to be adapted to the RCU flavor under test.  A NULL pointer for this
function says that doubled grace periods are never needed.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index c3c94e343eb2..f2564c7633a8 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -341,6 +341,7 @@ struct rcu_torture_ops {
 	unsigned long (*start_gp_poll)(void);
 	bool (*poll_gp_state)(unsigned long oldstate);
 	bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
+	bool (*poll_need_2gp)(bool poll, bool poll_full);
 	void (*cond_sync)(unsigned long oldstate);
 	call_rcu_func_t call;
 	void (*cb_barrier)(void);
@@ -492,6 +493,11 @@ static void rcu_sync_torture_init(void)
 	INIT_LIST_HEAD(&rcu_torture_removed);
 }
 
+static bool rcu_poll_need_2gp(bool poll, bool poll_full)
+{
+	return poll || (!IS_ENABLED(CONFIG_TINY_RCU) && poll_full && num_online_cpus() <= 1);
+}
+
 static struct rcu_torture_ops rcu_ops = {
 	.ttype			= RCU_FLAVOR,
 	.init			= rcu_sync_torture_init,
@@ -511,6 +517,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.start_gp_poll		= start_poll_synchronize_rcu,
 	.poll_gp_state		= poll_state_synchronize_rcu,
 	.poll_gp_state_full	= poll_state_synchronize_rcu_full,
+	.poll_need_2gp		= rcu_poll_need_2gp,
 	.cond_sync		= cond_synchronize_rcu,
 	.get_gp_state_exp	= get_state_synchronize_rcu,
 	.start_gp_poll_exp	= start_poll_synchronize_rcu_expedited,
@@ -1228,7 +1235,7 @@ static void do_rtws_sync(struct torture_random_state *trsp, void (*sync)(void))
 		cookie = cur_ops->get_gp_state();
 	if (dopoll_full)
 		cur_ops->get_gp_state_full(&cookie_full);
-	if (dopoll || (!IS_ENABLED(CONFIG_TINY_RCU) && dopoll_full && num_online_cpus() <= 1))
+	if (cur_ops->poll_need_2gp && cur_ops->poll_need_2gp(dopoll, dopoll_full))
 		sync();
 	sync();
 	WARN_ONCE(dopoll && !cur_ops->poll_gp_state(cookie),
-- 
cgit v1.2.3


From d594231aa50a0f113cfb4749da5162e6a0c3fa73 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 1 Aug 2022 17:33:24 -0700
Subject: rcutorture: Verify RCU reader prevents full polling from completing

This commit adds a test to rcu_torture_writer() that verifies that a
->get_gp_state_full() and ->poll_gp_state_full() polled grace-period
sequence does not claim that a grace period elapsed within the confines
of the corresponding read-side critical section.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f2564c7633a8..050f4d0a987f 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1309,6 +1309,8 @@ rcu_torture_writer(void *arg)
 			atomic_inc(&rcu_torture_wcount[i]);
 			WRITE_ONCE(old_rp->rtort_pipe_count,
 				   old_rp->rtort_pipe_count + 1);
+
+			// Make sure readers block polled grace periods.
 			if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
 				idx = cur_ops->readlock();
 				cookie = cur_ops->get_gp_state();
@@ -1325,9 +1327,20 @@ rcu_torture_writer(void *arg)
 				}
 				cur_ops->readunlock(idx);
 			}
-			if (cur_ops->get_gp_completed_full && cur_ops->poll_gp_state_full) {
-				cur_ops->get_gp_completed_full(&cookie_full);
-				WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+			if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full) {
+				idx = cur_ops->readlock();
+				cur_ops->get_gp_state_full(&cookie_full);
+				WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+					  "%s: Cookie check 5 failed %s(%d) online %*pbl\n",
+					  __func__,
+					  rcu_torture_writer_state_getname(),
+					  rcu_torture_writer_state,
+					  cpumask_pr_args(cpu_online_mask));
+				if (cur_ops->get_gp_completed_full) {
+					cur_ops->get_gp_completed_full(&cookie_full);
+					WARN_ON_ONCE(!cur_ops->poll_gp_state_full(&cookie_full));
+				}
+				cur_ops->readunlock(idx);
 			}
 			switch (synctype[torture_random(&rand) % nsynctypes]) {
 			case RTWS_DEF_FREE:
-- 
cgit v1.2.3


From 37d6ade31cf83305495a2875ab5d283e17322032 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Aug 2022 10:22:12 -0700
Subject: rcutorture: Remove redundant RTWS_DEF_FREE check

This check does nothing because the state at this point in the code
because the rcu_torture_writer_state value is guaranteed to instead
be RTWS_REPLACE.  This commit therefore removes this check.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 050f4d0a987f..236bd6b57277 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1314,8 +1314,7 @@ rcu_torture_writer(void *arg)
 			if (cur_ops->get_gp_state && cur_ops->poll_gp_state) {
 				idx = cur_ops->readlock();
 				cookie = cur_ops->get_gp_state();
-				WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE &&
-					  cur_ops->poll_gp_state(cookie),
+				WARN_ONCE(cur_ops->poll_gp_state(cookie),
 					  "%s: Cookie check 1 failed %s(%d) %lu->%lu\n",
 					  __func__,
 					  rcu_torture_writer_state_getname(),
-- 
cgit v1.2.3


From f4754ad2922e5a2b08c8aecf33d1ec03d7219fb4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Aug 2022 16:40:48 -0700
Subject: rcutorture: Verify long-running reader prevents full polling from
 completing

This commit adds full-state polling checks to accompany the old-style
polling checks in the rcu_torture_one_read() function.  If a polling
cycle within an RCU reader completes, a WARN_ONCE() is triggered.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 236bd6b57277..3d8542010847 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1770,6 +1770,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
 static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
 {
 	unsigned long cookie;
+	struct rcu_gp_oldstate cookie_full;
 	int i;
 	unsigned long started;
 	unsigned long completed;
@@ -1787,6 +1788,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
 	rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
 	if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
 		cookie = cur_ops->get_gp_state();
+	if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+		cur_ops->get_gp_state_full(&cookie_full);
 	started = cur_ops->get_gp_seq();
 	ts = rcu_trace_clock_local();
 	p = rcu_dereference_check(rcu_torture_current,
@@ -1827,6 +1830,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
 			  rcu_torture_writer_state_getname(),
 			  rcu_torture_writer_state,
 			  cookie, cur_ops->get_gp_state());
+	if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+		WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+			  "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+			  __func__,
+			  rcu_torture_writer_state_getname(),
+			  rcu_torture_writer_state,
+			  cpumask_pr_args(cpu_online_mask));
 	rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
 	WARN_ON_ONCE(readstate);
 	// This next splat is expected behavior if leakpointer, especially
-- 
cgit v1.2.3


From 76ea364161e72b1878126edf8d507d2a62fb47b0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Aug 2022 17:04:54 -0700
Subject: rcu: Add full-sized polling for start_poll()

The start_poll_synchronize_rcu() API compresses the combined expedited and
normal grace-period states into a single unsigned long, which conserves
storage, but can miss grace periods in certain cases involving overlapping
normal and expedited grace periods.  Missing the occasional grace period
is usually not a problem, but there are use cases that care about each
and every grace period.

This commit therefore adds the next member of the full-state RCU
grace-period polling API, namely the start_poll_synchronize_rcu_full()
function.  This uses up to three times the storage (rcu_gp_oldstate
structure instead of unsigned long), but is guaranteed not to miss
grace periods.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  6 +++++
 include/linux/rcutree.h |  1 +
 kernel/rcu/rcutorture.c | 49 +++++++++++++++++++++++++++++++++--------
 kernel/rcu/tree.c       | 58 ++++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 92 insertions(+), 22 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 6e299955c4e9..6bc30e46a819 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -26,6 +26,12 @@ static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 }
 
 unsigned long start_poll_synchronize_rcu(void);
+
+static inline void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	rgosp->rgos_norm = start_poll_synchronize_rcu();
+}
+
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 
 static inline bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 7b769f1b417a..8f2e0f0b26f6 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -52,6 +52,7 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate);
 unsigned long get_state_synchronize_rcu(void);
 void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 unsigned long start_poll_synchronize_rcu(void);
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 void cond_synchronize_rcu(unsigned long oldstate);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 3d8542010847..68387ccc7ddf 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -88,6 +88,7 @@ torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
 torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
 torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
+torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
 torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
 torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
 torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -198,12 +199,14 @@ static int rcu_torture_writer_state;
 #define RTWS_COND_SYNC		7
 #define RTWS_COND_SYNC_EXP	8
 #define RTWS_POLL_GET		9
-#define RTWS_POLL_GET_EXP	10
-#define RTWS_POLL_WAIT		11
-#define RTWS_POLL_WAIT_EXP	12
-#define RTWS_SYNC		13
-#define RTWS_STUTTER		14
-#define RTWS_STOPPING		15
+#define RTWS_POLL_GET_FULL	10
+#define RTWS_POLL_GET_EXP	11
+#define RTWS_POLL_WAIT		12
+#define RTWS_POLL_WAIT_FULL	13
+#define RTWS_POLL_WAIT_EXP	14
+#define RTWS_SYNC		15
+#define RTWS_STUTTER		16
+#define RTWS_STOPPING		17
 static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_FIXED_DELAY",
 	"RTWS_DELAY",
@@ -215,8 +218,10 @@ static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_COND_SYNC",
 	"RTWS_COND_SYNC_EXP",
 	"RTWS_POLL_GET",
+	"RTWS_POLL_GET_FULL",
 	"RTWS_POLL_GET_EXP",
 	"RTWS_POLL_WAIT",
+	"RTWS_POLL_WAIT_FULL",
 	"RTWS_POLL_WAIT_EXP",
 	"RTWS_SYNC",
 	"RTWS_STUTTER",
@@ -339,6 +344,7 @@ struct rcu_torture_ops {
 	unsigned long (*get_gp_completed)(void);
 	void (*get_gp_completed_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*start_gp_poll)(void);
+	void (*start_gp_poll_full)(struct rcu_gp_oldstate *rgosp);
 	bool (*poll_gp_state)(unsigned long oldstate);
 	bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
 	bool (*poll_need_2gp)(bool poll, bool poll_full);
@@ -515,6 +521,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.get_gp_completed	= get_completed_synchronize_rcu,
 	.get_gp_completed_full	= get_completed_synchronize_rcu_full,
 	.start_gp_poll		= start_poll_synchronize_rcu,
+	.start_gp_poll_full	= start_poll_synchronize_rcu_full,
 	.poll_gp_state		= poll_state_synchronize_rcu,
 	.poll_gp_state_full	= poll_state_synchronize_rcu_full,
 	.poll_need_2gp		= rcu_poll_need_2gp,
@@ -1163,13 +1170,13 @@ static void rcu_torture_write_types(void)
 {
 	bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
 	bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
-	bool gp_sync1 = gp_sync;
+	bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
 
 	/* Initialize synctype[] array.  If none set, take default. */
 	if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp &&
-	    !gp_normal1 && !gp_poll1 && !gp_sync1)
+	    !gp_normal1 && !gp_poll1 && !gp_poll_full1 && !gp_sync1)
 		gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
-			   gp_normal1 = gp_poll1 = gp_sync1 = true;
+			   gp_normal1 = gp_poll1 = gp_poll_full1 = gp_sync1 = true;
 	if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
 		synctype[nsynctypes++] = RTWS_COND_GET;
 		pr_info("%s: Testing conditional GPs.\n", __func__);
@@ -1200,6 +1207,12 @@ static void rcu_torture_write_types(void)
 	} else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) {
 		pr_alert("%s: gp_poll without primitives.\n", __func__);
 	}
+	if (gp_poll_full1 && cur_ops->start_gp_poll_full && cur_ops->poll_gp_state_full) {
+		synctype[nsynctypes++] = RTWS_POLL_GET_FULL;
+		pr_info("%s: Testing polling full-state GPs.\n", __func__);
+	} else if (gp_poll_full && (!cur_ops->start_gp_poll_full || !cur_ops->poll_gp_state_full)) {
+		pr_alert("%s: gp_poll_full without primitives.\n", __func__);
+	}
 	if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) {
 		synctype[nsynctypes++] = RTWS_POLL_GET_EXP;
 		pr_info("%s: Testing polling expedited GPs.\n", __func__);
@@ -1262,6 +1275,7 @@ rcu_torture_writer(void *arg)
 	struct rcu_gp_oldstate cookie_full;
 	int expediting = 0;
 	unsigned long gp_snap;
+	struct rcu_gp_oldstate gp_snap_full;
 	int i;
 	int idx;
 	int oldnice = task_nice(current);
@@ -1376,6 +1390,15 @@ rcu_torture_writer(void *arg)
 								  &rand);
 				rcu_torture_pipe_update(old_rp);
 				break;
+			case RTWS_POLL_GET_FULL:
+				rcu_torture_writer_state = RTWS_POLL_GET_FULL;
+				cur_ops->start_gp_poll_full(&gp_snap_full);
+				rcu_torture_writer_state = RTWS_POLL_WAIT_FULL;
+				while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+								  &rand);
+				rcu_torture_pipe_update(old_rp);
+				break;
 			case RTWS_POLL_GET_EXP:
 				rcu_torture_writer_state = RTWS_POLL_GET_EXP;
 				gp_snap = cur_ops->start_gp_poll_exp();
@@ -1454,6 +1477,7 @@ static int
 rcu_torture_fakewriter(void *arg)
 {
 	unsigned long gp_snap;
+	struct rcu_gp_oldstate gp_snap_full;
 	DEFINE_TORTURE_RANDOM(rand);
 
 	VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
@@ -1499,6 +1523,13 @@ rcu_torture_fakewriter(void *arg)
 								  &rand);
 				}
 				break;
+			case RTWS_POLL_GET_FULL:
+				cur_ops->start_gp_poll_full(&gp_snap_full);
+				while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+								  &rand);
+				}
+				break;
 			case RTWS_POLL_GET_EXP:
 				gp_snap = cur_ops->start_gp_poll_exp();
 				while (!cur_ops->poll_gp_state_exp(gp_snap)) {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3fa79ee78b5b..89572385fd1a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3589,22 +3589,13 @@ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
 
-/**
- * start_poll_synchronize_rcu - Snapshot and start RCU grace period
- *
- * Returns a cookie that is used by a later call to cond_synchronize_rcu()
- * or poll_state_synchronize_rcu() to determine whether or not a full
- * grace period has elapsed in the meantime.  If the needed grace period
- * is not already slated to start, notifies RCU core of the need for that
- * grace period.
- *
- * Interrupts must be enabled for the case where it is necessary to awaken
- * the grace-period kthread.
+/*
+ * Helper function for start_poll_synchronize_rcu() and
+ * start_poll_synchronize_rcu_full().
  */
-unsigned long start_poll_synchronize_rcu(void)
+static void start_poll_synchronize_rcu_common(void)
 {
 	unsigned long flags;
-	unsigned long gp_seq = get_state_synchronize_rcu();
 	bool needwake;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
@@ -3624,10 +3615,51 @@ unsigned long start_poll_synchronize_rcu(void)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	if (needwake)
 		rcu_gp_kthread_wake();
+}
+
+/**
+ * start_poll_synchronize_rcu - Snapshot and start RCU grace period
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * or poll_state_synchronize_rcu() to determine whether or not a full
+ * grace period has elapsed in the meantime.  If the needed grace period
+ * is not already slated to start, notifies RCU core of the need for that
+ * grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+unsigned long start_poll_synchronize_rcu(void)
+{
+	unsigned long gp_seq = get_state_synchronize_rcu();
+
+	start_poll_synchronize_rcu_common();
 	return gp_seq;
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
 
+/**
+ * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full()
+ *
+ * Places the normal and expedited grace-period states in *@rgos.  This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed grace period is not already slated to start, notifies
+ * RCU core of the need for that grace period.
+ *
+ * Interrupts must be enabled for the case where it is necessary to awaken
+ * the grace-period kthread.
+ */
+void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	get_state_synchronize_rcu_full(rgosp);
+
+	start_poll_synchronize_rcu_common();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
+
 /**
  * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
  *
-- 
cgit v1.2.3


From 6c502b14ba66da0670a59e20354469fa56eab26c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Aug 2022 12:38:51 -0700
Subject: rcu: Add full-sized polling for start_poll_expedited()

The start_poll_synchronize_rcu_expedited() API compresses the combined
expedited and normal grace-period states into a single unsigned long,
which conserves storage, but can miss grace periods in certain cases
involving overlapping normal and expedited grace periods.  Missing the
occasional grace period is usually not a problem, but there are use
cases that care about each and every grace period.

This commit therefore adds yet another member of the
full-state RCU grace-period polling API, which is the
start_poll_synchronize_rcu_expedited_full() function.  This uses up to
three times the storage (rcu_gp_oldstate structure instead of unsigned
long), but is guaranteed not to miss grace periods.

[ paulmck: Apply feedback from kernel test robot and Julia Lawall. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  5 +++++
 include/linux/rcutree.h |  1 +
 kernel/rcu/rcutorture.c | 51 +++++++++++++++++++++++++++++++++++++++----------
 kernel/rcu/tree_exp.h   | 18 +++++++++++++++++
 4 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 6bc30e46a819..653e35777a99 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -49,6 +49,11 @@ static inline unsigned long start_poll_synchronize_rcu_expedited(void)
 	return start_poll_synchronize_rcu();
 }
 
+static inline void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+	rgosp->rgos_norm = start_poll_synchronize_rcu_expedited();
+}
+
 static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
 {
 	cond_synchronize_rcu(oldstate);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 8f2e0f0b26f6..7151fd861736 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -48,6 +48,7 @@ struct rcu_gp_oldstate {
 };
 
 unsigned long start_poll_synchronize_rcu_expedited(void);
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
 unsigned long get_state_synchronize_rcu(void);
 void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 68387ccc7ddf..f9ca33555deb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -89,6 +89,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primit
 torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
 torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
 torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
+torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
 torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
 torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
 torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -201,12 +202,14 @@ static int rcu_torture_writer_state;
 #define RTWS_POLL_GET		9
 #define RTWS_POLL_GET_FULL	10
 #define RTWS_POLL_GET_EXP	11
-#define RTWS_POLL_WAIT		12
-#define RTWS_POLL_WAIT_FULL	13
-#define RTWS_POLL_WAIT_EXP	14
-#define RTWS_SYNC		15
-#define RTWS_STUTTER		16
-#define RTWS_STOPPING		17
+#define RTWS_POLL_GET_EXP_FULL	12
+#define RTWS_POLL_WAIT		13
+#define RTWS_POLL_WAIT_FULL	14
+#define RTWS_POLL_WAIT_EXP	15
+#define RTWS_POLL_WAIT_EXP_FULL	16
+#define RTWS_SYNC		17
+#define RTWS_STUTTER		18
+#define RTWS_STOPPING		19
 static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_FIXED_DELAY",
 	"RTWS_DELAY",
@@ -220,9 +223,11 @@ static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_POLL_GET",
 	"RTWS_POLL_GET_FULL",
 	"RTWS_POLL_GET_EXP",
+	"RTWS_POLL_GET_EXP_FULL",
 	"RTWS_POLL_WAIT",
 	"RTWS_POLL_WAIT_FULL",
 	"RTWS_POLL_WAIT_EXP",
+	"RTWS_POLL_WAIT_EXP_FULL",
 	"RTWS_SYNC",
 	"RTWS_STUTTER",
 	"RTWS_STOPPING",
@@ -337,6 +342,7 @@ struct rcu_torture_ops {
 	void (*exp_sync)(void);
 	unsigned long (*get_gp_state_exp)(void);
 	unsigned long (*start_gp_poll_exp)(void);
+	void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
 	bool (*poll_gp_state_exp)(unsigned long oldstate);
 	void (*cond_sync_exp)(unsigned long oldstate);
 	unsigned long (*get_gp_state)(void);
@@ -528,6 +534,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.cond_sync		= cond_synchronize_rcu,
 	.get_gp_state_exp	= get_state_synchronize_rcu,
 	.start_gp_poll_exp	= start_poll_synchronize_rcu_expedited,
+	.start_gp_poll_exp_full	= start_poll_synchronize_rcu_expedited_full,
 	.poll_gp_state_exp	= poll_state_synchronize_rcu,
 	.cond_sync_exp		= cond_synchronize_rcu_expedited,
 	.call			= call_rcu,
@@ -1169,13 +1176,14 @@ static int nsynctypes;
 static void rcu_torture_write_types(void)
 {
 	bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
-	bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
-	bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
+	bool gp_poll_exp1 = gp_poll_exp, gp_poll_exp_full1 = gp_poll_exp_full;
+	bool gp_normal1 = gp_normal, gp_poll1 = gp_poll, gp_poll_full1 = gp_poll_full;
+	bool gp_sync1 = gp_sync;
 
 	/* Initialize synctype[] array.  If none set, take default. */
-	if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp &&
+	if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && !gp_poll_exp_full1 &&
 	    !gp_normal1 && !gp_poll1 && !gp_poll_full1 && !gp_sync1)
-		gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 =
+		gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = gp_poll_exp_full1 =
 			   gp_normal1 = gp_poll1 = gp_poll_full1 = gp_sync1 = true;
 	if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
 		synctype[nsynctypes++] = RTWS_COND_GET;
@@ -1219,6 +1227,13 @@ static void rcu_torture_write_types(void)
 	} else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) {
 		pr_alert("%s: gp_poll_exp without primitives.\n", __func__);
 	}
+	if (gp_poll_exp_full1 && cur_ops->start_gp_poll_exp_full && cur_ops->poll_gp_state_full) {
+		synctype[nsynctypes++] = RTWS_POLL_GET_EXP_FULL;
+		pr_info("%s: Testing polling full-state expedited GPs.\n", __func__);
+	} else if (gp_poll_exp_full &&
+		   (!cur_ops->start_gp_poll_exp_full || !cur_ops->poll_gp_state_full)) {
+		pr_alert("%s: gp_poll_exp_full without primitives.\n", __func__);
+	}
 	if (gp_sync1 && cur_ops->sync) {
 		synctype[nsynctypes++] = RTWS_SYNC;
 		pr_info("%s: Testing normal GPs.\n", __func__);
@@ -1408,6 +1423,15 @@ rcu_torture_writer(void *arg)
 								  &rand);
 				rcu_torture_pipe_update(old_rp);
 				break;
+			case RTWS_POLL_GET_EXP_FULL:
+				rcu_torture_writer_state = RTWS_POLL_GET_EXP_FULL;
+				cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+				rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
+				while (!cur_ops->poll_gp_state_full(&gp_snap_full))
+					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+								  &rand);
+				rcu_torture_pipe_update(old_rp);
+				break;
 			case RTWS_SYNC:
 				rcu_torture_writer_state = RTWS_SYNC;
 				do_rtws_sync(&rand, cur_ops->sync);
@@ -1537,6 +1561,13 @@ rcu_torture_fakewriter(void *arg)
 								  &rand);
 				}
 				break;
+			case RTWS_POLL_GET_EXP_FULL:
+				cur_ops->start_gp_poll_exp_full(&gp_snap_full);
+				while (!cur_ops->poll_gp_state_full(&gp_snap_full)) {
+					torture_hrtimeout_jiffies(torture_random(&rand) % 16,
+								  &rand);
+				}
+				break;
 			case RTWS_SYNC:
 				cur_ops->sync();
 				break;
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index be667583a554..18128ee0d36c 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1027,6 +1027,24 @@ unsigned long start_poll_synchronize_rcu_expedited(void)
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited);
 
+/**
+ * start_poll_synchronize_rcu_expedited_full - Take a full snapshot and start expedited grace period
+ * @rgosp: Place to put snapshot of grace-period state
+ *
+ * Places the normal and expedited grace-period states in rgosp.  This
+ * state value can be passed to a later call to cond_synchronize_rcu_full()
+ * or poll_state_synchronize_rcu_full() to determine whether or not a
+ * grace period (whether normal or expedited) has elapsed in the meantime.
+ * If the needed expedited grace period is not already slated to start,
+ * initiates that grace period.
+ */
+void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+	get_state_synchronize_rcu_full(rgosp);
+	(void)start_poll_synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited_full);
+
 /**
  * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period
  *
-- 
cgit v1.2.3


From f21e014345e0abf11fdc2e59fb6eb6d6aa6ae4eb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Aug 2022 16:57:47 -0700
Subject: rcu: Remove blank line from poll_state_synchronize_rcu() docbook
 header

This commit removes the blank line preceding the oldstate parameter to
the docbook header for the poll_state_synchronize_rcu() function and
marks uses of this parameter later in that header.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 89572385fd1a..0a24ef4d6b82 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3662,11 +3662,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
 
 /**
  * poll_state_synchronize_rcu - Has the specified RCU grace period completed?
- *
  * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
  *
  * If a full RCU grace period has elapsed since the earlier call from
- * which oldstate was obtained, return @true, otherwise return @false.
+ * which @oldstate was obtained, return @true, otherwise return @false.
  * If @false is returned, it is the caller's responsibility to invoke this
  * function later on until it does return @true.  Alternatively, the caller
  * can explicitly wait for a grace period, for example, by passing @oldstate
@@ -3675,7 +3674,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_full);
  * Yes, this function does not take counter wrap into account.
  * But counter wrap is harmless.  If the counter wraps, we have waited for
  * more than a billion grace periods (and way more on a 64-bit system!).
- * Those needing to keep oldstate values for very long time periods
+ * Those needing to keep old state values for very long time periods
  * (many hours even on 32-bit systems) should check them occasionally and
  * either refresh them or set a flag indicating that the grace period has
  * completed.  Alternatively, they can use get_completed_synchronize_rcu()
-- 
cgit v1.2.3


From b6fe4917ae4353b397079902cb024ae01f20dfb2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 13:46:05 -0700
Subject: rcu: Add full-sized polling for cond_sync_full()

The cond_synchronize_rcu() API compresses the combined expedited and
normal grace-period states into a single unsigned long, which conserves
storage, but can miss grace periods in certain cases involving overlapping
normal and expedited grace periods.  Missing the occasional grace period
is usually not a problem, but there are use cases that care about each
and every grace period.

This commit therefore adds yet another member of the full-state RCU
grace-period polling API, which is the cond_synchronize_rcu_full()
function.  This uses up to three times the storage (rcu_gp_oldstate
structure instead of unsigned long), but is guaranteed not to miss
grace periods.

[ paulmck: Apply feedback from kernel test robot and Julia Lawall. ]

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  5 ++++
 include/linux/rcutree.h |  1 +
 kernel/rcu/rcutorture.c | 67 ++++++++++++++++++++++++++++++++++---------------
 kernel/rcu/tree.c       | 28 ++++++++++++++++++++-
 4 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 653e35777a99..3bee97f76bf4 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -44,6 +44,11 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
 	might_sleep();
 }
 
+static inline void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	cond_synchronize_rcu(rgosp->rgos_norm);
+}
+
 static inline unsigned long start_poll_synchronize_rcu_expedited(void)
 {
 	return start_poll_synchronize_rcu();
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 7151fd861736..1b44288c027d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -57,6 +57,7 @@ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 bool poll_state_synchronize_rcu(unsigned long oldstate);
 bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 void cond_synchronize_rcu(unsigned long oldstate);
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 
 bool rcu_is_idle_cpu(int cpu);
 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f9ca33555deb..9d22161bf770 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -84,6 +84,7 @@ torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress test
 torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()");
 torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
 torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
+torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
 torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
@@ -196,20 +197,22 @@ static int rcu_torture_writer_state;
 #define RTWS_DEF_FREE		3
 #define RTWS_EXP_SYNC		4
 #define RTWS_COND_GET		5
-#define RTWS_COND_GET_EXP	6
-#define RTWS_COND_SYNC		7
-#define RTWS_COND_SYNC_EXP	8
-#define RTWS_POLL_GET		9
-#define RTWS_POLL_GET_FULL	10
-#define RTWS_POLL_GET_EXP	11
-#define RTWS_POLL_GET_EXP_FULL	12
-#define RTWS_POLL_WAIT		13
-#define RTWS_POLL_WAIT_FULL	14
-#define RTWS_POLL_WAIT_EXP	15
-#define RTWS_POLL_WAIT_EXP_FULL	16
-#define RTWS_SYNC		17
-#define RTWS_STUTTER		18
-#define RTWS_STOPPING		19
+#define RTWS_COND_GET_FULL	6
+#define RTWS_COND_GET_EXP	7
+#define RTWS_COND_SYNC		8
+#define RTWS_COND_SYNC_FULL	9
+#define RTWS_COND_SYNC_EXP	10
+#define RTWS_POLL_GET		11
+#define RTWS_POLL_GET_FULL	12
+#define RTWS_POLL_GET_EXP	13
+#define RTWS_POLL_GET_EXP_FULL	14
+#define RTWS_POLL_WAIT		15
+#define RTWS_POLL_WAIT_FULL	16
+#define RTWS_POLL_WAIT_EXP	17
+#define RTWS_POLL_WAIT_EXP_FULL	18
+#define RTWS_SYNC		19
+#define RTWS_STUTTER		20
+#define RTWS_STOPPING		21
 static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_FIXED_DELAY",
 	"RTWS_DELAY",
@@ -217,8 +220,10 @@ static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_DEF_FREE",
 	"RTWS_EXP_SYNC",
 	"RTWS_COND_GET",
+	"RTWS_COND_GET_FULL",
 	"RTWS_COND_GET_EXP",
 	"RTWS_COND_SYNC",
+	"RTWS_COND_SYNC_FULL",
 	"RTWS_COND_SYNC_EXP",
 	"RTWS_POLL_GET",
 	"RTWS_POLL_GET_FULL",
@@ -355,6 +360,7 @@ struct rcu_torture_ops {
 	bool (*poll_gp_state_full)(struct rcu_gp_oldstate *rgosp);
 	bool (*poll_need_2gp)(bool poll, bool poll_full);
 	void (*cond_sync)(unsigned long oldstate);
+	void (*cond_sync_full)(struct rcu_gp_oldstate *rgosp);
 	call_rcu_func_t call;
 	void (*cb_barrier)(void);
 	void (*fqs)(void);
@@ -532,6 +538,7 @@ static struct rcu_torture_ops rcu_ops = {
 	.poll_gp_state_full	= poll_state_synchronize_rcu_full,
 	.poll_need_2gp		= rcu_poll_need_2gp,
 	.cond_sync		= cond_synchronize_rcu,
+	.cond_sync_full		= cond_synchronize_rcu_full,
 	.get_gp_state_exp	= get_state_synchronize_rcu,
 	.start_gp_poll_exp	= start_poll_synchronize_rcu_expedited,
 	.start_gp_poll_exp_full	= start_poll_synchronize_rcu_expedited_full,
@@ -1175,16 +1182,17 @@ static int nsynctypes;
  */
 static void rcu_torture_write_types(void)
 {
-	bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp;
-	bool gp_poll_exp1 = gp_poll_exp, gp_poll_exp_full1 = gp_poll_exp_full;
+	bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
+	bool gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp, gp_poll_exp_full1 = gp_poll_exp_full;
 	bool gp_normal1 = gp_normal, gp_poll1 = gp_poll, gp_poll_full1 = gp_poll_full;
 	bool gp_sync1 = gp_sync;
 
 	/* Initialize synctype[] array.  If none set, take default. */
-	if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && !gp_poll_exp_full1 &&
-	    !gp_normal1 && !gp_poll1 && !gp_poll_full1 && !gp_sync1)
-		gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = gp_poll_exp_full1 =
-			   gp_normal1 = gp_poll1 = gp_poll_full1 = gp_sync1 = true;
+	if (!gp_cond1 && !gp_cond_exp1 && !gp_cond_full1 && !gp_exp1 && !gp_poll_exp &&
+	    !gp_poll_exp_full1 && !gp_normal1 && !gp_poll1 && !gp_poll_full1 && !gp_sync1)
+		gp_cond1 = gp_cond_exp1 = gp_cond_full1 = gp_exp1 = gp_poll_exp1 =
+			   gp_poll_exp_full1 = gp_normal1 = gp_poll1 = gp_poll_full1 =
+			   gp_sync1 = true;
 	if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
 		synctype[nsynctypes++] = RTWS_COND_GET;
 		pr_info("%s: Testing conditional GPs.\n", __func__);
@@ -1197,6 +1205,12 @@ static void rcu_torture_write_types(void)
 	} else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) {
 		pr_alert("%s: gp_cond_exp without primitives.\n", __func__);
 	}
+	if (gp_cond_full1 && cur_ops->get_gp_state && cur_ops->cond_sync_full) {
+		synctype[nsynctypes++] = RTWS_COND_GET_FULL;
+		pr_info("%s: Testing conditional full-state GPs.\n", __func__);
+	} else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
+		pr_alert("%s: gp_cond_full without primitives.\n", __func__);
+	}
 	if (gp_exp1 && cur_ops->exp_sync) {
 		synctype[nsynctypes++] = RTWS_EXP_SYNC;
 		pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1396,6 +1410,14 @@ rcu_torture_writer(void *arg)
 				cur_ops->cond_sync_exp(gp_snap);
 				rcu_torture_pipe_update(old_rp);
 				break;
+			case RTWS_COND_GET_FULL:
+				rcu_torture_writer_state = RTWS_COND_GET_FULL;
+				cur_ops->get_gp_state_full(&gp_snap_full);
+				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+				rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
+				cur_ops->cond_sync_full(&gp_snap_full);
+				rcu_torture_pipe_update(old_rp);
+				break;
 			case RTWS_POLL_GET:
 				rcu_torture_writer_state = RTWS_POLL_GET;
 				gp_snap = cur_ops->start_gp_poll();
@@ -1540,6 +1562,11 @@ rcu_torture_fakewriter(void *arg)
 				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
 				cur_ops->cond_sync_exp(gp_snap);
 				break;
+			case RTWS_COND_GET_FULL:
+				cur_ops->get_gp_state_full(&gp_snap_full);
+				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+				cur_ops->cond_sync_full(&gp_snap_full);
+				break;
 			case RTWS_POLL_GET:
 				gp_snap = cur_ops->start_gp_poll();
 				while (!cur_ops->poll_gp_state(gp_snap)) {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0a24ef4d6b82..5c46c0d34ef0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3749,7 +3749,6 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu_full);
 
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
- *
  * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
  *
  * If a full RCU grace period has elapsed since the earlier call to
@@ -3773,6 +3772,33 @@ void cond_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+/**
+ * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return.  Otherwise, invoke synchronize_rcu() to wait
+ * for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
+{
+	if (!poll_state_synchronize_rcu_full(rgosp))
+		synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_full);
+
 /*
  * Check to see if there is any immediate RCU-related work to be done by
  * the current CPU, returning 1 if so and zero otherwise.  The checks are
-- 
cgit v1.2.3


From 8df13f01608ea48712956c0b1afce35bdba5a1c5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 15:23:26 -0700
Subject: rcu: Add full-sized polling for cond_sync_exp_full()

The cond_synchronize_rcu_expedited() API compresses the combined expedited and
normal grace-period states into a single unsigned long, which conserves
storage, but can miss grace periods in certain cases involving overlapping
normal and expedited grace periods.  Missing the occasional grace period
is usually not a problem, but there are use cases that care about each
and every grace period.

This commit therefore adds yet another member of the full-state RCU
grace-period polling API, which is the cond_synchronize_rcu_exp_full()
function.  This uses up to three times the storage (rcu_gp_oldstate
structure instead of unsigned long), but is guaranteed not to miss
grace periods.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h |  5 ++++
 include/linux/rcutree.h |  1 +
 kernel/rcu/rcutorture.c | 72 ++++++++++++++++++++++++++++++++++---------------
 kernel/rcu/tree_exp.h   | 27 +++++++++++++++++++
 4 files changed, 83 insertions(+), 22 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 3bee97f76bf4..4405e9112cee 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -64,6 +64,11 @@ static inline void cond_synchronize_rcu_expedited(unsigned long oldstate)
 	cond_synchronize_rcu(oldstate);
 }
 
+static inline void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+	cond_synchronize_rcu_expedited(rgosp->rgos_norm);
+}
+
 extern void rcu_barrier(void);
 
 static inline void synchronize_rcu_expedited(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 1b44288c027d..755b082f4ec6 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -50,6 +50,7 @@ struct rcu_gp_oldstate {
 unsigned long start_poll_synchronize_rcu_expedited(void);
 void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
 unsigned long get_state_synchronize_rcu(void);
 void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 unsigned long start_poll_synchronize_rcu(void);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 9d22161bf770..8995429c6f1c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -85,6 +85,8 @@ torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind ne
 torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives");
 torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives");
 torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
+torture_param(bool, gp_cond_exp_full, false,
+		    "Use conditional/async full-stateexpedited GP wait primitives");
 torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
 torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
 torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
@@ -199,20 +201,22 @@ static int rcu_torture_writer_state;
 #define RTWS_COND_GET		5
 #define RTWS_COND_GET_FULL	6
 #define RTWS_COND_GET_EXP	7
-#define RTWS_COND_SYNC		8
-#define RTWS_COND_SYNC_FULL	9
-#define RTWS_COND_SYNC_EXP	10
-#define RTWS_POLL_GET		11
-#define RTWS_POLL_GET_FULL	12
-#define RTWS_POLL_GET_EXP	13
-#define RTWS_POLL_GET_EXP_FULL	14
-#define RTWS_POLL_WAIT		15
-#define RTWS_POLL_WAIT_FULL	16
-#define RTWS_POLL_WAIT_EXP	17
-#define RTWS_POLL_WAIT_EXP_FULL	18
-#define RTWS_SYNC		19
-#define RTWS_STUTTER		20
-#define RTWS_STOPPING		21
+#define RTWS_COND_GET_EXP_FULL	8
+#define RTWS_COND_SYNC		9
+#define RTWS_COND_SYNC_FULL	10
+#define RTWS_COND_SYNC_EXP	11
+#define RTWS_COND_SYNC_EXP_FULL	12
+#define RTWS_POLL_GET		13
+#define RTWS_POLL_GET_FULL	14
+#define RTWS_POLL_GET_EXP	15
+#define RTWS_POLL_GET_EXP_FULL	16
+#define RTWS_POLL_WAIT		17
+#define RTWS_POLL_WAIT_FULL	18
+#define RTWS_POLL_WAIT_EXP	19
+#define RTWS_POLL_WAIT_EXP_FULL	20
+#define RTWS_SYNC		21
+#define RTWS_STUTTER		22
+#define RTWS_STOPPING		23
 static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_FIXED_DELAY",
 	"RTWS_DELAY",
@@ -222,9 +226,11 @@ static const char * const rcu_torture_writer_state_names[] = {
 	"RTWS_COND_GET",
 	"RTWS_COND_GET_FULL",
 	"RTWS_COND_GET_EXP",
+	"RTWS_COND_GET_EXP_FULL",
 	"RTWS_COND_SYNC",
 	"RTWS_COND_SYNC_FULL",
 	"RTWS_COND_SYNC_EXP",
+	"RTWS_COND_SYNC_EXP_FULL",
 	"RTWS_POLL_GET",
 	"RTWS_POLL_GET_FULL",
 	"RTWS_POLL_GET_EXP",
@@ -350,6 +356,7 @@ struct rcu_torture_ops {
 	void (*start_gp_poll_exp_full)(struct rcu_gp_oldstate *rgosp);
 	bool (*poll_gp_state_exp)(unsigned long oldstate);
 	void (*cond_sync_exp)(unsigned long oldstate);
+	void (*cond_sync_exp_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*get_gp_state)(void);
 	void (*get_gp_state_full)(struct rcu_gp_oldstate *rgosp);
 	unsigned long (*get_gp_completed)(void);
@@ -1183,16 +1190,17 @@ static int nsynctypes;
 static void rcu_torture_write_types(void)
 {
 	bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_cond_full1 = gp_cond_full;
-	bool gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp, gp_poll_exp_full1 = gp_poll_exp_full;
-	bool gp_normal1 = gp_normal, gp_poll1 = gp_poll, gp_poll_full1 = gp_poll_full;
-	bool gp_sync1 = gp_sync;
+	bool gp_cond_exp_full1 = gp_cond_exp_full, gp_exp1 = gp_exp, gp_poll_exp1 = gp_poll_exp;
+	bool gp_poll_exp_full1 = gp_poll_exp_full, gp_normal1 = gp_normal, gp_poll1 = gp_poll;
+	bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
 
 	/* Initialize synctype[] array.  If none set, take default. */
-	if (!gp_cond1 && !gp_cond_exp1 && !gp_cond_full1 && !gp_exp1 && !gp_poll_exp &&
-	    !gp_poll_exp_full1 && !gp_normal1 && !gp_poll1 && !gp_poll_full1 && !gp_sync1)
-		gp_cond1 = gp_cond_exp1 = gp_cond_full1 = gp_exp1 = gp_poll_exp1 =
-			   gp_poll_exp_full1 = gp_normal1 = gp_poll1 = gp_poll_full1 =
-			   gp_sync1 = true;
+	if (!gp_cond1 && !gp_cond_exp1 && !gp_cond_full1 && !gp_cond_exp_full1 && !gp_exp1 &&
+	    !gp_poll_exp && !gp_poll_exp_full1 && !gp_normal1 && !gp_poll1 && !gp_poll_full1 &&
+	    !gp_sync1)
+		gp_cond1 = gp_cond_exp1 = gp_cond_full1 = gp_cond_exp_full1 = gp_exp1 =
+			   gp_poll_exp1 = gp_poll_exp_full1 = gp_normal1 = gp_poll1 =
+			   gp_poll_full1 = gp_sync1 = true;
 	if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
 		synctype[nsynctypes++] = RTWS_COND_GET;
 		pr_info("%s: Testing conditional GPs.\n", __func__);
@@ -1211,6 +1219,13 @@ static void rcu_torture_write_types(void)
 	} else if (gp_cond_full && (!cur_ops->get_gp_state || !cur_ops->cond_sync_full)) {
 		pr_alert("%s: gp_cond_full without primitives.\n", __func__);
 	}
+	if (gp_cond_exp_full1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp_full) {
+		synctype[nsynctypes++] = RTWS_COND_GET_EXP_FULL;
+		pr_info("%s: Testing conditional full-state expedited GPs.\n", __func__);
+	} else if (gp_cond_exp_full &&
+		   (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp_full)) {
+		pr_alert("%s: gp_cond_exp_full without primitives.\n", __func__);
+	}
 	if (gp_exp1 && cur_ops->exp_sync) {
 		synctype[nsynctypes++] = RTWS_EXP_SYNC;
 		pr_info("%s: Testing expedited GPs.\n", __func__);
@@ -1418,6 +1433,14 @@ rcu_torture_writer(void *arg)
 				cur_ops->cond_sync_full(&gp_snap_full);
 				rcu_torture_pipe_update(old_rp);
 				break;
+			case RTWS_COND_GET_EXP_FULL:
+				rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
+				cur_ops->get_gp_state_full(&gp_snap_full);
+				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+				rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
+				cur_ops->cond_sync_exp_full(&gp_snap_full);
+				rcu_torture_pipe_update(old_rp);
+				break;
 			case RTWS_POLL_GET:
 				rcu_torture_writer_state = RTWS_POLL_GET;
 				gp_snap = cur_ops->start_gp_poll();
@@ -1567,6 +1590,11 @@ rcu_torture_fakewriter(void *arg)
 				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
 				cur_ops->cond_sync_full(&gp_snap_full);
 				break;
+			case RTWS_COND_GET_EXP_FULL:
+				cur_ops->get_gp_state_full(&gp_snap_full);
+				torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+				cur_ops->cond_sync_exp_full(&gp_snap_full);
+				break;
 			case RTWS_POLL_GET:
 				gp_snap = cur_ops->start_gp_poll();
 				while (!cur_ops->poll_gp_state(gp_snap)) {
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 18128ee0d36c..9c0ae834ef07 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -1071,3 +1071,30 @@ void cond_synchronize_rcu_expedited(unsigned long oldstate)
 		synchronize_rcu_expedited();
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited);
+
+/**
+ * cond_synchronize_rcu_expedited_full - Conditionally wait for an expedited RCU grace period
+ * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full()
+ *
+ * If a full RCU grace period has elapsed since the call to
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was
+ * obtained, just return.  Otherwise, invoke synchronize_rcu_expedited()
+ * to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for a couple of additional grace periods should be just fine.
+ *
+ * This function provides the same memory-ordering guarantees that
+ * would be provided by a synchronize_rcu() that was invoked at the call
+ * to the function that provided @rgosp and that returned at the end of
+ * this function.
+ */
+void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp)
+{
+	if (!poll_state_synchronize_rcu_full(rgosp))
+		synchronize_rcu_expedited();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited_full);
-- 
cgit v1.2.3


From 258f887aba60c8fc7946a9f379f9a3889f92fc85 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 16:07:04 -0700
Subject: rcu: Disable run-time single-CPU grace-period optimization

The run-time single-CPU grace-period optimization applies only to
kernels built with CONFIG_SMP=y && CONFIG_PREEMPTION=y that are running
on a single-CPU system.  But a kernel intended for a single-CPU system
should instead be built with CONFIG_SMP=n, and in any case, single-CPU
systems running Linux no longer appear to be the common case.  Plus this
optimization results in the rcu_gp_oldstate structure being half again
larger than it needs to be.

This commit therefore disables the run-time single-CPU grace-period
optimization, so that this optimization applies only during the
pre-scheduler portion of the boot sequence.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 40 +++++++++-------------------------------
 1 file changed, 9 insertions(+), 31 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5c46c0d34ef0..5c4ec9dd4ce7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3423,42 +3423,20 @@ void __init kfree_rcu_scheduler_running(void)
 
 /*
  * During early boot, any blocking grace-period wait automatically
- * implies a grace period.  Later on, this is never the case for PREEMPTION.
+ * implies a grace period.
  *
- * However, because a context switch is a grace period for !PREEMPTION, any
- * blocking grace-period wait automatically implies a grace period if
- * there is only one CPU online at any point time during execution of
- * either synchronize_rcu() or synchronize_rcu_expedited().  It is OK to
- * occasionally incorrectly indicate that there are multiple CPUs online
- * when there was in fact only one the whole time, as this just adds some
- * overhead: RCU still operates correctly.
+ * Later on, this could in theory be the case for kernels built with
+ * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this
+ * is not a common case.  Furthermore, this optimization would cause
+ * the rcu_gp_oldstate structure to expand by 50%, so this potential
+ * grace-period optimization is ignored once the scheduler is running.
  */
 static int rcu_blocking_is_gp(void)
 {
-	int ret;
-
-	// Invoking preempt_model_*() too early gets a splat.
-	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE ||
-	    preempt_model_full() || preempt_model_rt())
-		return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
+	if (rcu_scheduler_active != RCU_SCHEDULER_INACTIVE)
+		return false;
 	might_sleep();  /* Check for RCU read-side critical section. */
-	preempt_disable();
-	/*
-	 * If the rcu_state.n_online_cpus counter is equal to one,
-	 * there is only one CPU, and that CPU sees all prior accesses
-	 * made by any CPU that was online at the time of its access.
-	 * Furthermore, if this counter is equal to one, its value cannot
-	 * change until after the preempt_enable() below.
-	 *
-	 * Furthermore, if rcu_state.n_online_cpus is equal to one here,
-	 * all later CPUs (both this one and any that come online later
-	 * on) are guaranteed to see all accesses prior to this point
-	 * in the code, without the need for additional memory barriers.
-	 * Those memory barriers are provided by CPU-hotplug code.
-	 */
-	ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
-	preempt_enable();
-	return ret;
+	return true;
 }
 
 /**
-- 
cgit v1.2.3


From a5d1b0b68a62afb1bce0b36cc9a1875acf8a6dff Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 17:01:55 -0700
Subject: rcu: Set rcu_data structures' initial ->gpwrap value to true

It would be good do reduce the size of the rcu_gp_oldstate structure
from three unsigned long instances to two, but this requires that the
boot-time optimized grace periods update the various ->gp_seq fields.
Updating these fields in the rcu_state structure and in all of the
rcu_node structures is at least semi-reasonable, but updating them in
all of the rcu_data structures is a bridge too far.  This means that if
there are too many early boot-time grace periods, the ->gp_seq field in
the rcu_data structure cannot be trusted.  This commit therefore sets
each rcu_data structure's ->gpwrap field to provide the necessary impetus
for a suitable level of distrust.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5c4ec9dd4ce7..03b089184b37 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -76,6 +76,7 @@
 /* Data structures. */
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
+	.gpwrap = true,
 #ifdef CONFIG_RCU_NOCB_CPU
 	.cblist.flags = SEGCBLIST_RCU_CORE,
 #endif
-- 
cgit v1.2.3


From 5f11bad6b7228858e06729de6dd4079dfc082648 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 17:16:24 -0700
Subject: rcu-tasks: Remove grace-period fast-path rcu-tasks helper

Now that the grace-period fast path can only happen during the
pre-scheduler portion of early boot, this fast path can no longer block
run-time RCU Tasks and RCU Tasks Trace grace periods.  This commit
therefore removes the conditional cond_resched_tasks_rcu_qs() invocation.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 03b089184b37..0ff7d5eaa376 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3492,8 +3492,6 @@ void synchronize_rcu(void)
 		// which allows reuse of ->gp_seq_polled_snap.
 		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
 		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
-		if (rcu_init_invoked())
-			cond_resched_tasks_rcu_qs();
 		return;  // Context allows vacuous grace periods.
 	}
 	if (rcu_gp_is_expedited())
-- 
cgit v1.2.3


From 910e12092eac8a9f19b507ed0fdc1c21d8da9483 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 17:28:01 -0700
Subject: rcu: Make synchronize_rcu() fast path update ->gp_seq counters

This commit causes the early boot single-CPU synchronize_rcu() fastpath to
update the rcu_state and rcu_node structures' ->gp_seq and ->gp_seq_needed
counters.  This will allow the full-state polled grace-period APIs to
detect all normal grace periods without the need to track the special
combined polling-only counter, which is a step towards removing the
->rgos_polled field from the rcu_gp_oldstate, thereby reducing its size
by one third.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0ff7d5eaa376..8fa5ec0f3d11 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3480,24 +3480,37 @@ static int rcu_blocking_is_gp(void)
  */
 void synchronize_rcu(void)
 {
+	unsigned long flags;
+	struct rcu_node *rnp;
+
 	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu() in RCU read-side critical section");
-	if (rcu_blocking_is_gp()) {
-		// Note well that this code runs with !PREEMPT && !SMP.
-		// In addition, all code that advances grace periods runs at
-		// process level.  Therefore, this normal GP overlaps with
-		// other normal GPs only by being fully nested within them,
-		// which allows reuse of ->gp_seq_polled_snap.
-		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
-		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
-		return;  // Context allows vacuous grace periods.
+	if (!rcu_blocking_is_gp()) {
+		if (rcu_gp_is_expedited())
+			synchronize_rcu_expedited();
+		else
+			wait_rcu_gp(call_rcu);
+		return;
 	}
-	if (rcu_gp_is_expedited())
-		synchronize_rcu_expedited();
-	else
-		wait_rcu_gp(call_rcu);
+
+	// Context allows vacuous grace periods.
+	// Note well that this code runs with !PREEMPT && !SMP.
+	// In addition, all code that advances grace periods runs at
+	// process level.  Therefore, this normal GP overlaps with other
+	// normal GPs only by being fully nested within them, which allows
+	// reuse of ->gp_seq_polled_snap.
+	rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+	rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+
+	// Update normal grace-period counters to record grace period.
+	local_irq_save(flags);
+	WARN_ON_ONCE(num_online_cpus() > 1);
+	rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
+	rcu_for_each_node_breadth_first(rnp)
+		rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
 
-- 
cgit v1.2.3


From e8755d2bde7c296026fe5d47f1b75c7b19ba46fd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 17:34:30 -0700
Subject: rcu: Remove expedited grace-period fast-path forward-progress helper

Now that the expedited grace-period fast path can only happen during
the pre-scheduler portion of early boot, this fast path can no longer
block run-time RCU Trace grace periods.  This commit therefore removes
the conditional cond_resched() invocation.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 9c0ae834ef07..1a51f9301ebf 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -924,8 +924,6 @@ void synchronize_rcu_expedited(void)
 		// them, which allows reuse of ->gp_seq_polled_exp_snap.
 		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
 		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
-		if (rcu_init_invoked())
-			cond_resched();
 		return;  // Context allows vacuous grace periods.
 	}
 
-- 
cgit v1.2.3


From 43ff97cc997f5641127152f97e1fd0fc9fa060f6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 17:43:53 -0700
Subject: rcu: Make synchronize_rcu_expedited() fast path update
 .expedited_sequence

This commit causes the early boot single-CPU synchronize_rcu_expedited()
fastpath to update the rcu_state structure's ->expedited_sequence
counter.  This will allow the full-state polled grace-period APIs to
detect all expedited grace periods without the need to track the special
combined polling-only counter, which is another step towards removing
the ->rgos_polled field from the rcu_gp_oldstate, thereby reducing its
size by one third.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 1a51f9301ebf..54e05d13d151 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -906,6 +906,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
 void synchronize_rcu_expedited(void)
 {
 	bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT);
+	unsigned long flags;
 	struct rcu_exp_work rew;
 	struct rcu_node *rnp;
 	unsigned long s;
@@ -924,6 +925,11 @@ void synchronize_rcu_expedited(void)
 		// them, which allows reuse of ->gp_seq_polled_exp_snap.
 		rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap);
 		rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap);
+
+		local_irq_save(flags);
+		WARN_ON_ONCE(num_online_cpus() > 1);
+		rcu_state.expedited_sequence += (1 << RCU_SEQ_CTR_SHIFT);
+		local_irq_restore(flags);
 		return;  // Context allows vacuous grace periods.
 	}
 
-- 
cgit v1.2.3


From 7ecef0871dd9a879038dbe8a681ab48bd0c92988 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 17:54:53 -0700
Subject: rcu: Remove ->rgos_polled field from rcu_gp_oldstate structure

Because both normal and expedited grace periods increment their respective
counters on their pre-scheduler early boot fastpaths, the rcu_gp_oldstate
structure no longer needs its ->rgos_polled field.  This commit therefore
removes this field, shrinking this structure so that it is the same size
as an rcu_head structure.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutree.h | 1 -
 kernel/rcu/tree.c       | 6 +-----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 755b082f4ec6..455a03bdce15 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -44,7 +44,6 @@ bool rcu_gp_might_be_stalled(void);
 struct rcu_gp_oldstate {
 	unsigned long rgos_norm;
 	unsigned long rgos_exp;
-	unsigned long rgos_polled;
 };
 
 unsigned long start_poll_synchronize_rcu_expedited(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8fa5ec0f3d11..b9e8ed00536d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3526,7 +3526,6 @@ void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 {
 	rgosp->rgos_norm = RCU_GET_STATE_COMPLETED;
 	rgosp->rgos_exp = RCU_GET_STATE_COMPLETED;
-	rgosp->rgos_polled = RCU_GET_STATE_COMPLETED;
 }
 EXPORT_SYMBOL_GPL(get_completed_synchronize_rcu_full);
 
@@ -3575,7 +3574,6 @@ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 	smp_mb();  /* ^^^ */
 	rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
 	rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
-	rgosp->rgos_polled = rcu_seq_snap(&rcu_state.gp_seq_polled);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
 
@@ -3727,9 +3725,7 @@ bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
 	if (rgosp->rgos_norm == RCU_GET_STATE_COMPLETED ||
 	    rcu_seq_done_exact(&rnp->gp_seq, rgosp->rgos_norm) ||
 	    rgosp->rgos_exp == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp) ||
-	    rgosp->rgos_polled == RCU_GET_STATE_COMPLETED ||
-	    rcu_seq_done_exact(&rcu_state.gp_seq_polled, rgosp->rgos_polled)) {
+	    rcu_seq_done_exact(&rcu_state.expedited_sequence, rgosp->rgos_exp)) {
 		smp_mb(); /* Ensure GP ends before subsequent accesses. */
 		return true;
 	}
-- 
cgit v1.2.3


From b3cdd0a79c875d5e9cac9f6555485031ce5bea81 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 4 Aug 2022 18:05:46 -0700
Subject: rcutorture: Adjust rcu_poll_need_2gp() for rcu_gp_oldstate field
 removal

Now that rcu_gp_oldstate can accurately track both normal and
expedited grace periods regardless of system state, rcutorture's
rcu_poll_need_2gp() function need only call for a second grace period
for the old single-unsigned-long grace-period polling APIs
This commit therefore adjusts rcu_poll_need_2gp() accordingly.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8995429c6f1c..029de67a9da9 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -520,7 +520,7 @@ static void rcu_sync_torture_init(void)
 
 static bool rcu_poll_need_2gp(bool poll, bool poll_full)
 {
-	return poll || (!IS_ENABLED(CONFIG_TINY_RCU) && poll_full && num_online_cpus() <= 1);
+	return poll;
 }
 
 static struct rcu_torture_ops rcu_ops = {
-- 
cgit v1.2.3


From d761de8a7dcef8e8e9e20a543f85a2c079ae3d0d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 5 Aug 2022 15:42:25 -0700
Subject: rcu: Make synchronize_rcu() fastpath update only boot-CPU counters

Large systems can have hundreds of rcu_node structures, and updating
counters in each of them might slow down booting.  This commit therefore
updates only the counters in those rcu_node structures corresponding
to the boot CPU, up to and including the root rcu_node structure.

The counters for the remaining rcu_node structures are updated by the
rcu_scheduler_starting() function, which executes just before the first
non-boot kthread is spawned.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b9e8ed00536d..ef15bae3c7c7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3504,11 +3504,14 @@ void synchronize_rcu(void)
 	rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
 	rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
 
-	// Update normal grace-period counters to record grace period.
+	// Update the normal grace-period counters to record
+	// this grace period, but only those used by the boot CPU.
+	// The rcu_scheduler_starting() will take care of the rest of
+	// these counters.
 	local_irq_save(flags);
 	WARN_ON_ONCE(num_online_cpus() > 1);
 	rcu_state.gp_seq += (1 << RCU_SEQ_CTR_SHIFT);
-	rcu_for_each_node_breadth_first(rnp)
+	for (rnp = this_cpu_ptr(&rcu_data)->mynode; rnp; rnp = rnp->parent)
 		rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
 	local_irq_restore(flags);
 }
@@ -4456,9 +4459,20 @@ early_initcall(rcu_spawn_gp_kthread);
  */
 void rcu_scheduler_starting(void)
 {
+	unsigned long flags;
+	struct rcu_node *rnp;
+
 	WARN_ON(num_online_cpus() != 1);
 	WARN_ON(nr_context_switches() > 0);
 	rcu_test_sync_prims();
+
+	// Fix up the ->gp_seq counters.
+	local_irq_save(flags);
+	rcu_for_each_node_breadth_first(rnp)
+		rnp->gp_seq_needed = rnp->gp_seq = rcu_state.gp_seq;
+	local_irq_restore(flags);
+
+	// Switch out of early boot mode.
 	rcu_scheduler_active = RCU_SCHEDULER_INIT;
 	rcu_test_sync_prims();
 }
-- 
cgit v1.2.3


From cc8faf5b65997a994c77a0122b94240b82c95f23 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 22 Aug 2022 10:02:56 -0700
Subject: rcutorture: Use 1-suffixed variable in rcu_torture_write_types()
 check

This commit changes the use of gp_poll_exp to gp_poll_exp1 in the first
check in rcu_torture_write_types().  No functional effect, but consistency
is a good thing.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 029de67a9da9..71d1af9c060e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1196,7 +1196,7 @@ static void rcu_torture_write_types(void)
 
 	/* Initialize synctype[] array.  If none set, take default. */
 	if (!gp_cond1 && !gp_cond_exp1 && !gp_cond_full1 && !gp_cond_exp_full1 && !gp_exp1 &&
-	    !gp_poll_exp && !gp_poll_exp_full1 && !gp_normal1 && !gp_poll1 && !gp_poll_full1 &&
+	    !gp_poll_exp1 && !gp_poll_exp_full1 && !gp_normal1 && !gp_poll1 && !gp_poll_full1 &&
 	    !gp_sync1)
 		gp_cond1 = gp_cond_exp1 = gp_cond_full1 = gp_cond_exp_full1 = gp_exp1 =
 			   gp_poll_exp1 = gp_poll_exp_full1 = gp_normal1 = gp_poll1 =
-- 
cgit v1.2.3


From 5d7801f20170a50252b791c2ce05fb982616665b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 23 Aug 2022 12:11:12 -0700
Subject: rcutorture: Expand rcu_torture_write_types() first "if" statement

This commit expands the rcu_torture_write_types() function's first "if"
condition and body, placing one element per line, in order to make the
compiler's error messages more helpful.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 71d1af9c060e..fe1836aad646 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1195,12 +1195,29 @@ static void rcu_torture_write_types(void)
 	bool gp_poll_full1 = gp_poll_full, gp_sync1 = gp_sync;
 
 	/* Initialize synctype[] array.  If none set, take default. */
-	if (!gp_cond1 && !gp_cond_exp1 && !gp_cond_full1 && !gp_cond_exp_full1 && !gp_exp1 &&
-	    !gp_poll_exp1 && !gp_poll_exp_full1 && !gp_normal1 && !gp_poll1 && !gp_poll_full1 &&
-	    !gp_sync1)
-		gp_cond1 = gp_cond_exp1 = gp_cond_full1 = gp_cond_exp_full1 = gp_exp1 =
-			   gp_poll_exp1 = gp_poll_exp_full1 = gp_normal1 = gp_poll1 =
-			   gp_poll_full1 = gp_sync1 = true;
+	if (!gp_cond1 &&
+	    !gp_cond_exp1 &&
+	    !gp_cond_full1 &&
+	    !gp_cond_exp_full1 &&
+	    !gp_exp1 &&
+	    !gp_poll_exp1 &&
+	    !gp_poll_exp_full1 &&
+	    !gp_normal1 &&
+	    !gp_poll1 &&
+	    !gp_poll_full1 &&
+	    !gp_sync1) {
+		gp_cond1 = true;
+		gp_cond_exp1 = true;
+		gp_cond_full1 = true;
+		gp_cond_exp_full1 = true;
+		gp_exp1 = true;
+		gp_poll_exp1 = true;
+		gp_poll_exp_full1 = true;
+		gp_normal1 = true;
+		gp_poll1 = true;
+		gp_poll_full1 = true;
+		gp_sync1 = true;
+	}
 	if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) {
 		synctype[nsynctypes++] = RTWS_COND_GET;
 		pr_info("%s: Testing conditional GPs.\n", __func__);
-- 
cgit v1.2.3


From 18538248e5486b0f0e8581083de275176674cd1f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 24 Aug 2022 15:39:09 -0700
Subject: rcu: Add functions to compare grace-period state values

This commit adds same_state_synchronize_rcu() and
same_state_synchronize_rcu_full() functions to compare grace-period state
values, for example, those obtained from get_state_synchronize_rcu()
and get_state_synchronize_rcu_full().  These functions allow small
structures to omit these state values by placing them in list headers for
lists containing structures with the same token value.  Presumably the
per-structure list pointers are the same ones used to link the structures
into whatever reader-accessible data structure was used.

This commit also adds both NUM_ACTIVE_RCU_POLL_OLDSTATE and
NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE, which define the maximum number of
distinct unsigned long values and rcu_gp_oldstate values, respectively,
corresponding to not-yet-completed grace periods.  These values can be
used to size arrays of the list headers described above.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 21 +++++++++++++++++++++
 include/linux/rcutiny.h  | 14 ++++++++++++++
 include/linux/rcutree.h  | 28 ++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index faaa174dfb27..9941d5c3d5e1 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,27 @@ struct rcu_gp_oldstate;
 unsigned long get_completed_synchronize_rcu(void);
 void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
 
+// Maximum number of unsigned long values corresponding to
+// not-yet-completed RCU grace periods.
+#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2
+
+/**
+ * same_state_synchronize_rcu - Are two old-state values identical?
+ * @oldstate1: First old-state value.
+ * @oldstate2: Second old-state value.
+ *
+ * The two old-state values must have been obtained from either
+ * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or
+ * get_completed_synchronize_rcu().  Returns @true if the two values are
+ * identical and @false otherwise.  This allows structures whose lifetimes
+ * are tracked by old-state values to push these values to a list header,
+ * allowing those structures to be slightly smaller.
+ */
+static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2)
+{
+	return oldstate1 == oldstate2;
+}
+
 #ifdef CONFIG_PREEMPT_RCU
 
 void __rcu_read_lock(void);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 4405e9112cee..768196a5f39d 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -18,6 +18,20 @@ struct rcu_gp_oldstate {
 	unsigned long rgos_norm;
 };
 
+// Maximum number of rcu_gp_oldstate values corresponding to
+// not-yet-completed RCU grace periods.
+#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 2
+
+/*
+ * Are the two oldstate values the same?  See the Tree RCU version for
+ * docbook header.
+ */
+static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
+						   struct rcu_gp_oldstate *rgosp2)
+{
+	return rgosp1->rgos_norm == rgosp2->rgos_norm;
+}
+
 unsigned long get_state_synchronize_rcu(void);
 
 static inline void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 455a03bdce15..5efb51486e8a 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -46,6 +46,34 @@ struct rcu_gp_oldstate {
 	unsigned long rgos_exp;
 };
 
+// Maximum number of rcu_gp_oldstate values corresponding to
+// not-yet-completed RCU grace periods.
+#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4
+
+/**
+ * same_state_synchronize_rcu_full - Are two old-state values identical?
+ * @rgosp1: First old-state value.
+ * @rgosp2: Second old-state value.
+ *
+ * The two old-state values must have been obtained from either
+ * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(),
+ * or get_completed_synchronize_rcu_full().  Returns @true if the two
+ * values are identical and @false otherwise.  This allows structures
+ * whose lifetimes are tracked by old-state values to push these values
+ * to a list header, allowing those structures to be slightly smaller.
+ *
+ * Note that equality is judged on a bitwise basis, so that an
+ * @rcu_gp_oldstate structure with an already-completed state in one field
+ * will compare not-equal to a structure with an already-completed state
+ * in the other field.  After all, the @rcu_gp_oldstate structure is opaque
+ * so how did such a situation come to pass in the first place?
+ */
+static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
+						   struct rcu_gp_oldstate *rgosp2)
+{
+	return rgosp1->rgos_norm == rgosp2->rgos_norm && rgosp1->rgos_exp == rgosp2->rgos_exp;
+}
+
 unsigned long start_poll_synchronize_rcu_expedited(void);
 void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
 void cond_synchronize_rcu_expedited(unsigned long oldstate);
-- 
cgit v1.2.3


From 967c298d650e2a8562fb4ea488af36bdf2a71ac4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 24 Aug 2022 18:57:45 -0700
Subject: rcutorture: Limit read-side polling-API testing

RCU's polled grace-period API is reasonably lightweight, but still
contains heavyweight memory barriers.  This commit therefore limits
testing of this API from rcutorture's readers in order to avoid the
false negatives that these heavyweight operations could provoke.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index fe1836aad646..91103279d7b4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1903,6 +1903,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
  */
 static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
 {
+	bool checkpolling = !(torture_random(trsp) & 0xfff);
 	unsigned long cookie;
 	struct rcu_gp_oldstate cookie_full;
 	int i;
@@ -1920,10 +1921,12 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
 	WARN_ON_ONCE(!rcu_is_watching());
 	newstate = rcutorture_extend_mask(readstate, trsp);
 	rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
-	if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
-		cookie = cur_ops->get_gp_state();
-	if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
-		cur_ops->get_gp_state_full(&cookie_full);
+	if (checkpolling) {
+		if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+			cookie = cur_ops->get_gp_state();
+		if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+			cur_ops->get_gp_state_full(&cookie_full);
+	}
 	started = cur_ops->get_gp_seq();
 	ts = rcu_trace_clock_local();
 	p = rcu_dereference_check(rcu_torture_current,
@@ -1957,20 +1960,22 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
 	}
 	__this_cpu_inc(rcu_torture_batch[completed]);
 	preempt_enable();
-	if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
-		WARN_ONCE(cur_ops->poll_gp_state(cookie),
-			  "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
-			  __func__,
-			  rcu_torture_writer_state_getname(),
-			  rcu_torture_writer_state,
-			  cookie, cur_ops->get_gp_state());
-	if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
-		WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
-			  "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
-			  __func__,
-			  rcu_torture_writer_state_getname(),
-			  rcu_torture_writer_state,
-			  cpumask_pr_args(cpu_online_mask));
+	if (checkpolling) {
+		if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
+			WARN_ONCE(cur_ops->poll_gp_state(cookie),
+				  "%s: Cookie check 2 failed %s(%d) %lu->%lu\n",
+				  __func__,
+				  rcu_torture_writer_state_getname(),
+				  rcu_torture_writer_state,
+				  cookie, cur_ops->get_gp_state());
+		if (cur_ops->get_gp_state_full && cur_ops->poll_gp_state_full)
+			WARN_ONCE(cur_ops->poll_gp_state_full(&cookie_full),
+				  "%s: Cookie check 6 failed %s(%d) online %*pbl\n",
+				  __func__,
+				  rcu_torture_writer_state_getname(),
+				  rcu_torture_writer_state,
+				  cpumask_pr_args(cpu_online_mask));
+	}
 	rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
 	WARN_ON_ONCE(readstate);
 	// This next splat is expected behavior if leakpointer, especially
-- 
cgit v1.2.3


From 599d97e3f2236a0f02a59da05798e8af43d5ce72 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 Jul 2022 16:16:41 -0700
Subject: rcutorture: Make "srcud" option also test polled grace-period API

This commit brings the "srcud" (dynamically allocated) SRCU test in line
with the "srcu" (statically allocated) test, so that both test the full
SRCU polled grace-period API.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d8e1b270a065..7168dc8d61e9 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -709,6 +709,9 @@ static struct rcu_torture_ops srcud_ops = {
 	.deferred_free	= srcu_torture_deferred_free,
 	.sync		= srcu_torture_synchronize,
 	.exp_sync	= srcu_torture_synchronize_expedited,
+	.get_gp_state	= srcu_torture_get_gp_state,
+	.start_gp_poll	= srcu_torture_start_gp_poll,
+	.poll_gp_state	= srcu_torture_poll_gp_state,
 	.call		= srcu_torture_call,
 	.cb_barrier	= srcu_torture_barrier,
 	.stats		= srcu_torture_stats,
-- 
cgit v1.2.3


From d66e4cf974a53c1195f1f5a96387ee5dbad2bdf2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Aug 2022 10:18:23 -0700
Subject: srcu: Add GP and maximum requested GP to Tiny SRCU rcutorture output

This commit adds the ->srcu_idx and ->srcu_max_idx fields to the Tiny
SRCU rcutorture output for additional diagnostics.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutiny.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 6cfaa0a9a9b9..4fcec6f5af90 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -82,10 +82,12 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
 	int idx;
 
 	idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1;
-	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
+	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %hu->%hu\n",
 		 tt, tf, idx,
 		 data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])),
-		 data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])));
+		 data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])),
+		 data_race(READ_ONCE(ssp->srcu_idx)),
+		 data_race(READ_ONCE(ssp->srcu_idx_max)));
 }
 
 #endif
-- 
cgit v1.2.3


From de3f2671ae563d24c679dcca36c9e0ebd9564ebd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Aug 2022 11:59:49 -0700
Subject: srcu: Make Tiny SRCU poll_state_synchronize_srcu() more precise

This commit applies the more-precise grace-period-state check used by
rcu_seq_done_exact() to poll_state_synchronize_srcu().  This is important
because Tiny SRCU uses a 16-bit counter, which can wrap quite quickly.
If counter wrap continues to be a problem, then expanding ->srcu_idx
and ->srcu_idx_max to 32 bits might be warranted.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutiny.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 92c002d65482..a2af24f21467 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -240,10 +240,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
  */
 bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
 {
-	bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
+	unsigned short cur_s = READ_ONCE(ssp->srcu_idx);
 
 	barrier();
-	return ret;
+	return USHORT_CMP_GE(cur_s, cookie) || USHORT_CMP_LT(cur_s, cookie - 3);
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
 
-- 
cgit v1.2.3


From 5fe89191e43fb37e874b8e5177fb2a5c72379b06 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Aug 2022 15:32:47 -0700
Subject: srcu: Make Tiny SRCU use full-sized grace-period counters

This commit makes Tiny SRCU use full-sized grace-period counters to
further avoid counter-wrap issues when using polled grace-period APIs.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/srcutiny.h |  6 +++---
 kernel/rcu/srcutiny.c    | 14 +++++++-------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 4fcec6f5af90..5aa5e0faf6a1 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -15,10 +15,10 @@
 
 struct srcu_struct {
 	short srcu_lock_nesting[2];	/* srcu_read_lock() nesting depth. */
-	unsigned short srcu_idx;	/* Current reader array element in bit 0x2. */
-	unsigned short srcu_idx_max;	/* Furthest future srcu_idx request. */
 	u8 srcu_gp_running;		/* GP workqueue running? */
 	u8 srcu_gp_waiting;		/* GP waiting for readers? */
+	unsigned long srcu_idx;		/* Current reader array element in bit 0x2. */
+	unsigned long srcu_idx_max;	/* Furthest future srcu_idx request. */
 	struct swait_queue_head srcu_wq;
 					/* Last srcu_read_unlock() wakes GP. */
 	struct rcu_head *srcu_cb_head;	/* Pending callbacks: Head. */
@@ -82,7 +82,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
 	int idx;
 
 	idx = ((data_race(READ_ONCE(ssp->srcu_idx)) + 1) & 0x2) >> 1;
-	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %hu->%hu\n",
+	pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd) gp: %lu->%lu\n",
 		 tt, tf, idx,
 		 data_race(READ_ONCE(ssp->srcu_lock_nesting[!idx])),
 		 data_race(READ_ONCE(ssp->srcu_lock_nesting[idx])),
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index a2af24f21467..33adafdad261 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -117,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
 	struct srcu_struct *ssp;
 
 	ssp = container_of(wp, struct srcu_struct, srcu_work);
-	if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
 		return; /* Already running or nothing to do. */
 
 	/* Remove recently arrived callbacks and wait for readers. */
@@ -150,17 +150,17 @@ void srcu_drive_gp(struct work_struct *wp)
 	 * straighten that out.
 	 */
 	WRITE_ONCE(ssp->srcu_gp_running, false);
-	if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
 		schedule_work(&ssp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
 
 static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 {
-	unsigned short cookie;
+	unsigned long cookie;
 
 	cookie = get_state_synchronize_srcu(ssp);
-	if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
 		return;
 	WRITE_ONCE(ssp->srcu_idx_max, cookie);
 	if (!READ_ONCE(ssp->srcu_gp_running)) {
@@ -215,7 +215,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
 	barrier();
 	ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
 	barrier();
-	return ret & USHRT_MAX;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
 
@@ -240,10 +240,10 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
  */
 bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
 {
-	unsigned short cur_s = READ_ONCE(ssp->srcu_idx);
+	unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
 
 	barrier();
-	return USHORT_CMP_GE(cur_s, cookie) || USHORT_CMP_LT(cur_s, cookie - 3);
+	return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
 }
 EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
 
-- 
cgit v1.2.3


From fcd53c8a4dfa38bafb89efdd0b0f718f3a03f884 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Tue, 12 Jul 2022 16:26:05 +0800
Subject: rcu-tasks: Convert RCU_LOCKDEP_WARN() to WARN_ONCE()

Kernels built with CONFIG_PROVE_RCU=y and CONFIG_DEBUG_LOCK_ALLOC=y
attempt to emit a warning when the synchronize_rcu_tasks_generic()
function is called during early boot while the rcu_scheduler_active
variable is RCU_SCHEDULER_INACTIVE.  However the warnings is not
actually be printed because the debug_lockdep_rcu_enabled() returns
false, exactly because the rcu_scheduler_active variable is still equal
to RCU_SCHEDULER_INACTIVE.

This commit therefore replaces RCU_LOCKDEP_WARN() with WARN_ONCE()
to force these warnings to actually be printed.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 83c7e6620d40..469bf2a3b505 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
 {
 	/* Complain if the scheduler has not started.  */
-	RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
+	WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
 			 "synchronize_rcu_tasks called too soon");
 
 	// If the grace-period kthread is running, use it.
-- 
cgit v1.2.3


From d6ad60635cafe900bcd11ad588d8accb36c36b1b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 18 Jul 2022 10:57:26 -0700
Subject: rcu-tasks: Ensure RCU Tasks Trace loops have quiescent states

The RCU Tasks Trace grace-period kthread loops across all CPUs, and
there can be quite a few CPUs, with some commercially available systems
sporting well over a thousand of them.  Some of these loops can feature
IPIs, which can take some time.  This commit therefore places a call to
cond_resched_tasks_rcu_qs() in each such loop.

Link: https://docs.google.com/document/d/1V0YnG1HTWMt9WHJjroiJL9lf-hMrud4v8Fn3fhyY0cI/edit?usp=sharing
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 469bf2a3b505..f5bf6fb430da 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
 		if (rcu_tasks_trace_pertask_prep(t, true))
 			trc_add_holdout(t, hop);
 		rcu_read_unlock();
+		cond_resched_tasks_rcu_qs();
 	}
 
 	// Only after all running tasks have been accounted for is it
@@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
 			raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
 		}
 		raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
+		cond_resched_tasks_rcu_qs();
 	}
 
 	// Re-enable CPU hotplug now that the holdout list is populated.
@@ -1619,6 +1621,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
 			trc_del_holdout(t);
 		else if (needreport)
 			show_stalled_task_trace(t, firstreport);
+		cond_resched_tasks_rcu_qs();
 	}
 
 	// Re-enable CPU hotplug now that the holdout list scan has completed.
-- 
cgit v1.2.3


From 528262f50274079740b53e29bcaaabf219aa7417 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Tue, 19 Jul 2022 12:39:00 +0800
Subject: rcu-tasks: Make RCU Tasks Trace check for userspace execution

Userspace execution is a valid quiescent state for RCU Tasks Trace,
but the scheduling-clock interrupt does not currently report such
quiescent states.

Of course, the scheduling-clock interrupt is not strictly speaking
userspace execution.  However, the only way that this code is not
in a quiescent state is if something invoked rcu_read_lock_trace(),
and that would be reflected in the ->trc_reader_nesting field in
the task_struct structure.  Furthermore, this field is checked by
rcu_tasks_trace_qs(), which is invoked by rcu_tasks_qs() which is in
turn invoked by rcu_note_voluntary_context_switch() in kernels building
at least one of the RCU Tasks flavors.  It is therefore safe to invoke
rcu_tasks_trace_qs() from the rcu_sched_clock_irq().

But rcu_tasks_qs() also invokes rcu_tasks_classic_qs() for RCU
Tasks, which lacks the read-side markers provided by RCU Tasks Trace.
This raises the possibility that an RCU Tasks grace period could start
after the interrupt from userspace execution, but before the call to
rcu_sched_clock_irq().  However, it turns out that this is safe because
the RCU Tasks grace period waits for an RCU grace period, which will
wait for the entire scheduling-clock interrupt handler, including any
RCU Tasks read-side critical section that this handler might contain.

This commit therefore updates the rcu_sched_clock_irq() function's
check for usermode execution and its call to rcu_tasks_classic_qs()
to instead check for both usermode execution and interrupt from idle,
and to instead call rcu_note_voluntary_context_switch().  This
consolidates code and provides more faster RCU Tasks Trace
reporting of quiescent states in kernels that do scheduling-clock
interrupts for userspace execution.

[ paulmck: Consolidate checks into rcu_sched_clock_irq(). ]

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c        | 4 ++--
 kernel/rcu/tree_plugin.h | 4 ----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79aea7df4345..11d5aefd1696 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2341,8 +2341,8 @@ void rcu_sched_clock_irq(int user)
 	rcu_flavor_sched_clock_irq(user);
 	if (rcu_pending(user))
 		invoke_rcu_core();
-	if (user)
-		rcu_tasks_classic_qs(current, false);
+	if (user || rcu_is_cpu_rrupt_from_idle())
+		rcu_note_voluntary_context_switch(current);
 	lockdep_assert_irqs_disabled();
 
 	trace_rcu_utilization(TPS("End scheduler-tick"));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 438ecae6bd7e..aa64b035c24f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -718,9 +718,6 @@ static void rcu_flavor_sched_clock_irq(int user)
 	struct task_struct *t = current;
 
 	lockdep_assert_irqs_disabled();
-	if (user || rcu_is_cpu_rrupt_from_idle()) {
-		rcu_note_voluntary_context_switch(current);
-	}
 	if (rcu_preempt_depth() > 0 ||
 	    (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
 		/* No QS, force context switch if deferred. */
@@ -972,7 +969,6 @@ static void rcu_flavor_sched_clock_irq(int user)
 		 * neither access nor modify, at least not while the
 		 * corresponding CPU is online.
 		 */
-
 		rcu_qs();
 	}
 }
-- 
cgit v1.2.3


From 203185f6b1e393cf2f4b84023d4d01d515fcfd5f Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Mon, 20 Jun 2022 17:17:49 +0900
Subject: docs/memory-barriers.txt: Fix confusing name of 'data dependency
 barrier'

The term "data dependency barrier", which has been in
memory-barriers.txt ever since it was first authored by David Howells,
has become confusing due to the fact that in LKMM's explanations.txt
and elsewhere, "data dependency" is used mostly for load-to-store data
dependency.

To prevent further confusions, do the changes listed below:

  - substitute "data dependency barrier" with "address-dependency
    barrier";
  - add note on the removal of kernel APIs for explicit address-
    dependency barriers in kernel release v5.9;
  - note that address-dependency barriers are not necessary for
    load-to-store situations;
  - use READ_ONCE_OLD() for pre-4.15 READ_ONCE() (no implicit address-
    dependency barrier);
  - fix count of kernel memory barrier APIs;
  - and a few more context adjustments.

Note: Cleanups of long lines are deferred to a followup patch.

Reported-by: "Michael S. Tsirkin" <mst@redhat.com>
Link: https://lore.kernel.org/r/20211011064233-mutt-send-email-mst@kernel.org/
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Daniel Lustig <dlustig@nvidia.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/memory-barriers.txt | 116 +++++++++++++++++++++-----------------
 1 file changed, 64 insertions(+), 52 deletions(-)

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 832b5d36e279..b16767cb6d31 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -52,7 +52,7 @@ CONTENTS
 
      - Varieties of memory barrier.
      - What may not be assumed about memory barriers?
-     - Data dependency barriers (historical).
+     - Address-dependency barriers (historical).
      - Control dependencies.
      - SMP barrier pairing.
      - Examples of memory barrier sequences.
@@ -187,7 +187,7 @@ As a further example, consider this sequence of events:
 	B = 4;		Q = P;
 	P = &B;		D = *Q;
 
-There is an obvious data dependency here, as the value loaded into D depends on
+There is an obvious address dependency here, as the value loaded into D depends on
 the address retrieved from P by CPU 2.  At the end of the sequence, any of the
 following results are possible:
 
@@ -391,49 +391,53 @@ Memory barriers come in four basic varieties:
      memory system as time progresses.  All stores _before_ a write barrier
      will occur _before_ all the stores after the write barrier.
 
-     [!] Note that write barriers should normally be paired with read or data
-     dependency barriers; see the "SMP barrier pairing" subsection.
+     [!] Note that write barriers should normally be paired with read or
+     address-dependency barriers; see the "SMP barrier pairing" subsection.
 
 
- (2) Data dependency barriers.
+ (2) Address-dependency barriers (historical).
 
-     A data dependency barrier is a weaker form of read barrier.  In the case
+     An address-dependency barrier is a weaker form of read barrier.  In the case
      where two loads are performed such that the second depends on the result
      of the first (eg: the first load retrieves the address to which the second
-     load will be directed), a data dependency barrier would be required to
+     load will be directed), an address-dependency barrier would be required to
      make sure that the target of the second load is updated after the address
      obtained by the first load is accessed.
 
-     A data dependency barrier is a partial ordering on interdependent loads
+     An address-dependency barrier is a partial ordering on interdependent loads
      only; it is not required to have any effect on stores, independent loads
      or overlapping loads.
 
      As mentioned in (1), the other CPUs in the system can be viewed as
      committing sequences of stores to the memory system that the CPU being
-     considered can then perceive.  A data dependency barrier issued by the CPU
+     considered can then perceive.  An address-dependency barrier issued by the CPU
      under consideration guarantees that for any load preceding it, if that
      load touches one of a sequence of stores from another CPU, then by the
      time the barrier completes, the effects of all the stores prior to that
-     touched by the load will be perceptible to any loads issued after the data
+     touched by the load will be perceptible to any loads issued after the address-
      dependency barrier.
 
      See the "Examples of memory barrier sequences" subsection for diagrams
      showing the ordering constraints.
 
-     [!] Note that the first load really has to have a _data_ dependency and
+     [!] Note that the first load really has to have an _address_ dependency and
      not a control dependency.  If the address for the second load is dependent
      on the first load, but the dependency is through a conditional rather than
      actually loading the address itself, then it's a _control_ dependency and
      a full read barrier or better is required.  See the "Control dependencies"
      subsection for more information.
 
-     [!] Note that data dependency barriers should normally be paired with
+     [!] Note that address-dependency barriers should normally be paired with
      write barriers; see the "SMP barrier pairing" subsection.
 
+     [!] Kernel release v5.9 removed kernel APIs for explicit address-
+     dependency barriers.  Nowadays, APIs for marking loads from shared
+     variables such as READ_ONCE() and rcu_dereference() provide implicit
+     address-dependency barriers.
 
  (3) Read (or load) memory barriers.
 
-     A read barrier is a data dependency barrier plus a guarantee that all the
+     A read barrier is an address-dependency barrier plus a guarantee that all the
      LOAD operations specified before the barrier will appear to happen before
      all the LOAD operations specified after the barrier with respect to the
      other components of the system.
@@ -441,7 +445,7 @@ Memory barriers come in four basic varieties:
      A read barrier is a partial ordering on loads only; it is not required to
      have any effect on stores.
 
-     Read memory barriers imply data dependency barriers, and so can substitute
+     Read memory barriers imply address-dependency barriers, and so can substitute
      for them.
 
      [!] Note that read barriers should normally be paired with write barriers;
@@ -550,17 +554,21 @@ There are certain things that the Linux kernel memory barriers do not guarantee:
 	    Documentation/core-api/dma-api.rst
 
 
-DATA DEPENDENCY BARRIERS (HISTORICAL)
--------------------------------------
+ADDRESS-DEPENDENCY BARRIERS (HISTORICAL)
+----------------------------------------
 
 As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for
 DEC Alpha, which means that about the only people who need to pay attention
 to this section are those working on DEC Alpha architecture-specific code
 and those working on READ_ONCE() itself.  For those who need it, and for
 those who are interested in the history, here is the story of
-data-dependency barriers.
+address-dependency barriers.
+
+[!] While address dependencies are observed in both load-to-load and
+load-to-store relations, address-dependency barriers are not necessary
+for load-to-store situations.
 
-The usage requirements of data dependency barriers are a little subtle, and
+The requirement of address-dependency barriers is a little subtle, and
 it's not always obvious that they're needed.  To illustrate, consider the
 following sequence of events:
 
@@ -570,10 +578,13 @@ following sequence of events:
 	B = 4;
 	<write barrier>
 	WRITE_ONCE(P, &B);
-			      Q = READ_ONCE(P);
+			      Q = READ_ONCE_OLD(P);
 			      D = *Q;
 
-There's a clear data dependency here, and it would seem that by the end of the
+[!] READ_ONCE_OLD() corresponds to READ_ONCE() of pre-4.15 kernel, which
+doesn't imply an address-dependency barrier.
+
+There's a clear address dependency here, and it would seem that by the end of the
 sequence, Q must be either &A or &B, and that:
 
 	(Q == &A) implies (D == 1)
@@ -588,8 +599,8 @@ While this may seem like a failure of coherency or causality maintenance, it
 isn't, and this behaviour can be observed on certain real CPUs (such as the DEC
 Alpha).
 
-To deal with this, a data dependency barrier or better must be inserted
-between the address load and the data load:
+To deal with this, READ_ONCE() provides an implicit address-dependency
+barrier since kernel release v4.15:
 
 	CPU 1		      CPU 2
 	===============	      ===============
@@ -598,7 +609,7 @@ between the address load and the data load:
 	<write barrier>
 	WRITE_ONCE(P, &B);
 			      Q = READ_ONCE(P);
-			      <data dependency barrier>
+			      <implicit address-dependency barrier>
 			      D = *Q;
 
 This enforces the occurrence of one of the two implications, and prevents the
@@ -615,7 +626,7 @@ odd-numbered bank is idle, one can see the new value of the pointer P (&B),
 but the old value of the variable B (2).
 
 
-A data-dependency barrier is not required to order dependent writes
+An address-dependency barrier is not required to order dependent writes
 because the CPUs that the Linux kernel supports don't do writes
 until they are certain (1) that the write will actually happen, (2)
 of the location of the write, and (3) of the value to be written.
@@ -629,12 +640,12 @@ break dependencies in a great many highly creative ways.
 	B = 4;
 	<write barrier>
 	WRITE_ONCE(P, &B);
-			      Q = READ_ONCE(P);
+			      Q = READ_ONCE_OLD(P);
 			      WRITE_ONCE(*Q, 5);
 
-Therefore, no data-dependency barrier is required to order the read into
+Therefore, no address-dependency barrier is required to order the read into
 Q with the store into *Q.  In other words, this outcome is prohibited,
-even without a data-dependency barrier:
+even without an implicit address-dependency barrier of modern READ_ONCE():
 
 	(Q == &B) && (B == 4)
 
@@ -645,12 +656,12 @@ can be used to record rare error conditions and the like, and the CPUs'
 naturally occurring ordering prevents such records from being lost.
 
 
-Note well that the ordering provided by a data dependency is local to
+Note well that the ordering provided by an address dependency is local to
 the CPU containing it.  See the section on "Multicopy atomicity" for
 more information.
 
 
-The data dependency barrier is very important to the RCU system,
+The address-dependency barrier is very important to the RCU system,
 for example.  See rcu_assign_pointer() and rcu_dereference() in
 include/linux/rcupdate.h.  This permits the current target of an RCU'd
 pointer to be replaced with a new modified target, without the replacement
@@ -667,16 +678,17 @@ not understand them.  The purpose of this section is to help you prevent
 the compiler's ignorance from breaking your code.
 
 A load-load control dependency requires a full read memory barrier, not
-simply a data dependency barrier to make it work correctly.  Consider the
+simply an (implicit) address-dependency barrier to make it work correctly.  Consider the
 following bit of code:
 
 	q = READ_ONCE(a);
+	<implicit address-dependency barrier>
 	if (q) {
-		<data dependency barrier>  /* BUG: No data dependency!!! */
+		/* BUG: No address dependency!!! */
 		p = READ_ONCE(b);
 	}
 
-This will not have the desired effect because there is no actual data
+This will not have the desired effect because there is no actual address
 dependency, but rather a control dependency that the CPU may short-circuit
 by attempting to predict the outcome in advance, so that other CPUs see
 the load from b as having happened before the load from a.  In such a
@@ -927,9 +939,9 @@ General barriers pair with each other, though they also pair with most
 other types of barriers, albeit without multicopy atomicity.  An acquire
 barrier pairs with a release barrier, but both may also pair with other
 barriers, including of course general barriers.  A write barrier pairs
-with a data dependency barrier, a control dependency, an acquire barrier,
+with an address-dependency barrier, a control dependency, an acquire barrier,
 a release barrier, a read barrier, or a general barrier.  Similarly a
-read barrier, control dependency, or a data dependency barrier pairs
+read barrier, control dependency, or an address-dependency barrier pairs
 with a write barrier, an acquire barrier, a release barrier, or a
 general barrier:
 
@@ -948,7 +960,7 @@ Or:
 	a = 1;
 	<write barrier>
 	WRITE_ONCE(b, &a);    x = READ_ONCE(b);
-			      <data dependency barrier>
+			      <implicit address-dependency barrier>
 			      y = *x;
 
 Or even:
@@ -968,7 +980,7 @@ Basically, the read barrier always has to be there, even though it can be of
 the "weaker" type.
 
 [!] Note that the stores before the write barrier would normally be expected to
-match the loads after the read barrier or the data dependency barrier, and vice
+match the loads after the read barrier or the address-dependency barrier, and vice
 versa:
 
 	CPU 1                               CPU 2
@@ -1021,7 +1033,7 @@ STORE B, STORE C } all occurring before the unordered set of { STORE D, STORE E
 	                   V
 
 
-Secondly, data dependency barriers act as partial orderings on data-dependent
+Secondly, address-dependency barriers act as partial orderings on address-dependent
 loads.  Consider the following sequence of events:
 
 	CPU 1			CPU 2
@@ -1067,7 +1079,7 @@ effectively random order, despite the write barrier issued by CPU 1:
 In the above example, CPU 2 perceives that B is 7, despite the load of *C
 (which would be B) coming after the LOAD of C.
 
-If, however, a data dependency barrier were to be placed between the load of C
+If, however, an address-dependency barrier were to be placed between the load of C
 and the load of *C (ie: B) on CPU 2:
 
 	CPU 1			CPU 2
@@ -1078,7 +1090,7 @@ and the load of *C (ie: B) on CPU 2:
 	<write barrier>
 	STORE C = &B		LOAD X
 	STORE D = 4		LOAD C (gets &B)
-				<data dependency barrier>
+				<address-dependency barrier>
 				LOAD *C (reads B)
 
 then the following will occur:
@@ -1101,7 +1113,7 @@ then the following will occur:
 	                               |        +-------+       |       |
 	                               |        | X->9  |------>|       |
 	                               |        +-------+       |       |
-	  Makes sure all effects --->   \   ddddddddddddddddd   |       |
+	  Makes sure all effects --->   \   aaaaaaaaaaaaaaaaa   |       |
 	  prior to the store of C        \      +-------+       |       |
 	  are perceptible to              ----->| B->2  |------>|       |
 	  subsequent loads                      +-------+       |       |
@@ -1292,7 +1304,7 @@ Which might appear as this:
 	LOAD with immediate effect              :       :       +-------+
 
 
-Placing a read barrier or a data dependency barrier just before the second
+Placing a read barrier or an address-dependency barrier just before the second
 load:
 
 	CPU 1			CPU 2
@@ -1816,20 +1828,20 @@ which may then reorder things however it wishes.
 CPU MEMORY BARRIERS
 -------------------
 
-The Linux kernel has eight basic CPU memory barriers:
+The Linux kernel has seven basic CPU memory barriers:
 
-	TYPE		MANDATORY		SMP CONDITIONAL
-	===============	=======================	===========================
-	GENERAL		mb()			smp_mb()
-	WRITE		wmb()			smp_wmb()
-	READ		rmb()			smp_rmb()
-	DATA DEPENDENCY				READ_ONCE()
+	TYPE			MANDATORY	SMP CONDITIONAL
+	=======================	===============	===============
+	GENERAL			mb()		smp_mb()
+	WRITE			wmb()		smp_wmb()
+	READ			rmb()		smp_rmb()
+	ADDRESS DEPENDENCY			READ_ONCE()
 
 
-All memory barriers except the data dependency barriers imply a compiler
-barrier.  Data dependencies do not impose any additional compiler ordering.
+All memory barriers except the address-dependency barriers imply a compiler
+barrier.  Address dependencies do not impose any additional compiler ordering.
 
-Aside: In the case of data dependencies, the compiler would be expected
+Aside: In the case of address dependencies, the compiler would be expected
 to issue the loads in the correct order (eg. `a[b]` would have to load
 the value of b before loading a[b]), however there is no guarantee in
 the C specification that the compiler may not speculate the value of b
@@ -2889,7 +2901,7 @@ AND THEN THERE'S THE ALPHA
 The DEC Alpha CPU is one of the most relaxed CPUs there is.  Not only that,
 some versions of the Alpha CPU have a split data cache, permitting them to have
 two semantically-related cache lines updated at separate times.  This is where
-the data dependency barrier really becomes necessary as this synchronises both
+the address-dependency barrier really becomes necessary as this synchronises both
 caches with the memory coherence system, thus making it seem like pointer
 changes vs new data occur in the right order.
 
-- 
cgit v1.2.3


From f556082dd7fc5a97326e44aae66199cc639a671c Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Mon, 20 Jun 2022 17:19:35 +0900
Subject: docs/memory-barriers.txt: Fixup long lines

Substitution of "data dependency barrier" with "address-dependency
barrier" left quite a lot of lines exceeding 80 columns.

Reflow those lines as well as a few short ones not related to
the substitution.

No changes in documentation text.

Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Will Deacon <will@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Andrea Parri <parri.andrea@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Daniel Lustig <dlustig@nvidia.com>
Cc: Joel Fernandes <joel@joelfernandes.org>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/memory-barriers.txt | 93 ++++++++++++++++++++-------------------
 1 file changed, 47 insertions(+), 46 deletions(-)

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index b16767cb6d31..06f80e3785c5 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -187,9 +187,9 @@ As a further example, consider this sequence of events:
 	B = 4;		Q = P;
 	P = &B;		D = *Q;
 
-There is an obvious address dependency here, as the value loaded into D depends on
-the address retrieved from P by CPU 2.  At the end of the sequence, any of the
-following results are possible:
+There is an obvious address dependency here, as the value loaded into D depends
+on the address retrieved from P by CPU 2.  At the end of the sequence, any of
+the following results are possible:
 
 	(Q == &A) and (D == 1)
 	(Q == &B) and (D == 2)
@@ -397,25 +397,25 @@ Memory barriers come in four basic varieties:
 
  (2) Address-dependency barriers (historical).
 
-     An address-dependency barrier is a weaker form of read barrier.  In the case
-     where two loads are performed such that the second depends on the result
-     of the first (eg: the first load retrieves the address to which the second
-     load will be directed), an address-dependency barrier would be required to
-     make sure that the target of the second load is updated after the address
-     obtained by the first load is accessed.
+     An address-dependency barrier is a weaker form of read barrier.  In the
+     case where two loads are performed such that the second depends on the
+     result of the first (eg: the first load retrieves the address to which
+     the second load will be directed), an address-dependency barrier would
+     be required to make sure that the target of the second load is updated
+     after the address obtained by the first load is accessed.
 
-     An address-dependency barrier is a partial ordering on interdependent loads
-     only; it is not required to have any effect on stores, independent loads
-     or overlapping loads.
+     An address-dependency barrier is a partial ordering on interdependent
+     loads only; it is not required to have any effect on stores, independent
+     loads or overlapping loads.
 
      As mentioned in (1), the other CPUs in the system can be viewed as
      committing sequences of stores to the memory system that the CPU being
-     considered can then perceive.  An address-dependency barrier issued by the CPU
-     under consideration guarantees that for any load preceding it, if that
-     load touches one of a sequence of stores from another CPU, then by the
-     time the barrier completes, the effects of all the stores prior to that
-     touched by the load will be perceptible to any loads issued after the address-
-     dependency barrier.
+     considered can then perceive.  An address-dependency barrier issued by
+     the CPU under consideration guarantees that for any load preceding it,
+     if that load touches one of a sequence of stores from another CPU, then
+     by the time the barrier completes, the effects of all the stores prior to
+     that touched by the load will be perceptible to any loads issued after
+     the address-dependency barrier.
 
      See the "Examples of memory barrier sequences" subsection for diagrams
      showing the ordering constraints.
@@ -437,16 +437,16 @@ Memory barriers come in four basic varieties:
 
  (3) Read (or load) memory barriers.
 
-     A read barrier is an address-dependency barrier plus a guarantee that all the
-     LOAD operations specified before the barrier will appear to happen before
-     all the LOAD operations specified after the barrier with respect to the
-     other components of the system.
+     A read barrier is an address-dependency barrier plus a guarantee that all
+     the LOAD operations specified before the barrier will appear to happen
+     before all the LOAD operations specified after the barrier with respect to
+     the other components of the system.
 
      A read barrier is a partial ordering on loads only; it is not required to
      have any effect on stores.
 
-     Read memory barriers imply address-dependency barriers, and so can substitute
-     for them.
+     Read memory barriers imply address-dependency barriers, and so can
+     substitute for them.
 
      [!] Note that read barriers should normally be paired with write barriers;
      see the "SMP barrier pairing" subsection.
@@ -584,8 +584,8 @@ following sequence of events:
 [!] READ_ONCE_OLD() corresponds to READ_ONCE() of pre-4.15 kernel, which
 doesn't imply an address-dependency barrier.
 
-There's a clear address dependency here, and it would seem that by the end of the
-sequence, Q must be either &A or &B, and that:
+There's a clear address dependency here, and it would seem that by the end of
+the sequence, Q must be either &A or &B, and that:
 
 	(Q == &A) implies (D == 1)
 	(Q == &B) implies (D == 4)
@@ -599,8 +599,8 @@ While this may seem like a failure of coherency or causality maintenance, it
 isn't, and this behaviour can be observed on certain real CPUs (such as the DEC
 Alpha).
 
-To deal with this, READ_ONCE() provides an implicit address-dependency
-barrier since kernel release v4.15:
+To deal with this, READ_ONCE() provides an implicit address-dependency barrier
+since kernel release v4.15:
 
 	CPU 1		      CPU 2
 	===============	      ===============
@@ -627,12 +627,12 @@ but the old value of the variable B (2).
 
 
 An address-dependency barrier is not required to order dependent writes
-because the CPUs that the Linux kernel supports don't do writes
-until they are certain (1) that the write will actually happen, (2)
-of the location of the write, and (3) of the value to be written.
+because the CPUs that the Linux kernel supports don't do writes until they
+are certain (1) that the write will actually happen, (2) of the location of
+the write, and (3) of the value to be written.
 But please carefully read the "CONTROL DEPENDENCIES" section and the
-Documentation/RCU/rcu_dereference.rst file:  The compiler can and does
-break dependencies in a great many highly creative ways.
+Documentation/RCU/rcu_dereference.rst file:  The compiler can and does break
+dependencies in a great many highly creative ways.
 
 	CPU 1		      CPU 2
 	===============	      ===============
@@ -678,8 +678,8 @@ not understand them.  The purpose of this section is to help you prevent
 the compiler's ignorance from breaking your code.
 
 A load-load control dependency requires a full read memory barrier, not
-simply an (implicit) address-dependency barrier to make it work correctly.  Consider the
-following bit of code:
+simply an (implicit) address-dependency barrier to make it work correctly.
+Consider the following bit of code:
 
 	q = READ_ONCE(a);
 	<implicit address-dependency barrier>
@@ -691,8 +691,8 @@ following bit of code:
 This will not have the desired effect because there is no actual address
 dependency, but rather a control dependency that the CPU may short-circuit
 by attempting to predict the outcome in advance, so that other CPUs see
-the load from b as having happened before the load from a.  In such a
-case what's actually required is:
+the load from b as having happened before the load from a.  In such a case
+what's actually required is:
 
 	q = READ_ONCE(a);
 	if (q) {
@@ -980,8 +980,8 @@ Basically, the read barrier always has to be there, even though it can be of
 the "weaker" type.
 
 [!] Note that the stores before the write barrier would normally be expected to
-match the loads after the read barrier or the address-dependency barrier, and vice
-versa:
+match the loads after the read barrier or the address-dependency barrier, and
+vice versa:
 
 	CPU 1                               CPU 2
 	===================                 ===================
@@ -1033,8 +1033,8 @@ STORE B, STORE C } all occurring before the unordered set of { STORE D, STORE E
 	                   V
 
 
-Secondly, address-dependency barriers act as partial orderings on address-dependent
-loads.  Consider the following sequence of events:
+Secondly, address-dependency barriers act as partial orderings on address-
+dependent loads.  Consider the following sequence of events:
 
 	CPU 1			CPU 2
 	=======================	=======================
@@ -1079,8 +1079,8 @@ effectively random order, despite the write barrier issued by CPU 1:
 In the above example, CPU 2 perceives that B is 7, despite the load of *C
 (which would be B) coming after the LOAD of C.
 
-If, however, an address-dependency barrier were to be placed between the load of C
-and the load of *C (ie: B) on CPU 2:
+If, however, an address-dependency barrier were to be placed between the load
+of C and the load of *C (ie: B) on CPU 2:
 
 	CPU 1			CPU 2
 	=======================	=======================
@@ -2761,7 +2761,8 @@ is discarded from the CPU's cache and reloaded.  To deal with this, the
 appropriate part of the kernel must invalidate the overlapping bits of the
 cache on each CPU.
 
-See Documentation/core-api/cachetlb.rst for more information on cache management.
+See Documentation/core-api/cachetlb.rst for more information on cache
+management.
 
 
 CACHE COHERENCY VS MMIO
@@ -2901,8 +2902,8 @@ AND THEN THERE'S THE ALPHA
 The DEC Alpha CPU is one of the most relaxed CPUs there is.  Not only that,
 some versions of the Alpha CPU have a split data cache, permitting them to have
 two semantically-related cache lines updated at separate times.  This is where
-the address-dependency barrier really becomes necessary as this synchronises both
-caches with the memory coherence system, thus making it seem like pointer
+the address-dependency barrier really becomes necessary as this synchronises
+both caches with the memory coherence system, thus making it seem like pointer
 changes vs new data occur in the right order.
 
 The Alpha defines the Linux kernel's memory model, although as of v4.15
-- 
cgit v1.2.3


From be94ecf7608cc11ff46442012e710bb8fb139b99 Mon Sep 17 00:00:00 2001
From: Paul Heidekrüger <paul.heidekrueger@in.tum.de>
Date: Tue, 14 Jun 2022 15:48:11 +0000
Subject: tools/memory-model: Clarify LKMM's limitations in litmus-tests.txt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As discussed, clarify LKMM not recognizing certain kinds of orderings.
In particular, highlight the fact that LKMM might deliberately make
weaker guarantees than compilers and architectures.

[ paulmck: Fix whitespace issue noted by checkpatch.pl. ]

Link: https://lore.kernel.org/all/YpoW1deb%2FQeeszO1@ethstick13.dse.in.tum.de/T/#u
Co-developed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Paul Heidekrüger <paul.heidekrueger@in.tum.de>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Charalampos Mainas <charalampos.mainas@gmail.com>
Cc: Pramod Bhatotia <pramod.bhatotia@in.tum.de>
Cc: Soham Chakraborty <s.s.chakraborty@tudelft.nl>
Cc: Martin Fink <martin.fink@in.tum.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/Documentation/litmus-tests.txt | 37 +++++++++++++++++------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt
index 8a9d5d2787f9..26554b1c5575 100644
--- a/tools/memory-model/Documentation/litmus-tests.txt
+++ b/tools/memory-model/Documentation/litmus-tests.txt
@@ -946,22 +946,39 @@ Limitations of the Linux-kernel memory model (LKMM) include:
 	carrying a dependency, then the compiler can break that dependency
 	by substituting a constant of that value.
 
-	Conversely, LKMM sometimes doesn't recognize that a particular
-	optimization is not allowed, and as a result, thinks that a
-	dependency is not present (because the optimization would break it).
-	The memory model misses some pretty obvious control dependencies
-	because of this limitation.  A simple example is:
+	Conversely, LKMM will sometimes overestimate the amount of
+	reordering compilers and CPUs can carry out, leading it to miss
+	some pretty obvious cases of ordering.  A simple example is:
 
 		r1 = READ_ONCE(x);
 		if (r1 == 0)
 			smp_mb();
 		WRITE_ONCE(y, 1);
 
-	There is a control dependency from the READ_ONCE to the WRITE_ONCE,
-	even when r1 is nonzero, but LKMM doesn't realize this and thinks
-	that the write may execute before the read if r1 != 0.  (Yes, that
-	doesn't make sense if you think about it, but the memory model's
-	intelligence is limited.)
+	The WRITE_ONCE() does not depend on the READ_ONCE(), and as a
+	result, LKMM does not claim ordering.  However, even though no
+	dependency is present, the WRITE_ONCE() will not be executed before
+	the READ_ONCE().  There are two reasons for this:
+
+                The presence of the smp_mb() in one of the branches
+                prevents the compiler from moving the WRITE_ONCE()
+                up before the "if" statement, since the compiler has
+                to assume that r1 will sometimes be 0 (but see the
+                comment below);
+
+                CPUs do not execute stores before po-earlier conditional
+                branches, even in cases where the store occurs after the
+                two arms of the branch have recombined.
+
+	It is clear that it is not dangerous in the slightest for LKMM to
+	make weaker guarantees than architectures.  In fact, it is
+	desirable, as it gives compilers room for making optimizations.
+	For instance, suppose that a 0 value in r1 would trigger undefined
+	behavior elsewhere.  Then a clever compiler might deduce that r1
+	can never be 0 in the if condition.  As a result, said clever
+	compiler might deem it safe to optimize away the smp_mb(),
+	eliminating the branch and any ordering an architecture would
+	guarantee otherwise.
 
 2.	Multiple access sizes for a single variable are not supported,
 	and neither are misaligned or partially overlapping accesses.
-- 
cgit v1.2.3


From a30d551f34df66d739949a6140b50496afa36f66 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:32 +0200
Subject: tools/nolibc: make argc 32-bit in riscv startup code

The "ld a0, 0(sp)" instruction doesn't build on RISCV32 because that
would load a 64-bit value into a 32-bit register. But argc 32-bit,
not 64, so we ought to use "lw" here. Tested on both RISCV32 and
RISCV64.

Cc: Pranith Kumar <bobby.prani@gmail.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/arch-riscv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h
index 95e2b7924925..ba04771cb3a3 100644
--- a/tools/include/nolibc/arch-riscv.h
+++ b/tools/include/nolibc/arch-riscv.h
@@ -190,7 +190,7 @@ __asm__ (".section .text\n"
     ".option norelax\n"
     "lla   gp, __global_pointer$\n"
     ".option pop\n"
-    "ld    a0, 0(sp)\n"          // argc (a0) was in the stack
+    "lw    a0, 0(sp)\n"          // argc (a0) was in the stack
     "add   a1, sp, "SZREG"\n"    // argv (a1) = sp
     "slli  a2, a0, "PTRLOG"\n"   // envp (a2) = SZREG*argc ...
     "add   a2, a2, "SZREG"\n"    //             + SZREG (skip null)
-- 
cgit v1.2.3


From 8b53e83b08cfdc3f430b5415cd1031d5e7e1f935 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:33 +0200
Subject: tools/nolibc: fix build warning in sys_mmap() when my_syscall6 is not
 defined

We return -ENOSYS when there's no syscall6() operation, but we must cast
it to void* to avoid a warning.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/sys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index 08491070387b..b8c96878c9ce 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -692,7 +692,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
 {
 #ifndef my_syscall6
 	/* Function not implemented. */
-	return -ENOSYS;
+	return (void *)-ENOSYS;
 #else
 
 	int n;
-- 
cgit v1.2.3


From 364702f7551451f2fab341f1b31adf911c888375 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:34 +0200
Subject: tools/nolibc: make sys_mmap() automatically use the right __NR_mmap
 definition

__NR_mmap2 was used for i386 but it's also needed for other archs such
as RISCV32 or ARM. Let's decide to use it based on the __NR_mmap2
definition as it's not defined on other archs.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/include/nolibc/sys.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h
index b8c96878c9ce..ce3ee03aa679 100644
--- a/tools/include/nolibc/sys.h
+++ b/tools/include/nolibc/sys.h
@@ -697,7 +697,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd,
 
 	int n;
 
-#if defined(__i386__)
+#if defined(__NR_mmap2)
 	n = __NR_mmap2;
 	offset >>= 12;
 #else
-- 
cgit v1.2.3


From 362aecb2d8cfad0268d6c0ae5f448e9b6eee7ffb Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:35 +0200
Subject: selftests/nolibc: add basic infrastructure to ease creation of nolibc
 tests

This creates a "nolibc" selftest that intends to test various parts of
the nolibc component, both in terms of build and execution for a given
architecture.

The aim is for it to be as simple to run as a kernel build, by just
passing the compiler (for the build) and the ARCH (for kernel and
execution).

It brings a basic squeleton made of a single C file that will ease testing
and error reporting. The code will be arranged so that it remains easy to
add basic tests for syscalls or library calls that may rely on a condition
to be executed, and whose result is compared to a value or to an error
with a specific errno value.

Tests will just use a relative line number in switch/case statements as
an index, saving the user from having to maintain arrays and complicated
functions which can often just be one-liners.

MAINTAINERS was updated.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 MAINTAINERS                                  |   1 +
 tools/testing/selftests/nolibc/Makefile      |  43 +++
 tools/testing/selftests/nolibc/nolibc-test.c | 395 +++++++++++++++++++++++++++
 3 files changed, 439 insertions(+)
 create mode 100644 tools/testing/selftests/nolibc/Makefile
 create mode 100644 tools/testing/selftests/nolibc/nolibc-test.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 8a5012ba6ff9..89f939ad1996 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14444,6 +14444,7 @@ M:	Willy Tarreau <w@1wt.eu>
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/wtarreau/nolibc.git
 F:	tools/include/nolibc/
+F:	tools/testing/selftests/nolibc/
 
 NSDEPS
 M:	Matthias Maennich <maennich@google.com>
diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
new file mode 100644
index 000000000000..fd0a67082334
--- /dev/null
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for nolibc tests
+include ../../../scripts/Makefile.include
+
+# we're in ".../tools/testing/selftests/nolibc"
+ifeq ($(srctree),)
+srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR)))
+endif
+
+ifeq ($(ARCH),)
+include $(srctree)/scripts/subarch.include
+ARCH = $(SUBARCH)
+endif
+
+# OUTPUT is only set when run from the main makefile, otherwise
+# it defaults to this nolibc directory.
+OUTPUT ?= $(CURDIR)/
+
+ifeq ($(V),1)
+Q=
+else
+Q=@
+endif
+
+CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables
+LDFLAGS := -s
+
+all: nolibc-test
+
+nolibc-test: nolibc-test.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
+	  -nostdlib -static -include ../../../include/nolibc/nolibc.h $^ -lgcc
+
+initramfs: nolibc-test
+	$(QUIET_MKDIR)mkdir -p initramfs
+	$(call QUIET_INSTALL, initramfs/init)
+	$(Q)cp nolibc-test initramfs/init
+
+clean:
+	$(call QUIET_CLEAN, nolibc-test)
+	$(Q)rm -f nolibc-test
+	$(call QUIET_CLEAN, initramfs)
+	$(Q)rm -rf initramfs
diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
new file mode 100644
index 000000000000..6c050d4381fe
--- /dev/null
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* platform-specific include files coming from the compiler */
+#include <limits.h>
+
+/* libc-specific include files
+ * The program may be built in 2 ways:
+ *   $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined
+ *   $(CC) -nostdlib -I/path/to/nolibc/sysroot
+ */
+#ifndef NOLIBC
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+/* will be used by nolibc by getenv() */
+char **environ;
+
+#define CASE_ERR(err) \
+	case err: return #err
+
+/* returns the error name (e.g. "ENOENT") for common errors, "SUCCESS" for 0,
+ * or the decimal value for less common ones.
+ */
+const char *errorname(int err)
+{
+	switch (err) {
+	case 0: return "SUCCESS";
+	CASE_ERR(EPERM);
+	CASE_ERR(ENOENT);
+	CASE_ERR(ESRCH);
+	CASE_ERR(EINTR);
+	CASE_ERR(EIO);
+	CASE_ERR(ENXIO);
+	CASE_ERR(E2BIG);
+	CASE_ERR(ENOEXEC);
+	CASE_ERR(EBADF);
+	CASE_ERR(ECHILD);
+	CASE_ERR(EAGAIN);
+	CASE_ERR(ENOMEM);
+	CASE_ERR(EACCES);
+	CASE_ERR(EFAULT);
+	CASE_ERR(ENOTBLK);
+	CASE_ERR(EBUSY);
+	CASE_ERR(EEXIST);
+	CASE_ERR(EXDEV);
+	CASE_ERR(ENODEV);
+	CASE_ERR(ENOTDIR);
+	CASE_ERR(EISDIR);
+	CASE_ERR(EINVAL);
+	CASE_ERR(ENFILE);
+	CASE_ERR(EMFILE);
+	CASE_ERR(ENOTTY);
+	CASE_ERR(ETXTBSY);
+	CASE_ERR(EFBIG);
+	CASE_ERR(ENOSPC);
+	CASE_ERR(ESPIPE);
+	CASE_ERR(EROFS);
+	CASE_ERR(EMLINK);
+	CASE_ERR(EPIPE);
+	CASE_ERR(EDOM);
+	CASE_ERR(ERANGE);
+	CASE_ERR(ENOSYS);
+	default:
+		return itoa(err);
+	}
+}
+
+static int pad_spc(int llen, int cnt, const char *fmt, ...)
+{
+	va_list args;
+	int len;
+	int ret;
+
+	for (len = 0; len < cnt - llen; len++)
+		putchar(' ');
+
+	va_start(args, fmt);
+	ret = vfprintf(stdout, fmt, args);
+	va_end(args);
+	return ret < 0 ? ret : ret + len;
+}
+
+/* The tests below are intended to be used by the macroes, which evaluate
+ * expression <expr>, print the status to stdout, and update the "ret"
+ * variable to count failures. The functions themselves return the number
+ * of failures, thus either 0 or 1.
+ */
+
+#define EXPECT_ZR(cond, expr)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_zr(expr, llen); } while (0)
+
+static int expect_zr(int expr, int llen)
+{
+	int ret = !(expr == 0);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_NZ(cond, expr, val)			\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_nz(expr, llen; } while (0)
+
+static int expect_nz(int expr, int llen)
+{
+	int ret = !(expr != 0);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_EQ(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_eq(expr, llen, val); } while (0)
+
+static int expect_eq(int expr, int llen, int val)
+{
+	int ret = !(expr == val);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_NE(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ne(expr, llen, val); } while (0)
+
+static int expect_ne(int expr, int llen, int val)
+{
+	int ret = !(expr != val);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_GE(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ge(expr, llen, val); } while (0)
+
+static int expect_ge(int expr, int llen, int val)
+{
+	int ret = !(expr >= val);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_GT(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_gt(expr, llen, val); } while (0)
+
+static int expect_gt(int expr, int llen, int val)
+{
+	int ret = !(expr > val);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_LE(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_le(expr, llen, val); } while (0)
+
+static int expect_le(int expr, int llen, int val)
+{
+	int ret = !(expr <= val);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_LT(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_lt(expr, llen, val); } while (0)
+
+static int expect_lt(int expr, int llen, int val)
+{
+	int ret = !(expr < val);
+
+	llen += printf(" = %d ", expr);
+	pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n");
+	return ret;
+}
+
+
+#define EXPECT_SYSZR(cond, expr)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syszr(expr, llen); } while (0)
+
+static int expect_syszr(int expr, int llen)
+{
+	int ret = 0;
+
+	if (expr) {
+		ret = 1;
+		llen += printf(" = %d %s ", expr, errorname(errno));
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += printf(" = %d ", expr);
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_SYSEQ(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syseq(expr, llen, val); } while (0)
+
+static int expect_syseq(int expr, int llen, int val)
+{
+	int ret = 0;
+
+	if (expr != val) {
+		ret = 1;
+		llen += printf(" = %d %s ", expr, errorname(errno));
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += printf(" = %d ", expr);
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_SYSNE(cond, expr, val)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_sysne(expr, llen, val); } while (0)
+
+static int expect_sysne(int expr, int llen, int val)
+{
+	int ret = 0;
+
+	if (expr == val) {
+		ret = 1;
+		llen += printf(" = %d %s ", expr, errorname(errno));
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += printf(" = %d ", expr);
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_SYSER(cond, expr, expret, experr)			\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syserr(expr, expret, experr, llen); } while (0)
+
+static int expect_syserr(int expr, int expret, int experr, int llen)
+{
+	int ret = 0;
+	int _errno = errno;
+
+	llen += printf(" = %d %s ", expr, errorname(_errno));
+	if (expr != expret || _errno != experr) {
+		ret = 1;
+		llen += printf(" != (%d %s) ", expret, errorname(experr));
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_PTRZR(cond, expr)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrzr(expr, llen); } while (0)
+
+static int expect_ptrzr(const void *expr, int llen)
+{
+	int ret = 0;
+
+	llen += printf(" = <%p> ", expr);
+	if (expr) {
+		ret = 1;
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_PTRNZ(cond, expr)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrnz(expr, llen); } while (0)
+
+static int expect_ptrnz(const void *expr, int llen)
+{
+	int ret = 0;
+
+	llen += printf(" = <%p> ", expr);
+	if (!expr) {
+		ret = 1;
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_STRZR(cond, expr)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strzr(expr, llen); } while (0)
+
+static int expect_strzr(const char *expr, int llen)
+{
+	int ret = 0;
+
+	llen += printf(" = <%s> ", expr);
+	if (expr) {
+		ret = 1;
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_STRNZ(cond, expr)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strnz(expr, llen); } while (0)
+
+static int expect_strnz(const char *expr, int llen)
+{
+	int ret = 0;
+
+	llen += printf(" = <%s> ", expr);
+	if (!expr) {
+		ret = 1;
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_STREQ(cond, expr, cmp)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_streq(expr, llen, cmp); } while (0)
+
+static int expect_streq(const char *expr, int llen, const char *cmp)
+{
+	int ret = 0;
+
+	llen += printf(" = <%s> ", expr);
+	if (strcmp(expr, cmp) != 0) {
+		ret = 1;
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+
+#define EXPECT_STRNE(cond, expr, cmp)				\
+	do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strne(expr, llen, cmp); } while (0)
+
+static int expect_strne(const char *expr, int llen, const char *cmp)
+{
+	int ret = 0;
+
+	llen += printf(" = <%s> ", expr);
+	if (strcmp(expr, cmp) == 0) {
+		ret = 1;
+		llen += pad_spc(llen, 40, "[FAIL]\n");
+	} else {
+		llen += pad_spc(llen, 40, " [OK]\n");
+	}
+	return ret;
+}
+
+/* declare tests based on line numbers. There must be exactly one test per line. */
+#define CASE_TEST(name) \
+	case __LINE__: llen += printf("%d %s", test, #name);
+
+
+int main(int argc, char **argv, char **envp)
+{
+	int min = 0;
+	int max = __INT_MAX__;
+	int ret = 0;
+
+	environ = envp;
+
+	printf("Total number of errors: %d\n", ret);
+	printf("Exiting with status %d\n", !!ret);
+	return !!ret;
+}
-- 
cgit v1.2.3


From 23da7bc923609bc90b6989c6c6a17e74c8f433ed Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:36 +0200
Subject: selftests/nolibc: support a test definition format

It now becomes possible to pass a string either in argv[1] or in the
NOLIBC_TEST environment variable (the former having precedence), to
specify which tests to run. The format is:

   testname[:range]*[,testname...]

Where a range is either a single value or the min and max numbers of the
test IDs in a sequence, delimited by a dash. Multiple ranges are possible.
This should provide enough flexibility to focus on certain failing parts
just by playing with the boot command line in a boot loader or in qemu
depending on what is accessible.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 91 ++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 6c050d4381fe..49177ea9943c 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -17,6 +17,12 @@
 /* will be used by nolibc by getenv() */
 char **environ;
 
+/* definition of a series of tests */
+struct test {
+	const char *name;              // test name
+	int (*func)(int min, int max); // handler
+};
+
 #define CASE_ERR(err) \
 	case err: return #err
 
@@ -376,19 +382,104 @@ static int expect_strne(const char *expr, int llen, const char *cmp)
 	return ret;
 }
 
+
 /* declare tests based on line numbers. There must be exactly one test per line. */
 #define CASE_TEST(name) \
 	case __LINE__: llen += printf("%d %s", test, #name);
 
 
+/* This is the definition of known test names, with their functions */
+static struct test test_names[] = {
+	/* add new tests here */
+	{ 0 }
+};
+
 int main(int argc, char **argv, char **envp)
 {
 	int min = 0;
 	int max = __INT_MAX__;
 	int ret = 0;
+	int err;
+	int idx;
+	char *test;
 
 	environ = envp;
 
+	/* the definition of a series of tests comes from either argv[1] or the
+	 * "NOLIBC_TEST" environment variable. It's made of a comma-delimited
+	 * series of test names and optional ranges:
+	 *    syscall:5-15[:.*],stdlib:8-10
+	 */
+	test = argv[1];
+	if (!test)
+		test = getenv("NOLIBC_TEST");
+
+	if (test) {
+		char *comma, *colon, *dash, *value;
+
+		do {
+			comma = strchr(test, ',');
+			if (comma)
+				*(comma++) = '\0';
+
+			colon = strchr(test, ':');
+			if (colon)
+				*(colon++) = '\0';
+
+			for (idx = 0; test_names[idx].name; idx++) {
+				if (strcmp(test, test_names[idx].name) == 0)
+					break;
+			}
+
+			if (test_names[idx].name) {
+				/* The test was named, it will be called at least
+				 * once. We may have an optional range at <colon>
+				 * here, which defaults to the full range.
+				 */
+				do {
+					min = 0; max = __INT_MAX__;
+					value = colon;
+					if (value && *value) {
+						colon = strchr(value, ':');
+						if (colon)
+							*(colon++) = '\0';
+
+						dash = strchr(value, '-');
+						if (dash)
+							*(dash++) = '\0';
+
+						/* support :val: :min-max: :min-: :-max: */
+						if (*value)
+							min = atoi(value);
+						if (!dash)
+							max = min;
+						else if (*dash)
+							max = atoi(dash);
+
+						value = colon;
+					}
+
+					/* now's time to call the test */
+					printf("Running test '%s'\n", test_names[idx].name);
+					err = test_names[idx].func(min, max);
+					ret += err;
+					printf("Errors during this test: %d\n\n", err);
+				} while (colon && *colon);
+			} else
+				printf("Ignoring unknown test name '%s'\n", test);
+
+			test = comma;
+		} while (test && *test);
+	} else {
+		/* no test mentioned, run everything */
+		for (idx = 0; test_names[idx].name; idx++) {
+			printf("Running test '%s'\n", test_names[idx].name);
+			err = test_names[idx].func(min, max);
+			ret += err;
+			printf("Errors during this test: %d\n\n", err);
+		}
+	}
+
 	printf("Total number of errors: %d\n", ret);
 	printf("Exiting with status %d\n", !!ret);
 	return !!ret;
-- 
cgit v1.2.3


From b4844fa0bdb4075ad842e89d6c41e3d0c6124848 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:37 +0200
Subject: selftests/nolibc: implement a few tests for various syscalls

This adds 63 tests covering about 34 syscalls. Both successes and
failures are tested. Two tests fail when run as unprivileged user
(link_dir which returns EACCESS instead of EPERM, and chroot which
returns EPERM). One test (execve("/")) expects to fail on EACCESS,
but needs to have valid arguments otherwise the kernel will log a
message. And a few tests require /proc to be mounted.

The code is not pretty since all tests are one-liners, sometimes
resulting in long lines, especially when using compount statements to
preset a line, but it's convenient and doesn't obfuscate the code,
which is important to understand what failed.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 110 +++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 49177ea9943c..dc87832912ce 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -388,9 +388,119 @@ static int expect_strne(const char *expr, int llen, const char *cmp)
 	case __LINE__: llen += printf("%d %s", test, #name);
 
 
+/* used by some syscall tests below */
+int test_getdents64(const char *dir)
+{
+	char buffer[4096];
+	int fd, ret;
+	int err;
+
+	ret = fd = open(dir, O_RDONLY | O_DIRECTORY, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = getdents64(fd, (void *)buffer, sizeof(buffer));
+	err = errno;
+	close(fd);
+
+	errno = err;
+	return ret;
+}
+
+/* Run syscall tests between IDs <min> and <max>.
+ * Return 0 on success, non-zero on failure.
+ */
+int run_syscall(int min, int max)
+{
+	struct stat stat_buf;
+	int test;
+	int tmp;
+	int ret = 0;
+	void *p1, *p2;
+
+	for (test = min; test >= 0 && test <= max; test++) {
+		int llen = 0; // line length
+
+		/* avoid leaving empty lines below, this will insert holes into
+		 * test numbers.
+		 */
+		switch (test + __LINE__ + 1) {
+		CASE_TEST(getpid);            EXPECT_SYSNE(1, getpid(), -1); break;
+		CASE_TEST(getppid);           EXPECT_SYSNE(1, getppid(), -1); break;
+		CASE_TEST(gettid);            EXPECT_SYSNE(1, gettid(), -1); break;
+		CASE_TEST(getpgid_self);      EXPECT_SYSNE(1, getpgid(0), -1); break;
+		CASE_TEST(getpgid_bad);       EXPECT_SYSER(1, getpgid(-1), -1, ESRCH); break;
+		CASE_TEST(kill_0);            EXPECT_SYSZR(1, kill(getpid(), 0)); break;
+		CASE_TEST(kill_CONT);         EXPECT_SYSZR(1, kill(getpid(), 0)); break;
+		CASE_TEST(kill_BADPID);       EXPECT_SYSER(1, kill(INT_MAX, 0), -1, ESRCH); break;
+		CASE_TEST(sbrk);              if ((p1 = p2 = sbrk(4096)) != (void *)-1) p2 = sbrk(-4096); EXPECT_SYSZR(1, (p2 == (void *)-1) || p2 == p1); break;
+		CASE_TEST(brk);               EXPECT_SYSZR(1, brk(sbrk(0))); break;
+		CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); break;
+		CASE_TEST(chdir_dot);         EXPECT_SYSZR(1, chdir(".")); break;
+		CASE_TEST(chdir_blah);        EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break;
+		CASE_TEST(chmod_net);         EXPECT_SYSZR(1, chmod("/proc/self/net", 0555)); break;
+		CASE_TEST(chmod_self);        EXPECT_SYSER(1, chmod("/proc/self", 0555), -1, EPERM); break;
+		CASE_TEST(chown_self);        EXPECT_SYSER(1, chown("/proc/self", 0, 0), -1, EPERM); break;
+		CASE_TEST(chroot_root);       EXPECT_SYSZR(1, chroot("/")); break;
+		CASE_TEST(chroot_blah);       EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break;
+		CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot("/proc/self/exe"), -1, ENOTDIR); break;
+		CASE_TEST(close_m1);          EXPECT_SYSER(1, close(-1), -1, EBADF); break;
+		CASE_TEST(close_dup);         EXPECT_SYSZR(1, close(dup(0))); break;
+		CASE_TEST(dup_0);             tmp = dup(0);  EXPECT_SYSNE(1, tmp, -1); close(tmp); break;
+		CASE_TEST(dup_m1);            tmp = dup(-1); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break;
+		CASE_TEST(dup2_0);            tmp = dup2(0, 100);  EXPECT_SYSNE(1, tmp, -1); close(tmp); break;
+		CASE_TEST(dup2_m1);           tmp = dup2(-1, 100); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break;
+		CASE_TEST(dup3_0);            tmp = dup3(0, 100, 0);  EXPECT_SYSNE(1, tmp, -1); close(tmp); break;
+		CASE_TEST(dup3_m1);           tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break;
+		CASE_TEST(execve_root);       EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break;
+		CASE_TEST(getdents64_root);   EXPECT_SYSNE(1, test_getdents64("/"), -1); break;
+		CASE_TEST(getdents64_null);   EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break;
+		CASE_TEST(gettimeofday_null); EXPECT_SYSZR(1, gettimeofday(NULL, NULL)); break;
+		CASE_TEST(gettimeofday_bad1); EXPECT_SYSER(1, gettimeofday((void *)1, NULL), -1, EFAULT); break;
+		CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break;
+		CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break;
+		CASE_TEST(ioctl_tiocinq);     EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break;
+		CASE_TEST(ioctl_tiocinq);     EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break;
+		CASE_TEST(link_root1);        EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break;
+		CASE_TEST(link_blah);         EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break;
+		CASE_TEST(link_dir);          EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break;
+		CASE_TEST(link_cross);        EXPECT_SYSER(1, link("/proc/self/net", "/blah"), -1, EXDEV); break;
+		CASE_TEST(lseek_m1);          EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break;
+		CASE_TEST(lseek_0);           EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break;
+		CASE_TEST(mkdir_root);        EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break;
+		CASE_TEST(open_tty);          EXPECT_SYSNE(1, tmp = open("/dev/null", 0), -1); if (tmp != -1) close(tmp); break;
+		CASE_TEST(open_blah);         EXPECT_SYSER(1, tmp = open("/proc/self/blah", 0), -1, ENOENT); if (tmp != -1) close(tmp); break;
+		CASE_TEST(poll_null);         EXPECT_SYSZR(1, poll(NULL, 0, 0)); break;
+		CASE_TEST(poll_stdout);       EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break;
+		CASE_TEST(poll_fault);        EXPECT_SYSER(1, poll((void *)1, 1, 0), -1, EFAULT); break;
+		CASE_TEST(read_badf);         EXPECT_SYSER(1, read(-1, &tmp, 1), -1, EBADF); break;
+		CASE_TEST(sched_yield);       EXPECT_SYSZR(1, sched_yield()); break;
+		CASE_TEST(select_null);       EXPECT_SYSZR(1, ({ struct timeval tv = { 0 }; select(0, NULL, NULL, NULL, &tv); })); break;
+		CASE_TEST(select_stdout);     EXPECT_SYSNE(1, ({ fd_set fds; FD_ZERO(&fds); FD_SET(1, &fds); select(2, NULL, &fds, NULL, NULL); }), -1); break;
+		CASE_TEST(select_fault);      EXPECT_SYSER(1, select(1, (void *)1, NULL, NULL, 0), -1, EFAULT); break;
+		CASE_TEST(stat_blah);         EXPECT_SYSER(1, stat("/proc/self/blah", &stat_buf), -1, ENOENT); break;
+		CASE_TEST(stat_fault);        EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break;
+		CASE_TEST(symlink_root);      EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break;
+		CASE_TEST(unlink_root);       EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break;
+		CASE_TEST(unlink_blah);       EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break;
+		CASE_TEST(wait_child);        EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break;
+		CASE_TEST(waitpid_min);       EXPECT_SYSER(1, waitpid(INT_MIN, &tmp, WNOHANG), -1, ESRCH); break;
+		CASE_TEST(waitpid_child);     EXPECT_SYSER(1, waitpid(getpid(), &tmp, WNOHANG), -1, ECHILD); break;
+		CASE_TEST(write_badf);        EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break;
+		CASE_TEST(write_zero);        EXPECT_SYSZR(1, write(1, &tmp, 0)); break;
+		case __LINE__:
+			return ret; /* must be last */
+		/* note: do not set any defaults so as to permit holes above */
+		}
+	}
+	return ret;
+}
+
+
 /* This is the definition of known test names, with their functions */
 static struct test test_names[] = {
 	/* add new tests here */
+	{ .name = "syscall",   .func = run_syscall  },
 	{ 0 }
 };
 
-- 
cgit v1.2.3


From 95bc989488eb7150949cf2fcce3f486741d1c57f Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:38 +0200
Subject: selftests/nolibc: add a few tests for some libc functions

The test series called "stdlib" covers some libc functions (string,
stdlib etc). By default they are automatically run after "syscall"
but may be requested in argument or in variable NOLIBC_TEST.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index dc87832912ce..b928f099431f 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -496,11 +496,46 @@ int run_syscall(int min, int max)
 	return ret;
 }
 
+int run_stdlib(int min, int max)
+{
+	int test;
+	int tmp;
+	int ret = 0;
+	void *p1, *p2;
+
+	for (test = min; test >= 0 && test <= max; test++) {
+		int llen = 0; // line length
+
+		/* avoid leaving empty lines below, this will insert holes into
+		 * test numbers.
+		 */
+		switch (test + __LINE__ + 1) {
+		CASE_TEST(getenv_TERM);        EXPECT_STRNZ(1, getenv("TERM")); break;
+		CASE_TEST(getenv_blah);        EXPECT_STRZR(1, getenv("blah")); break;
+		CASE_TEST(setcmp_blah_blah);   EXPECT_EQ(1, strcmp("blah", "blah"), 0); break;
+		CASE_TEST(setcmp_blah_blah2);  EXPECT_NE(1, strcmp("blah", "blah2"), 0); break;
+		CASE_TEST(setncmp_blah_blah);  EXPECT_EQ(1, strncmp("blah", "blah", 10), 0); break;
+		CASE_TEST(setncmp_blah_blah4); EXPECT_EQ(1, strncmp("blah", "blah4", 4), 0); break;
+		CASE_TEST(setncmp_blah_blah5); EXPECT_NE(1, strncmp("blah", "blah5", 5), 0); break;
+		CASE_TEST(setncmp_blah_blah6); EXPECT_NE(1, strncmp("blah", "blah6", 6), 0); break;
+		CASE_TEST(strchr_foobar_o);    EXPECT_STREQ(1, strchr("foobar", 'o'), "oobar"); break;
+		CASE_TEST(strchr_foobar_z);    EXPECT_STRZR(1, strchr("foobar", 'z')); break;
+		CASE_TEST(strrchr_foobar_o);   EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break;
+		CASE_TEST(strrchr_foobar_z);   EXPECT_STRZR(1, strrchr("foobar", 'z')); break;
+		case __LINE__:
+			return ret; /* must be last */
+		/* note: do not set any defaults so as to permit holes above */
+		}
+	}
+	return ret;
+}
+
 
 /* This is the definition of known test names, with their functions */
 static struct test test_names[] = {
 	/* add new tests here */
 	{ .name = "syscall",   .func = run_syscall  },
+	{ .name = "stdlib",    .func = run_stdlib   },
 	{ 0 }
 };
 
-- 
cgit v1.2.3


From f49896d7d9a69b7565a59f5085c78ca1c08c7dd7 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:40 +0200
Subject: selftests/nolibc: exit with poweroff on success when getpid() == 1

The idea is to ease automated testing under qemu. If the test succeeds
while running as PID 1, indicating the system was booted with init=/test,
let's just power off so that qemu can exit with a successful code. In
other situations it will exit and provoke a panic, which may be caught
for example with CONFIG_PVPANIC.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index b928f099431f..291d96bfd7c1 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -626,6 +626,20 @@ int main(int argc, char **argv, char **envp)
 	}
 
 	printf("Total number of errors: %d\n", ret);
+
+	if (getpid() == 1) {
+		/* we're running as init, there's no other process on the
+		 * system, thus likely started from a VM for a quick check.
+		 * Exiting will provoke a kernel panic that may be reported
+		 * as an error by Qemu or the hypervisor, while stopping
+		 * cleanly will often be reported as a success. This allows
+		 * to use the output of this program for bisecting kernels.
+		 */
+		printf("Leaving init with final status: %d\n", !!ret);
+		if (ret == 0)
+			reboot(LINUX_REBOOT_CMD_POWER_OFF);
+	}
+
 	printf("Exiting with status %d\n", !!ret);
 	return !!ret;
 }
-- 
cgit v1.2.3


From aa73a86cda26705c7f0af1afe9bb255a52accf87 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:41 +0200
Subject: selftests/nolibc: on x86, support exiting with isa-debug-exit

QEMU, when started with "-device isa-debug-exit -no-reboot" will exit
with status code 2N+1 when N is written to 0x501. This is particularly
convenient for automated tests but this is not portable. As such we
only enable this on x86_64 when pid==1. In addition, this requires an
ioperm() call but in order not to have to define arch-specific syscalls
we just perform the syscall by hand there.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 291d96bfd7c1..eeb254749239 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -638,6 +638,15 @@ int main(int argc, char **argv, char **envp)
 		printf("Leaving init with final status: %d\n", !!ret);
 		if (ret == 0)
 			reboot(LINUX_REBOOT_CMD_POWER_OFF);
+#if defined(__x86_64__)
+		/* QEMU started with "-device isa-debug-exit -no-reboot" will
+		 * exit with status code 2N+1 when N is written to 0x501. We
+		 * hard-code the syscall here as it's arch-dependent.
+		 */
+		else if (my_syscall3(__NR_ioperm, 0x501, 1, 1) == 0)
+			asm volatile ("outb %%al, %%dx" :: "d"(0x501), "a"(0));
+		/* if it does nothing, fall back to the regular panic */
+#endif
 	}
 
 	printf("Exiting with status %d\n", !!ret);
-- 
cgit v1.2.3


From 1a5454f625997049d886d8c3dae8e8de2a553125 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:42 +0200
Subject: selftests/nolibc: recreate and populate /dev and /proc if missing

Most of the time the program will be run alone in an initramfs. There
is no value in requiring the user to populate /dev and /proc for such
tests, we can do it ourselves, and it participates to the tests at the
same time.

What's done here is that when called as init (getpid()==1) we check
if /dev exists or create it, if /dev/console and /dev/null exists,
otherwise we try to mount a devtmpfs there, and if it fails we fall
back to mknod. The console is reopened if stdout was closed. Finally
/proc is created and mounted if /proc/self cannot be found. This is
sufficient for most tests.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 56 ++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index eeb254749239..a697182c87f5 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -530,6 +530,54 @@ int run_stdlib(int min, int max)
 	return ret;
 }
 
+/* prepare what needs to be prepared for pid 1 (stdio, /dev, /proc, etc) */
+int prepare(void)
+{
+	struct stat stat_buf;
+
+	/* It's possible that /dev doesn't even exist or was not mounted, so
+	 * we'll try to create it, mount it, or create minimal entries into it.
+	 * We want at least /dev/null and /dev/console.
+	 */
+	if (stat("/dev/.", &stat_buf) == 0 || mkdir("/dev", 0755) == 0) {
+		if (stat("/dev/console", &stat_buf) != 0 ||
+		    stat("/dev/null", &stat_buf) != 0) {
+			/* try devtmpfs first, otherwise fall back to manual creation */
+			if (mount("/dev", "/dev", "devtmpfs", 0, 0) != 0) {
+				mknod("/dev/console", 0600 | S_IFCHR, makedev(5, 1));
+				mknod("/dev/null",    0666 | S_IFCHR, makedev(1, 3));
+			}
+		}
+	}
+
+	/* If no /dev/console was found before calling init, stdio is closed so
+	 * we need to reopen it from /dev/console. If it failed above, it will
+	 * still fail here and we cannot emit a message anyway.
+	 */
+	if (close(dup(1)) == -1) {
+		int fd = open("/dev/console", O_RDWR);
+
+		if (fd >= 0) {
+			if (fd != 0)
+				dup2(fd, 0);
+			if (fd != 1)
+				dup2(fd, 1);
+			if (fd != 2)
+				dup2(fd, 2);
+			if (fd > 2)
+				close(fd);
+			puts("\nSuccessfully reopened /dev/console.");
+		}
+	}
+
+	/* try to mount /proc if not mounted. Silently fail otherwise */
+	if (stat("/proc/.", &stat_buf) == 0 || mkdir("/proc", 0755) == 0) {
+		if (stat("/proc/self", &stat_buf) != 0)
+			mount("/proc", "/proc", "proc", 0, 0);
+	}
+
+	return 0;
+}
 
 /* This is the definition of known test names, with their functions */
 static struct test test_names[] = {
@@ -550,6 +598,14 @@ int main(int argc, char **argv, char **envp)
 
 	environ = envp;
 
+	/* when called as init, it's possible that no console was opened, for
+	 * example if no /dev file system was provided. We'll check that fd#1
+	 * was opened, and if not we'll attempt to create and open /dev/console
+	 * and /dev/null that we'll use for later tests.
+	 */
+	if (getpid() == 1)
+		prepare();
+
 	/* the definition of a series of tests comes from either argv[1] or the
 	 * "NOLIBC_TEST" environment variable. It's made of a comma-delimited
 	 * series of test names and optional ranges:
-- 
cgit v1.2.3


From 7172f1c6854cb424e3be3401e3df34c1c38cffc2 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:43 +0200
Subject: selftests/nolibc: condition some tests on /proc existence

If /proc is not available (program run inside a chroot or without
sufficient permissions), it's better to disable the associated tests.
Some will be preserved like the ones which check for a failure to
create some entries there since they're still supposed to fail.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index a697182c87f5..662dea691749 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -413,11 +413,15 @@ int test_getdents64(const char *dir)
 int run_syscall(int min, int max)
 {
 	struct stat stat_buf;
+	int proc;
 	int test;
 	int tmp;
 	int ret = 0;
 	void *p1, *p2;
 
+	/* <proc> indicates whether or not /proc is mounted */
+	proc = stat("/proc", &stat_buf) == 0;
+
 	for (test = min; test >= 0 && test <= max; test++) {
 		int llen = 0; // line length
 
@@ -438,12 +442,12 @@ int run_syscall(int min, int max)
 		CASE_TEST(chdir_root);        EXPECT_SYSZR(1, chdir("/")); break;
 		CASE_TEST(chdir_dot);         EXPECT_SYSZR(1, chdir(".")); break;
 		CASE_TEST(chdir_blah);        EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break;
-		CASE_TEST(chmod_net);         EXPECT_SYSZR(1, chmod("/proc/self/net", 0555)); break;
-		CASE_TEST(chmod_self);        EXPECT_SYSER(1, chmod("/proc/self", 0555), -1, EPERM); break;
-		CASE_TEST(chown_self);        EXPECT_SYSER(1, chown("/proc/self", 0, 0), -1, EPERM); break;
+		CASE_TEST(chmod_net);         EXPECT_SYSZR(proc, chmod("/proc/self/net", 0555)); break;
+		CASE_TEST(chmod_self);        EXPECT_SYSER(proc, chmod("/proc/self", 0555), -1, EPERM); break;
+		CASE_TEST(chown_self);        EXPECT_SYSER(proc, chown("/proc/self", 0, 0), -1, EPERM); break;
 		CASE_TEST(chroot_root);       EXPECT_SYSZR(1, chroot("/")); break;
 		CASE_TEST(chroot_blah);       EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break;
-		CASE_TEST(chroot_exe);        EXPECT_SYSER(1, chroot("/proc/self/exe"), -1, ENOTDIR); break;
+		CASE_TEST(chroot_exe);        EXPECT_SYSER(proc, chroot("/proc/self/exe"), -1, ENOTDIR); break;
 		CASE_TEST(close_m1);          EXPECT_SYSER(1, close(-1), -1, EBADF); break;
 		CASE_TEST(close_dup);         EXPECT_SYSZR(1, close(dup(0))); break;
 		CASE_TEST(dup_0);             tmp = dup(0);  EXPECT_SYSNE(1, tmp, -1); close(tmp); break;
@@ -464,7 +468,7 @@ int run_syscall(int min, int max)
 		CASE_TEST(link_root1);        EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break;
 		CASE_TEST(link_blah);         EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break;
 		CASE_TEST(link_dir);          EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break;
-		CASE_TEST(link_cross);        EXPECT_SYSER(1, link("/proc/self/net", "/blah"), -1, EXDEV); break;
+		CASE_TEST(link_cross);        EXPECT_SYSER(proc, link("/proc/self/net", "/blah"), -1, EXDEV); break;
 		CASE_TEST(lseek_m1);          EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break;
 		CASE_TEST(lseek_0);           EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break;
 		CASE_TEST(mkdir_root);        EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break;
-- 
cgit v1.2.3


From 1da02f510882cd5684dc04dc7119056e01da90bd Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:44 +0200
Subject: selftests/nolibc: support glibc as well

Adding support for glibc can be useful to distinguish between bugs in
nolibc and bugs in the kernel when a syscall reports an unusual value.

It's not that much work and should not affect the long term
maintainability of the tests. The necessary changes can essentially be
summed up like this:
  - set _GNU_SOURCE a the top to access some definitions
  - many includes added when we know we don't come from nolibc (missing
    the stdio include guard)
  - disable gettid() which is not exposed by glibc
  - disable gettimeofday's support of bad pointers since these  crash
    in glibc
  - add a simple itoa() for errorname(); strerror() is too verbose (no
    way to get short messages). strerrorname_np() was added in modern
    glibc (2.32) to do exactly this but that 's too recent to be usable
    as the default fallback.
  - use the standard ioperm() definition. May be we need to implement
    ioperm() in nolibc if that's useful.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/nolibc-test.c | 47 ++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c
index 662dea691749..78bced95ac63 100644
--- a/tools/testing/selftests/nolibc/nolibc-test.c
+++ b/tools/testing/selftests/nolibc/nolibc-test.c
@@ -1,17 +1,41 @@
 // SPDX-License-Identifier: GPL-2.0
 
+#define _GNU_SOURCE
+
 /* platform-specific include files coming from the compiler */
 #include <limits.h>
 
 /* libc-specific include files
- * The program may be built in 2 ways:
+ * The program may be built in 3 ways:
  *   $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined
- *   $(CC) -nostdlib -I/path/to/nolibc/sysroot
+ *   $(CC) -nostdlib -I/path/to/nolibc/sysroot  => _NOLIBC_* guards are present
+ *   $(CC) with default libc                    => NOLIBC* never defined
  */
 #ifndef NOLIBC
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#ifndef _NOLIBC_STDIO_H
+/* standard libcs need more includes */
+#include <linux/reboot.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/reboot.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/sysmacros.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <unistd.h>
+#endif
 #endif
 
 /* will be used by nolibc by getenv() */
@@ -23,6 +47,17 @@ struct test {
 	int (*func)(int min, int max); // handler
 };
 
+#ifndef _NOLIBC_STDLIB_H
+char *itoa(int i)
+{
+	static char buf[12];
+	int ret;
+
+	ret = snprintf(buf, sizeof(buf), "%d", i);
+	return (ret >= 0 && ret < sizeof(buf)) ? buf : "#err";
+}
+#endif
+
 #define CASE_ERR(err) \
 	case err: return #err
 
@@ -431,7 +466,9 @@ int run_syscall(int min, int max)
 		switch (test + __LINE__ + 1) {
 		CASE_TEST(getpid);            EXPECT_SYSNE(1, getpid(), -1); break;
 		CASE_TEST(getppid);           EXPECT_SYSNE(1, getppid(), -1); break;
+#ifdef NOLIBC
 		CASE_TEST(gettid);            EXPECT_SYSNE(1, gettid(), -1); break;
+#endif
 		CASE_TEST(getpgid_self);      EXPECT_SYSNE(1, getpgid(0), -1); break;
 		CASE_TEST(getpgid_bad);       EXPECT_SYSER(1, getpgid(-1), -1, ESRCH); break;
 		CASE_TEST(kill_0);            EXPECT_SYSZR(1, kill(getpid(), 0)); break;
@@ -460,9 +497,11 @@ int run_syscall(int min, int max)
 		CASE_TEST(getdents64_root);   EXPECT_SYSNE(1, test_getdents64("/"), -1); break;
 		CASE_TEST(getdents64_null);   EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break;
 		CASE_TEST(gettimeofday_null); EXPECT_SYSZR(1, gettimeofday(NULL, NULL)); break;
+#ifdef NOLIBC
 		CASE_TEST(gettimeofday_bad1); EXPECT_SYSER(1, gettimeofday((void *)1, NULL), -1, EFAULT); break;
 		CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break;
 		CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break;
+#endif
 		CASE_TEST(ioctl_tiocinq);     EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break;
 		CASE_TEST(ioctl_tiocinq);     EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break;
 		CASE_TEST(link_root1);        EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break;
@@ -703,7 +742,11 @@ int main(int argc, char **argv, char **envp)
 		 * exit with status code 2N+1 when N is written to 0x501. We
 		 * hard-code the syscall here as it's arch-dependent.
 		 */
+#if defined(_NOLIBC_SYS_H)
 		else if (my_syscall3(__NR_ioperm, 0x501, 1, 1) == 0)
+#else
+		else if (ioperm(0x501, 1, 1) == 0)
+#endif
 			asm volatile ("outb %%al, %%dx" :: "d"(0x501), "a"(0));
 		/* if it does nothing, fall back to the regular panic */
 #endif
-- 
cgit v1.2.3


From d248cabff5da2f3f2ce0ab99c1f96a15e8fb98c6 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:45 +0200
Subject: selftests/nolibc: add a "kernel" target to build the kernel with the
 initramfs

The "kernel" target rebuilds the kernel with the current config for the
selected arch, with an initramfs containing the nolibc-test utility.

Since image names depend on the architecture, the currently supported
ones are referenced and resolved based on the architecture.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index fd0a67082334..4a2ab0e73ce2 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -12,6 +12,16 @@ include $(srctree)/scripts/subarch.include
 ARCH = $(SUBARCH)
 endif
 
+# kernel image names by architecture
+IMAGE_i386    = arch/x86/boot/bzImage
+IMAGE_x86     = arch/x86/boot/bzImage
+IMAGE_arm64   = arch/arm64/boot/Image
+IMAGE_arm     = arch/arm/boot/zImage
+IMAGE_mips    = vmlinuz
+IMAGE_riscv   = arch/riscv/boot/Image
+IMAGE         = $(IMAGE_$(ARCH))
+IMAGE_NAME    = $(notdir $(IMAGE))
+
 # OUTPUT is only set when run from the main makefile, otherwise
 # it defaults to this nolibc directory.
 OUTPUT ?= $(CURDIR)/
@@ -36,6 +46,9 @@ initramfs: nolibc-test
 	$(call QUIET_INSTALL, initramfs/init)
 	$(Q)cp nolibc-test initramfs/init
 
+kernel: initramfs
+	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs
+
 clean:
 	$(call QUIET_CLEAN, nolibc-test)
 	$(Q)rm -f nolibc-test
-- 
cgit v1.2.3


From 5c43fd7954108bd138f7a1b4db4fb997ae1dc696 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:46 +0200
Subject: selftests/nolibc: add a "defconfig" target

While most archs will work fine with "make defconfig", not all will
do, and it's not always easy to remember the most suitable choice to
use for a specific architecture.

This adds a "defconfig" target to the Makefile so that one may easily
run "make -C ... defconfig" and make sure to clean and rebuild a fresh
config. This is *not* used by default because we want to preserve the
user's config by default.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 4a2ab0e73ce2..c104719eae8b 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -22,6 +22,15 @@ IMAGE_riscv   = arch/riscv/boot/Image
 IMAGE         = $(IMAGE_$(ARCH))
 IMAGE_NAME    = $(notdir $(IMAGE))
 
+# default kernel configurations that appear to be usable
+DEFCONFIG_i386    = defconfig
+DEFCONFIG_x86     = defconfig
+DEFCONFIG_arm64   = defconfig
+DEFCONFIG_arm     = multi_v7_defconfig
+DEFCONFIG_mips    = malta_defconfig
+DEFCONFIG_riscv   = defconfig
+DEFCONFIG         = $(DEFCONFIG_$(ARCH))
+
 # OUTPUT is only set when run from the main makefile, otherwise
 # it defaults to this nolibc directory.
 OUTPUT ?= $(CURDIR)/
@@ -46,6 +55,9 @@ initramfs: nolibc-test
 	$(call QUIET_INSTALL, initramfs/init)
 	$(Q)cp nolibc-test initramfs/init
 
+defconfig:
+	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare
+
 kernel: initramfs
 	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs
 
-- 
cgit v1.2.3


From 662ea60e373be8a6b8a925b237cdb93a2af353c1 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:47 +0200
Subject: selftests/nolibc: add a "run" target to start the kernel in QEMU

The "run" target will build the kernel and start it in QEMU. The
"rerun" target will not have the kernel dependency and will just try
to start QEMU. The QEMU architecture used to start the kernel is
derived from the configured ARCH. This might need to be improved
for archs which include different variants under the same name
(mips vs mipsel, +/-64, riscv32 vs riscv64). This could be tested
for i386, x86, arm, arm64, mips and riscv (the later two reporting
issues on some tests).

It is possible to pass a test specification for nolibc-test in the TEST
variable, which will be passed as-is as NOLIBC_TEST.

On success, the number of successful tests is printed. On failure, failed
lines are individually printed.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index c104719eae8b..7c1f5360f454 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -31,6 +31,27 @@ DEFCONFIG_mips    = malta_defconfig
 DEFCONFIG_riscv   = defconfig
 DEFCONFIG         = $(DEFCONFIG_$(ARCH))
 
+# optional tests to run (default = all)
+TEST =
+
+# QEMU_ARCH: arch names used by qemu
+QEMU_ARCH_i386    = i386
+QEMU_ARCH_x86     = x86_64
+QEMU_ARCH_arm64   = aarch64
+QEMU_ARCH_arm     = arm
+QEMU_ARCH_mips    = mipsel  # works with malta_defconfig
+QEMU_ARCH_riscv   = riscv64
+QEMU_ARCH         = $(QEMU_ARCH_$(ARCH))
+
+# QEMU_ARGS : some arch-specific args to pass to qemu
+QEMU_ARGS_i386    = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_x86     = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_arm64   = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_arm     = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_mips    = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS_riscv   = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)"
+QEMU_ARGS         = $(QEMU_ARGS_$(ARCH))
+
 # OUTPUT is only set when run from the main makefile, otherwise
 # it defaults to this nolibc directory.
 OUTPUT ?= $(CURDIR)/
@@ -61,8 +82,20 @@ defconfig:
 kernel: initramfs
 	$(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs
 
+# run the tests after building the kernel
+run: kernel
+	$(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out"
+	$(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed."
+
+# re-run the tests from an existing kernel
+rerun:
+	$(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out"
+	$(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed."
+
 clean:
 	$(call QUIET_CLEAN, nolibc-test)
 	$(Q)rm -f nolibc-test
 	$(call QUIET_CLEAN, initramfs)
 	$(Q)rm -rf initramfs
+	$(call QUIET_CLEAN, run.out)
+	$(Q)rm -rf run.out
-- 
cgit v1.2.3


From b25c5284db0a481a427c9838f32662587d947d27 Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:48 +0200
Subject: selftests/nolibc: "sysroot" target installs a local copy of the
 sysroot

It's not convenient to rely on a sysroot built in another directory,
especially when running cross-compilation tests, where one has to
switch back and forth between directories.

Let's make it possible to install the sysroot directly in the test
directory. It's not big and even benefits from being copied by arch
so that it's easier to switch between archs if needed. The new
"sysroot" target does this, it just calls "headers_standalone" from
nolibc to install the sysroot right here.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 7c1f5360f454..210f5369fdfc 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -67,9 +67,16 @@ LDFLAGS := -s
 
 all: nolibc-test
 
-nolibc-test: nolibc-test.c
+sysroot: sysroot/$(ARCH)/include
+
+sysroot/$(ARCH)/include:
+	$(QUIET_MKDIR)mkdir -p sysroot
+	$(Q)$(MAKE) -C ../../../include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone
+	$(Q)mv sysroot/sysroot sysroot/$(ARCH)
+
+nolibc-test: nolibc-test.c sysroot/$(ARCH)/include
 	$(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \
-	  -nostdlib -static -include ../../../include/nolibc/nolibc.h $^ -lgcc
+	  -nostdlib -static -Isysroot/$(ARCH)/include $< -lgcc
 
 initramfs: nolibc-test
 	$(QUIET_MKDIR)mkdir -p initramfs
@@ -93,6 +100,8 @@ rerun:
 	$(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed."
 
 clean:
+	$(call QUIET_CLEAN, sysroot)
+	$(Q)rm -rf sysroot
 	$(call QUIET_CLEAN, nolibc-test)
 	$(Q)rm -f nolibc-test
 	$(call QUIET_CLEAN, initramfs)
-- 
cgit v1.2.3


From ffc297fe2259a701f2bd52a6fb8481abc89d331d Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Tue, 19 Jul 2022 23:44:49 +0200
Subject: selftests/nolibc: add a "help" target

It presents the supported targets, and becomes the default target to
save the user from having to read the makefile. The "all" target was
placed after it and now points to "run" to do everything since it's
no longer the default one.

Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/Makefile | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile
index 210f5369fdfc..69ea659caca9 100644
--- a/tools/testing/selftests/nolibc/Makefile
+++ b/tools/testing/selftests/nolibc/Makefile
@@ -65,7 +65,32 @@ endif
 CFLAGS  ?= -Os -fno-ident -fno-asynchronous-unwind-tables
 LDFLAGS := -s
 
-all: nolibc-test
+help:
+	@echo "Supported targets under selftests/nolibc:"
+	@echo "  all          call the \"run\" target below"
+	@echo "  help         this help"
+	@echo "  sysroot      create the nolibc sysroot here (uses \$$ARCH)"
+	@echo "  nolibc-test  build the executable (uses \$$CC and \$$CROSS_COMPILE)"
+	@echo "  initramfs    prepare the initramfs with nolibc-test"
+	@echo "  defconfig    create a fresh new default config (uses \$$ARCH)"
+	@echo "  kernel       (re)build the kernel with the initramfs (uses \$$ARCH)"
+	@echo "  run          runs the kernel in QEMU after building it (uses \$$ARCH, \$$TEST)"
+	@echo "  rerun        runs a previously prebuilt kernel in QEMU (uses \$$ARCH, \$$TEST)"
+	@echo "  clean        clean the sysroot, initramfs, build and output files"
+	@echo ""
+	@echo "The output file is \"run.out\". Test ranges may be passed using \$$TEST."
+	@echo ""
+	@echo "Currently using the following variables:"
+	@echo "  ARCH          = $(ARCH)"
+	@echo "  CROSS_COMPILE = $(CROSS_COMPILE)"
+	@echo "  CC            = $(CC)"
+	@echo "  OUTPUT        = $(OUTPUT)"
+	@echo "  TEST          = $(TEST)"
+	@echo "  QEMU_ARCH     = $(if $(QEMU_ARCH),$(QEMU_ARCH),UNKNOWN_ARCH) [determined from \$$ARCH]"
+	@echo "  IMAGE_NAME    = $(if $(IMAGE_NAME),$(IMAGE_NAME),UNKNOWN_ARCH) [determined from \$$ARCH]"
+	@echo ""
+
+all: run
 
 sysroot: sysroot/$(ARCH)/include
 
-- 
cgit v1.2.3


From 43cf168fa99992ee70ff041a61f866f56aa47f3b Mon Sep 17 00:00:00 2001
From: Fernanda Ma'rouf <fernandafmr12@gnuweeb.org>
Date: Wed, 20 Jul 2022 05:37:45 +0200
Subject: selftests/nolibc: Avoid generated files being committed

After running the nolibc tests, the "git status" is not clean because
the generated files are not ignored. Create a `.gitignore` inside the
selftests/nolibc directory to ignore them.

Cc: Ammar Faizi <ammarfaizi2@gnuweeb.org>
Cc: Fernanda Ma'rouf <fernandafmr2@gmail.com>
Signed-off-by: Fernanda Ma'rouf <fernandafmr12@gnuweeb.org>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/nolibc/.gitignore | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 tools/testing/selftests/nolibc/.gitignore

diff --git a/tools/testing/selftests/nolibc/.gitignore b/tools/testing/selftests/nolibc/.gitignore
new file mode 100644
index 000000000000..4696df589d68
--- /dev/null
+++ b/tools/testing/selftests/nolibc/.gitignore
@@ -0,0 +1,4 @@
+/initramfs/
+/nolibc-test
+/run.out
+/sysroot/
-- 
cgit v1.2.3


From 52edb4080eb9606536c34d5d642ccd9d35ad5d08 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Aug 2022 14:38:43 +0200
Subject: acl: move idmapping handling into posix_acl_xattr_set()

The uapi POSIX ACL struct passed through the value argument during
setxattr() contains {g,u}id values encoded via ACL_{GROUP,USER} entries
that should actually be stored in the form of k{g,u}id_t (See [1] for a
long explanation of the issue.).

In 0c5fd887d2bb ("acl: move idmapped mount fixup into vfs_{g,s}etxattr()")
we took the mount's idmapping into account in order to let overlayfs
handle POSIX ACLs on idmapped layers correctly. The fixup is currently
performed directly in vfs_setxattr() which piles on top of the earlier
hackiness by handling the mount's idmapping and stuff the vfs{g,u}id_t
values into the uapi struct as well. While that is all correct and works
fine it's just ugly.

Now that we have introduced vfs_make_posix_acl() earlier move handling
idmapped mounts out of vfs_setxattr() and into the POSIX ACL handler
where it belongs.

Note that we also need to call vfs_make_posix_acl() for EVM which
interpretes POSIX ACLs during security_inode_setxattr(). Leave them a
longer comment for future reference.

All filesystems that support idmapped mounts via FS_ALLOW_IDMAP use the
standard POSIX ACL xattr handlers and are covered by this change. This
includes overlayfs which simply calls vfs_{g,s}etxattr().

The following filesystems use custom POSIX ACL xattr handlers: 9p, cifs,
ecryptfs, and ntfs3 (and overlayfs but we've covered that in the paragraph
above) and none of them support idmapped mounts yet.

Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org/ [1]
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
---
 fs/posix_acl.c                    | 52 +++++++++------------------------------
 fs/xattr.c                        |  3 ---
 include/linux/posix_acl_xattr.h   |  9 -------
 security/integrity/evm/evm_main.c | 17 ++++++++++---
 4 files changed, 25 insertions(+), 56 deletions(-)

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 31eac28e6582..c759b8eef62e 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -771,46 +771,6 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
 	}
 }
 
-void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
-				     const struct inode *inode,
-				     void *value, size_t size)
-{
-	struct posix_acl_xattr_header *header = value;
-	struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
-	struct user_namespace *fs_userns = i_user_ns(inode);
-	int count;
-	vfsuid_t vfsuid;
-	vfsgid_t vfsgid;
-	kuid_t uid;
-	kgid_t gid;
-
-	if (no_idmapping(mnt_userns, i_user_ns(inode)))
-		return;
-
-	count = posix_acl_fix_xattr_common(value, size);
-	if (count <= 0)
-		return;
-
-	for (end = entry + count; entry != end; entry++) {
-		switch (le16_to_cpu(entry->e_tag)) {
-		case ACL_USER:
-			uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
-			vfsuid = VFSUIDT_INIT(uid);
-			uid = from_vfsuid(mnt_userns, fs_userns, vfsuid);
-			entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid));
-			break;
-		case ACL_GROUP:
-			gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
-			vfsgid = VFSGIDT_INIT(gid);
-			gid = from_vfsgid(mnt_userns, fs_userns, vfsgid);
-			entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid));
-			break;
-		default:
-			break;
-		}
-	}
-}
-
 static void posix_acl_fix_xattr_userns(
 	struct user_namespace *to, struct user_namespace *from,
 	void *value, size_t size)
@@ -1211,7 +1171,17 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 	int ret;
 
 	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		/*
+		 * By the time we end up here the {g,u}ids stored in
+		 * ACL_{GROUP,USER} have already been mapped according to the
+		 * caller's idmapping. The vfs_set_acl_prepare() helper will
+		 * recover them and take idmapped mounts into account. The
+		 * filesystem will receive the POSIX ACLs in in the correct
+		 * format ready to be cached or written to the backing store
+		 * taking the filesystem idmapping into account.
+		 */
+		acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
+					  value, size);
 		if (IS_ERR(acl))
 			return PTR_ERR(acl);
 	}
diff --git a/fs/xattr.c b/fs/xattr.c
index a1f4998bc6be..3ac68ec0c023 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -305,9 +305,6 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		size = error;
 	}
 
-	if (size && is_posix_acl_xattr(name))
-		posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size);
-
 retry_deleg:
 	inode_lock(inode);
 	error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size,
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 47eca15fd842..8163dd48c430 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -38,9 +38,6 @@ void posix_acl_fix_xattr_to_user(void *value, size_t size);
 void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
 				     const struct inode *inode,
 				     void *value, size_t size);
-void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
-				     const struct inode *inode,
-				     void *value, size_t size);
 #else
 static inline void posix_acl_fix_xattr_from_user(void *value, size_t size)
 {
@@ -54,12 +51,6 @@ posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
 				size_t size)
 {
 }
-static inline void
-posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
-				const struct inode *inode, void *value,
-				size_t size)
-{
-}
 #endif
 
 struct posix_acl *posix_acl_from_xattr(struct user_namespace *user_ns, 
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 2e6fb6e2ffd2..23d484e05e6f 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -457,10 +457,21 @@ static int evm_xattr_acl_change(struct user_namespace *mnt_userns,
 	int rc;
 
 	/*
-	 * user_ns is not relevant here, ACL_USER/ACL_GROUP don't have impact
-	 * on the inode mode (see posix_acl_equiv_mode()).
+	 * An earlier comment here mentioned that the idmappings for
+	 * ACL_{GROUP,USER} don't matter since EVM is only interested in the
+	 * mode stored as part of POSIX ACLs. Nonetheless, if it must translate
+	 * from the uapi POSIX ACL representation to the VFS internal POSIX ACL
+	 * representation it should do so correctly. There's no guarantee that
+	 * we won't change POSIX ACLs in a way that ACL_{GROUP,USER} matters
+	 * for the mode at some point and it's difficult to keep track of all
+	 * the LSM and integrity modules and what they do to POSIX ACLs.
+	 *
+	 * Frankly, EVM shouldn't try to interpret the uapi struct for POSIX
+	 * ACLs it received. It requires knowledge that only the VFS is
+	 * guaranteed to have.
 	 */
-	acl = posix_acl_from_xattr(&init_user_ns, xattr_value, xattr_value_len);
+	acl = vfs_set_acl_prepare(mnt_userns, i_user_ns(inode),
+				  xattr_value, xattr_value_len);
 	if (IS_ERR_OR_NULL(acl))
 		return 1;
 
-- 
cgit v1.2.3


From 7e1401acd9f2807ff271100c5ce1fa84bf27a252 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Aug 2022 14:38:44 +0200
Subject: ovl: use vfs_set_acl_prepare()

The posix_acl_from_xattr() helper should mainly be used in
i_op->get_acl() handlers. It translates from the uapi struct into the
kernel internal POSIX ACL representation and doesn't care about mount
idmappings.

Use the vfs_set_acl_prepare() helper to generate a kernel internal POSIX
ACL representation in struct posix_acl format taking care to map from
the mount idmapping into the filesystem's idmapping.

The returned struct posix_acl is in the correct format to be cached by
the VFS or passed to the filesystem's i_op->set_acl() method to write to
the backing store.

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
---
 fs/overlayfs/super.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index ec746d447f1b..5da771b218d1 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1022,7 +1022,20 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
 
 	/* Check that everything is OK before copy-up */
 	if (value) {
-		acl = posix_acl_from_xattr(&init_user_ns, value, size);
+		/* The above comment can be understood in two ways:
+		 *
+		 * 1. We just want to check whether the basic POSIX ACL format
+		 *    is ok. For example, if the header is correct and the size
+		 *    is sane.
+		 * 2. We want to know whether the ACL_{GROUP,USER} entries can
+		 *    be mapped according to the underlying filesystem.
+		 *
+		 * Currently, we only check 1. If we wanted to check 2. we
+		 * would need to pass the mnt_userns and the fs_userns of the
+		 * underlying filesystem. But frankly, I think checking 1. is
+		 * enough to start the copy-up.
+		 */
+		acl = vfs_set_acl_prepare(&init_user_ns, &init_user_ns, value, size);
 		if (IS_ERR(acl))
 			return PTR_ERR(acl);
 	}
-- 
cgit v1.2.3


From 6344e66970c619a1623f457910e78819076e9104 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 29 Aug 2022 14:38:45 +0200
Subject: xattr: constify value argument in vfs_setxattr()

Now that we don't perform translations directly in vfs_setxattr()
anymore we can constify the @value argument in vfs_setxattr(). This also
allows us to remove the hack to cast from a const in ovl_do_setxattr().

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
---
 fs/overlayfs/overlayfs.h | 2 +-
 fs/xattr.c               | 5 ++---
 include/linux/xattr.h    | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 87759165d32b..ee93c825b06b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -250,7 +250,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
 				  size_t size, int flags)
 {
 	int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
-			       (void *)value, size, flags);
+			       value, size, flags);
 
 	pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
 		 dentry, name, min((int)size, 48), value, size, flags, err);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3ac68ec0c023..74fc8e021ebc 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -290,7 +290,7 @@ static inline bool is_posix_acl_xattr(const char *name)
 
 int
 vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
-	     const char *name, void *value, size_t size, int flags)
+	     const char *name, const void *value, size_t size, int flags)
 {
 	struct inode *inode = dentry->d_inode;
 	struct inode *delegated_inode = NULL;
@@ -298,8 +298,7 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	int error;
 
 	if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
-		error = cap_convert_nscap(mnt_userns, dentry,
-					  (const void **)&value, size);
+		error = cap_convert_nscap(mnt_userns, dentry, &value, size);
 		if (error < 0)
 			return error;
 		size = error;
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 979a9d3e5bfb..4c379d23ec6e 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *,
 			  const char *, const void *, size_t, int,
 			  struct inode **);
 int vfs_setxattr(struct user_namespace *, struct dentry *, const char *,
-		 void *, size_t, int);
+		 const void *, size_t, int);
 int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
 int __vfs_removexattr_locked(struct user_namespace *, struct dentry *,
 			     const char *, struct inode **);
-- 
cgit v1.2.3


From addbeea6f50b5ac344331652dd7f35faf760969e Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 26 Aug 2022 09:21:15 -0700
Subject: testing/selftests: Add tests for the is_signed_type() macro

Although not documented, is_signed_type() must support the 'bool' and
pointer types next to scalar and enumeration types. Add a selftest that
verifies that this macro handles all supported types correctly.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Isabella Basso <isabbasso@riseup.net>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sander Vanheule <sander@svanheule.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Isabella Basso <isabbasso@riseup.net>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220826162116.1050972-2-bvanassche@acm.org
---
 lib/Kconfig.debug          | 12 ++++++++++++
 lib/Makefile               |  1 +
 lib/is_signed_type_kunit.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)
 create mode 100644 lib/is_signed_type_kunit.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 072e4b289c13..36455953d306 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2506,6 +2506,18 @@ config MEMCPY_KUNIT_TEST
 
 	  If unsure, say N.
 
+config IS_SIGNED_TYPE_KUNIT_TEST
+	tristate "Test is_signed_type() macro" if !KUNIT_ALL_TESTS
+	depends on KUNIT
+	default KUNIT_ALL_TESTS
+	help
+	  Builds unit tests for the is_signed_type() macro.
+
+	  For more information on KUnit and unit tests in general please refer
+	  to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+	  If unsure, say N.
+
 config OVERFLOW_KUNIT_TEST
 	tristate "Test check_*_overflow() functions at runtime" if !KUNIT_ALL_TESTS
 	depends on KUNIT
diff --git a/lib/Makefile b/lib/Makefile
index 5927d7fa0806..f545140ed9e7 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -377,6 +377,7 @@ obj-$(CONFIG_BITS_TEST) += test_bits.o
 obj-$(CONFIG_CMDLINE_KUNIT_TEST) += cmdline_kunit.o
 obj-$(CONFIG_SLUB_KUNIT_TEST) += slub_kunit.o
 obj-$(CONFIG_MEMCPY_KUNIT_TEST) += memcpy_kunit.o
+obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o
 obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o
 CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable)
 obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o
diff --git a/lib/is_signed_type_kunit.c b/lib/is_signed_type_kunit.c
new file mode 100644
index 000000000000..f2eedb1f0935
--- /dev/null
+++ b/lib/is_signed_type_kunit.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ *	./tools/testing/kunit/kunit.py run is_signed_type [--raw_output]
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <kunit/test.h>
+#include <linux/overflow.h>
+
+enum unsigned_enum {
+	constant_a = 3,
+};
+
+enum signed_enum {
+	constant_b = -1,
+	constant_c = 2,
+};
+
+static void is_signed_type_test(struct kunit *test)
+{
+	KUNIT_EXPECT_EQ(test, is_signed_type(bool), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(signed char), true);
+	KUNIT_EXPECT_EQ(test, is_signed_type(unsigned char), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(int), true);
+	KUNIT_EXPECT_EQ(test, is_signed_type(unsigned int), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(long), true);
+	KUNIT_EXPECT_EQ(test, is_signed_type(unsigned long), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(long long), true);
+	KUNIT_EXPECT_EQ(test, is_signed_type(unsigned long long), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(enum unsigned_enum), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(enum signed_enum), true);
+	KUNIT_EXPECT_EQ(test, is_signed_type(void *), false);
+	KUNIT_EXPECT_EQ(test, is_signed_type(const char *), false);
+}
+
+static struct kunit_case is_signed_type_test_cases[] = {
+	KUNIT_CASE(is_signed_type_test),
+	{}
+};
+
+static struct kunit_suite is_signed_type_test_suite = {
+	.name = "is_signed_type",
+	.test_cases = is_signed_type_test_cases,
+};
+
+kunit_test_suite(is_signed_type_test_suite);
+
+MODULE_LICENSE("Dual MIT/GPL");
-- 
cgit v1.2.3


From 92d23c6e94157739b997cacce151586a0d07bb8a Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 26 Aug 2022 09:21:16 -0700
Subject: overflow, tracing: Define the is_signed_type() macro once

There are two definitions of the is_signed_type() macro: one in
<linux/overflow.h> and a second definition in <linux/trace_events.h>.

As suggested by Linus Torvalds, move the definition of the
is_signed_type() macro into the <linux/compiler.h> header file. Change
the definition of the is_signed_type() macro to make sure that it does
not trigger any sparse warnings with future versions of sparse for
bitwise types. See also:
https://lore.kernel.org/all/CAHk-=whjH6p+qzwUdx5SOVVHjS3WvzJQr6mDUwhEyTf6pJWzaQ@mail.gmail.com/
https://lore.kernel.org/all/CAHk-=wjQGnVfb4jehFR0XyZikdQvCZouE96xR_nnf5kqaM5qqQ@mail.gmail.com/

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Isabella Basso <isabbasso@riseup.net>
Cc: "Jason A. Donenfeld" <Jason@zx2c4.com>
Cc: Josh Poimboeuf <jpoimboe@kernel.org>
Cc: Luc Van Oostenryck <luc.vanoostenryck@gmail.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Sander Vanheule <sander@svanheule.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220826162116.1050972-3-bvanassche@acm.org
---
 include/linux/compiler.h     | 6 ++++++
 include/linux/overflow.h     | 1 -
 include/linux/trace_events.h | 2 --
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 01ce94b58b42..7713d7bcdaea 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -239,6 +239,12 @@ static inline void *offset_to_ptr(const int *off)
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
+/*
+ * Whether 'type' is a signed type or an unsigned type. Supports scalar types,
+ * bool and also pointer types.
+ */
+#define is_signed_type(type) (((type)(-1)) < (__force type)1)
+
 /*
  * This is needed in functions which generate the stack canary, see
  * arch/x86/kernel/smpboot.c::start_secondary() for an example.
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index f1221d11f8e5..0eb3b192f07a 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -30,7 +30,6 @@
  * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html -
  * credit to Christian Biere.
  */
-#define is_signed_type(type)       (((type)(-1)) < (type)1)
 #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type)))
 #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
 #define type_min(T) ((T)((T)-type_max(T)-(T)1))
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index b18759a673c6..8401dec93c15 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -814,8 +814,6 @@ extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
 extern int trace_event_get_offsets(struct trace_event_call *call);
 
-#define is_signed_type(type)	(((type)(-1)) < (type)1)
-
 int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
 int trace_set_clr_event(const char *system, const char *event, int set);
 int trace_array_set_clr_event(struct trace_array *tr, const char *system,
-- 
cgit v1.2.3


From c829dba797360d9a266cabfaac16d1cd80abfc2b Mon Sep 17 00:00:00 2001
From: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Date: Wed, 31 Aug 2022 11:40:39 +0800
Subject: MAINTAINERS: Add include/linux/tnum.h to BPF CORE

Maintainers of the kerne/bpf/tnum.c are also the maintainers of the
corresponding header file include/linux/tnum.h.

Add the file entry for include/linux/tnum.h to the appropriate section
in MAINTAINERS.

Signed-off-by: Shung-Hsi Yu <shung-hsi.yu@suse.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220831034039.17998-1-shung-hsi.yu@suse.com
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index af4848466a08..1a9fe9736ddd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3824,6 +3824,7 @@ F:	kernel/bpf/dispatcher.c
 F:	kernel/bpf/trampoline.c
 F:	include/linux/bpf*
 F:	include/linux/filter.h
+F:	include/linux/tnum.h
 
 BPF [BTF]
 M:	Martin KaFai Lau <martin.lau@linux.dev>
-- 
cgit v1.2.3


From 499c8bb4693d1c8d8f3d6dd38e5bdde3ff5bd906 Mon Sep 17 00:00:00 2001
From: Kohei Tarumizu <tarumizu.kohei@fujitsu.com>
Date: Wed, 24 Aug 2022 09:44:10 -0700
Subject: x86/resctrl: Fix to restore to original value when re-enabling
 hardware prefetch register

The current pseudo_lock.c code overwrites the value of the
MSR_MISC_FEATURE_CONTROL to 0 even if the original value is not 0.
Therefore, modify it to save and restore the original values.

Fixes: 018961ae5579 ("x86/intel_rdt: Pseudo-lock region creation/removal core")
Fixes: 443810fe6160 ("x86/intel_rdt: Create debugfs files for pseudo-locking testing")
Fixes: 8a2fc0e1bc0c ("x86/intel_rdt: More precise L2 hit/miss measurements")
Signed-off-by: Kohei Tarumizu <tarumizu.kohei@fujitsu.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Reinette Chatre <reinette.chatre@intel.com>
Link: https://lkml.kernel.org/r/eb660f3c2010b79a792c573c02d01e8e841206ad.1661358182.git.reinette.chatre@intel.com
---
 arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index db813f819ad6..4d8398986f78 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -420,6 +420,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
 	struct pseudo_lock_region *plr = rdtgrp->plr;
 	u32 rmid_p, closid_p;
 	unsigned long i;
+	u64 saved_msr;
 #ifdef CONFIG_KASAN
 	/*
 	 * The registers used for local register variables are also used
@@ -463,6 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
 	 * the buffer and evict pseudo-locked memory read earlier from the
 	 * cache.
 	 */
+	saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL);
 	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
 	closid_p = this_cpu_read(pqr_state.cur_closid);
 	rmid_p = this_cpu_read(pqr_state.cur_rmid);
@@ -514,7 +516,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
 	__wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
 
 	/* Re-enable the hardware prefetcher(s) */
-	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+	wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
 	local_irq_enable();
 
 	plr->thread_done = 1;
@@ -871,6 +873,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
 static int measure_cycles_lat_fn(void *_plr)
 {
 	struct pseudo_lock_region *plr = _plr;
+	u32 saved_low, saved_high;
 	unsigned long i;
 	u64 start, end;
 	void *mem_r;
@@ -879,6 +882,7 @@ static int measure_cycles_lat_fn(void *_plr)
 	/*
 	 * Disable hardware prefetchers.
 	 */
+	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
 	mem_r = READ_ONCE(plr->kmem);
 	/*
@@ -895,7 +899,7 @@ static int measure_cycles_lat_fn(void *_plr)
 		end = rdtsc_ordered();
 		trace_pseudo_lock_mem_latency((u32)(end - start));
 	}
-	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	local_irq_enable();
 	plr->thread_done = 1;
 	wake_up_interruptible(&plr->lock_thread_wq);
@@ -940,6 +944,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
 	u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
 	struct perf_event *miss_event, *hit_event;
 	int hit_pmcnum, miss_pmcnum;
+	u32 saved_low, saved_high;
 	unsigned int line_size;
 	unsigned int size;
 	unsigned long i;
@@ -973,6 +978,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
 	/*
 	 * Disable hardware prefetchers.
 	 */
+	rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
 
 	/* Initialize rest of local variables */
@@ -1031,7 +1037,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
 	 */
 	rmb();
 	/* Re-enable hardware prefetchers */
-	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+	wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
 	local_irq_enable();
 out_hit:
 	perf_event_release_kernel(hit_event);
-- 
cgit v1.2.3


From b1a2c4ee8be1df1931599e3214402ed9d6f97f8e Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Thu, 4 Aug 2022 13:59:38 +0200
Subject: thermal/drivers/thermal_mmio: Drop of_match_ptr()

Now that the driver depends on OF, we know what of_match_ptr() will
always resolve to, so we might as well save cpp some work.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_mmio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/thermal/thermal_mmio.c b/drivers/thermal/thermal_mmio.c
index 360b0dfdc3b0..c3e5def02cad 100644
--- a/drivers/thermal/thermal_mmio.c
+++ b/drivers/thermal/thermal_mmio.c
@@ -107,7 +107,7 @@ static struct platform_driver thermal_mmio_driver = {
 	.probe = thermal_mmio_probe,
 	.driver = {
 		.name = "thermal-mmio",
-		.of_match_table = of_match_ptr(thermal_mmio_id_table),
+		.of_match_table = thermal_mmio_id_table,
 	},
 };
 
-- 
cgit v1.2.3


From c00c4461689e15ac2cc3b9a595a54e4d8afd3d77 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Aug 2022 14:17:05 +0200
Subject: xsk: Fix backpressure mechanism on Tx

Commit d678cbd2f867 ("xsk: Fix handling of invalid descriptors in XSK TX
batching API") fixed batch API usage against set of descriptors with
invalid ones but introduced a problem when AF_XDP SW rings are smaller
than HW ones. Mismatch of reported Tx'ed frames between HW generator and
user space app was observed. It turned out that backpressure mechanism
became a bottleneck when the amount of produced descriptors to CQ is
lower than what we grabbed from XSK Tx ring.

Say that 512 entries had been taken from XSK Tx ring but we had only 490
free entries in CQ. Then callsite (ZC driver) will produce only 490
entries onto HW Tx ring but 512 entries will be released from Tx ring
and this is what will be seen by the user space.

In order to fix this case, mix XSK Tx/CQ ring interractions by moving
around internal functions and changing call order:

*  pull out xskq_prod_nb_free() from xskq_prod_reserve_addr_batch()
   up to xsk_tx_peek_release_desc_batch();
** move xskq_cons_release_n() into xskq_cons_read_desc_batch()

After doing so, algorithm can be described as follows:

1. lookup Tx entries
2. use value from 1. to reserve space in CQ (*)
3. Read from Tx ring as much descriptors as value from 2
 3a. release descriptors from XSK Tx ring (**)
4. Finally produce addresses to CQ

Fixes: d678cbd2f867 ("xsk: Fix handling of invalid descriptors in XSK TX batching API")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220830121705.8618-1-maciej.fijalkowski@intel.com
---
 net/xdp/xsk.c       | 22 +++++++++++-----------
 net/xdp/xsk_queue.h | 22 ++++++++++------------
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 5b4ce6ba1bc7..639b2c3beb69 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -355,16 +355,15 @@ static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entr
 	return nb_pkts;
 }
 
-u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
+u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
 {
 	struct xdp_sock *xs;
-	u32 nb_pkts;
 
 	rcu_read_lock();
 	if (!list_is_singular(&pool->xsk_tx_list)) {
 		/* Fallback to the non-batched version */
 		rcu_read_unlock();
-		return xsk_tx_peek_release_fallback(pool, max_entries);
+		return xsk_tx_peek_release_fallback(pool, nb_pkts);
 	}
 
 	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
@@ -373,12 +372,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
 		goto out;
 	}
 
-	max_entries = xskq_cons_nb_entries(xs->tx, max_entries);
-	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries);
-	if (!nb_pkts) {
-		xs->tx->queue_empty_descs++;
-		goto out;
-	}
+	nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
 
 	/* This is the backpressure mechanism for the Tx path. Try to
 	 * reserve space in the completion queue for all packets, but
@@ -386,12 +380,18 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries)
 	 * packets. This avoids having to implement any buffering in
 	 * the Tx path.
 	 */
-	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
+	nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
 	if (!nb_pkts)
 		goto out;
 
-	xskq_cons_release_n(xs->tx, max_entries);
+	nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
+	if (!nb_pkts) {
+		xs->tx->queue_empty_descs++;
+		goto out;
+	}
+
 	__xskq_cons_release(xs->tx);
+	xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
 	xs->sk.sk_write_space(&xs->sk);
 
 out:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index fb20bf7207cf..c6fb6b763658 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -205,6 +205,11 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q,
 	return false;
 }
 
+static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
+{
+	q->cached_cons += cnt;
+}
+
 static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool,
 					    u32 max)
 {
@@ -226,6 +231,8 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff
 		cached_cons++;
 	}
 
+	/* Release valid plus any invalid entries */
+	xskq_cons_release_n(q, cached_cons - q->cached_cons);
 	return nb_entries;
 }
 
@@ -291,11 +298,6 @@ static inline void xskq_cons_release(struct xsk_queue *q)
 	q->cached_cons++;
 }
 
-static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt)
-{
-	q->cached_cons += cnt;
-}
-
 static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
 {
 	/* No barriers needed since data is not accessed */
@@ -350,21 +352,17 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
 	return 0;
 }
 
-static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
-					       u32 max)
+static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
+					      u32 nb_entries)
 {
 	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
-	u32 nb_entries, i, cached_prod;
-
-	nb_entries = xskq_prod_nb_free(q, max);
+	u32 i, cached_prod;
 
 	/* A, matches D */
 	cached_prod = q->cached_prod;
 	for (i = 0; i < nb_entries; i++)
 		ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr;
 	q->cached_prod = cached_prod;
-
-	return nb_entries;
 }
 
 static inline int xskq_prod_reserve_desc(struct xsk_queue *q,
-- 
cgit v1.2.3


From 8a7d61bdc2fac2c460a2f32a062f5c6dbd21a764 Mon Sep 17 00:00:00 2001
From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Date: Tue, 30 Aug 2022 15:39:05 +0200
Subject: selftests/xsk: Add missing close() on netns fd

Commit 1034b03e54ac ("selftests: xsk: Simplify cleanup of ifobjects")
removed close on netns fd, which is not correct, so let us restore it.

Fixes: 1034b03e54ac ("selftests: xsk: Simplify cleanup of ifobjects")
Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20220830133905.9945-1-maciej.fijalkowski@intel.com
---
 tools/testing/selftests/bpf/xskxceiver.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index 74d56d971baf..091402dc5390 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1606,6 +1606,8 @@ static struct ifobject *ifobject_create(void)
 	if (!ifobj->umem)
 		goto out_umem;
 
+	ifobj->ns_fd = -1;
+
 	return ifobj;
 
 out_umem:
@@ -1617,6 +1619,8 @@ out_xsk_arr:
 
 static void ifobject_delete(struct ifobject *ifobj)
 {
+	if (ifobj->ns_fd != -1)
+		close(ifobj->ns_fd);
 	free(ifobj->umem);
 	free(ifobj->xsk_arr);
 	free(ifobj);
-- 
cgit v1.2.3


From ea3747181dc2705ecc697dc98ada7eb135733561 Mon Sep 17 00:00:00 2001
From: Jason Wang <wangborong@cdjrlc.com>
Date: Thu, 11 Aug 2022 20:04:50 +0800
Subject: cpuidle: coupled: Drop duplicate word from a comment

The double `are' is duplicated in the comment, remove one.

Signed-off-by: Jason Wang <wangborong@cdjrlc.com>
[ rjw: New subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/coupled.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 74068742cef3..9acde71558d5 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -54,7 +54,7 @@
  * variable is not locked.  It is only written from the cpu that
  * it stores (or by the on/offlining cpu if that cpu is offline),
  * and only read after all the cpus are ready for the coupled idle
- * state are are no longer updating it.
+ * state are no longer updating it.
  *
  * Three atomic counters are used.  alive_count tracks the number
  * of cpus in the coupled set that are currently or soon will be
-- 
cgit v1.2.3


From 1ea252ef299bbce16cc40d52d0635346fdc4b7ea Mon Sep 17 00:00:00 2001
From: zhaoxiao <zhaoxiao@uniontech.com>
Date: Fri, 12 Aug 2022 14:39:26 +0800
Subject: thermal: da9062-thermal: Drop redundant error message

Since platform_get_irq() already prints an error message on failure, it
is not necessary to print another one for the same purpose.

Signed-off-by: zhaoxiao <zhaoxiao@uniontech.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/da9062-thermal.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/thermal/da9062-thermal.c b/drivers/thermal/da9062-thermal.c
index 180edec34e07..7dcfde7a9f2c 100644
--- a/drivers/thermal/da9062-thermal.c
+++ b/drivers/thermal/da9062-thermal.c
@@ -248,10 +248,9 @@ static int da9062_thermal_probe(struct platform_device *pdev)
 		jiffies_to_msecs(thermal->zone->passive_delay_jiffies));
 
 	ret = platform_get_irq_byname(pdev, "THERMAL");
-	if (ret < 0) {
-		dev_err(&pdev->dev, "Failed to get platform IRQ.\n");
+	if (ret < 0)
 		goto err_zone;
-	}
+
 	thermal->irq = ret;
 
 	ret = request_threaded_irq(thermal->irq, NULL,
-- 
cgit v1.2.3


From 4f59540c3cd00a9a361b2cf2ac475edd83fd4f93 Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Mon, 15 Aug 2022 00:35:43 +0800
Subject: cpufreq: amd-pstate: simplify cpudata pointer assignment

move the cpudata assignment to cpudata declaration which
will simplify the functions.

No functional change intended.

Reviewed-by: Huang Rui <ray.huang@amd.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 9ac75c1cde9c..2c17623218ec 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -555,9 +555,7 @@ free_cpudata1:
 
 static int amd_pstate_cpu_exit(struct cpufreq_policy *policy)
 {
-	struct amd_cpudata *cpudata;
-
-	cpudata = policy->driver_data;
+	struct amd_cpudata *cpudata = policy->driver_data;
 
 	freq_qos_remove_request(&cpudata->req[1]);
 	freq_qos_remove_request(&cpudata->req[0]);
@@ -599,9 +597,7 @@ static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy,
 					char *buf)
 {
 	int max_freq;
-	struct amd_cpudata *cpudata;
-
-	cpudata = policy->driver_data;
+	struct amd_cpudata *cpudata = policy->driver_data;
 
 	max_freq = amd_get_max_freq(cpudata);
 	if (max_freq < 0)
@@ -614,9 +610,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli
 						     char *buf)
 {
 	int freq;
-	struct amd_cpudata *cpudata;
-
-	cpudata = policy->driver_data;
+	struct amd_cpudata *cpudata = policy->driver_data;
 
 	freq = amd_get_lowest_nonlinear_freq(cpudata);
 	if (freq < 0)
-- 
cgit v1.2.3


From d8bee41db83ea207792889df323bb46ee4ebab24 Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Mon, 15 Aug 2022 00:35:44 +0800
Subject: cpufreq: amd-pstate: fix white-space

Remove the white space and correct mixed-up indentation

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 2c17623218ec..ec02929aaa98 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -120,7 +120,7 @@ struct amd_cpudata {
 	struct amd_aperf_mperf cur;
 	struct amd_aperf_mperf prev;
 
-	u64 freq;
+	u64 	freq;
 	bool	boost_supported;
 };
 
@@ -656,7 +656,7 @@ static struct cpufreq_driver amd_pstate_driver = {
 	.resume		= amd_pstate_cpu_resume,
 	.set_boost	= amd_pstate_set_boost,
 	.name		= "amd-pstate",
-	.attr           = amd_pstate_attr,
+	.attr		= amd_pstate_attr,
 };
 
 static int __init amd_pstate_init(void)
-- 
cgit v1.2.3


From b185c5053c65b7704ead4537e4d4d9b33dc398dc Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Mon, 15 Aug 2022 00:35:45 +0800
Subject: cpufreq: amd_pstate: fix wrong lowest perf fetch

Fix the wrong lowest perf value reading which is used for new
des_perf calculation by governor requested, the incorrect min_perf will
get incorrect des_perf to be set , that will cause the system frequency
changing unexpectedly.

Reviewed-by: Huang Rui <ray.huang@amd.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Su Jinzhou <jinzhou.su@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index ec02929aaa98..a47a1a2fdd06 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -312,7 +312,7 @@ static int amd_pstate_target(struct cpufreq_policy *policy,
 		return -ENODEV;
 
 	cap_perf = READ_ONCE(cpudata->highest_perf);
-	min_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
+	min_perf = READ_ONCE(cpudata->lowest_perf);
 	max_perf = cap_perf;
 
 	freqs.old = policy->cur;
-- 
cgit v1.2.3


From 0e9a86386bca646d3b0fc50e5e15eedb1eb620b3 Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Mon, 15 Aug 2022 00:35:46 +0800
Subject: cpufreq: amd_pstate: map desired perf into pstate scope for powersave
 governor

The patch will fix the invalid desired perf value for powersave
governor. This issue is found when testing on one AMD EPYC system, the
actual des_perf is smaller than the min_perf value, that is invalid
value. because the min_perf is the lowest_perf system can support in
idle state.

Reviewed-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index a47a1a2fdd06..fd563a002fd0 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -269,6 +269,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 	u64 prev = READ_ONCE(cpudata->cppc_req_cached);
 	u64 value = prev;
 
+	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
 	value &= ~AMD_CPPC_MIN_PERF(~0L);
 	value |= AMD_CPPC_MIN_PERF(min_perf);
 
@@ -357,8 +358,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	if (max_perf < min_perf)
 		max_perf = min_perf;
 
-	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
-
 	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true);
 }
 
-- 
cgit v1.2.3


From ca08e46d4215e85eb3cef2481255431d98da56da Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Mon, 15 Aug 2022 00:35:47 +0800
Subject: cpufreq: amd-pstate: update pstate frequency transition delay time

Change the default transition latency to be 20ms that is more
reasonable transition delay for AMD processors in non-EPP driver mode.

Update transition delay time to 1ms, in the AMD CPU autonomous mode and
non-autonomous mode, CPPC firmware will decide frequency at 1ms timescale
based on the workload utilization.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index fd563a002fd0..33fbd6a94c43 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -46,8 +46,8 @@
 #include <asm/cpu_device_id.h>
 #include "amd-pstate-trace.h"
 
-#define AMD_PSTATE_TRANSITION_LATENCY	0x20000
-#define AMD_PSTATE_TRANSITION_DELAY	500
+#define AMD_PSTATE_TRANSITION_LATENCY	20000
+#define AMD_PSTATE_TRANSITION_DELAY	1000
 
 /*
  * TODO: We need more time to fine tune processors with shared memory solution
-- 
cgit v1.2.3


From ccf28724604ee9b8577d21022e31c323d104cca2 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:00:01 +0200
Subject: cpuidle: powernv: move from strlcpy() with unused retval to strscpy()

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle-powernv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index c32c600b3cf8..0b5461b3d7dd 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -233,8 +233,8 @@ static inline void add_powernv_state(int index, const char *name,
 				     unsigned int exit_latency,
 				     u64 psscr_val, u64 psscr_mask)
 {
-	strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
-	strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
+	strscpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN);
+	strscpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN);
 	powernv_states[index].flags = flags;
 	powernv_states[index].target_residency = target_residency;
 	powernv_states[index].exit_latency = exit_latency;
-- 
cgit v1.2.3


From 0dbc0f49d6739033a4a2807726d1d6542f16b099 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:00:15 +0200
Subject: intel_idle: move from strlcpy() with unused retval to strscpy()

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/idle/intel_idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3e101719689a..114bd6631578 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1507,7 +1507,7 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv)
 		state = &drv->states[drv->state_count++];
 
 		snprintf(state->name, CPUIDLE_NAME_LEN, "C%d_ACPI", cstate);
-		strlcpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
+		strscpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
 		state->exit_latency = cx->latency;
 		/*
 		 * For C1-type C-states use the same number for both the exit
-- 
cgit v1.2.3


From 1e6c8fb8b8d3e91e140b505e8a68b05f81ac0f87 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:01:11 +0200
Subject: thermal: move from strlcpy() with unused retval to strscpy()

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c  | 2 +-
 drivers/thermal/thermal_hwmon.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 99c0c19fa5a6..7dc7cb53ce6f 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1219,7 +1219,7 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t
 	}
 
 	tz->id = id;
-	strlcpy(tz->type, type, sizeof(tz->type));
+	strscpy(tz->type, type, sizeof(tz->type));
 
 	result = dev_set_name(&tz->device, "thermal_zone%d", tz->id);
 	if (result)
diff --git a/drivers/thermal/thermal_hwmon.c b/drivers/thermal/thermal_hwmon.c
index 09e49ec8b6f4..f53f4ceb6a5d 100644
--- a/drivers/thermal/thermal_hwmon.c
+++ b/drivers/thermal/thermal_hwmon.c
@@ -147,7 +147,7 @@ int thermal_add_hwmon_sysfs(struct thermal_zone_device *tz)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&hwmon->tz_list);
-	strlcpy(hwmon->type, tz->type, THERMAL_NAME_LENGTH);
+	strscpy(hwmon->type, tz->type, THERMAL_NAME_LENGTH);
 	strreplace(hwmon->type, '-', '_');
 	hwmon->device = hwmon_device_register_for_thermal(&tz->device,
 							  hwmon->type, hwmon);
-- 
cgit v1.2.3


From bd8092def983f567800f764a5d23b2dca98f078c Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:01:56 +0200
Subject: PM: suspend: move from strlcpy() with unused retval to strscpy()

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 70f2921e2e70..23a253df7f6b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -75,7 +75,7 @@ extern struct suspend_stats suspend_stats;
 
 static inline void dpm_save_failed_dev(const char *name)
 {
-	strlcpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev],
+	strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev],
 		name,
 		sizeof(suspend_stats.failed_devs[0]));
 	suspend_stats.last_failed_dev++;
-- 
cgit v1.2.3


From b8d1d163604bd1e600b062fb00de5dc42baa355f Mon Sep 17 00:00:00 2001
From: Daniel Sneddon <daniel.sneddon@linux.intel.com>
Date: Tue, 16 Aug 2022 16:19:42 -0700
Subject: x86/apic: Don't disable x2APIC if locked

The APIC supports two modes, legacy APIC (or xAPIC), and Extended APIC
(or x2APIC).  X2APIC mode is mostly compatible with legacy APIC, but
it disables the memory-mapped APIC interface in favor of one that uses
MSRs.  The APIC mode is controlled by the EXT bit in the APIC MSR.

The MMIO/xAPIC interface has some problems, most notably the APIC LEAK
[1].  This bug allows an attacker to use the APIC MMIO interface to
extract data from the SGX enclave.

Introduce support for a new feature that will allow the BIOS to lock
the APIC in x2APIC mode.  If the APIC is locked in x2APIC mode and the
kernel tries to disable the APIC or revert to legacy APIC mode a GP
fault will occur.

Introduce support for a new MSR (IA32_XAPIC_DISABLE_STATUS) and handle
the new locked mode when the LEGACY_XAPIC_DISABLED bit is set by
preventing the kernel from trying to disable the x2APIC.

On platforms with the IA32_XAPIC_DISABLE_STATUS MSR, if SGX or TDX are
enabled the LEGACY_XAPIC_DISABLED will be set by the BIOS.  If
legacy APIC is required, then it SGX and TDX need to be disabled in the
BIOS.

[1]: https://aepicleak.com/aepicleak.pdf

Signed-off-by: Daniel Sneddon <daniel.sneddon@linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Neelima Krishnan <neelima.krishnan@intel.com>
Link: https://lkml.kernel.org/r/20220816231943.1152579-1-daniel.sneddon@linux.intel.com
---
 Documentation/admin-guide/kernel-parameters.txt |  4 +++
 arch/x86/Kconfig                                |  7 +++-
 arch/x86/include/asm/cpu.h                      |  2 ++
 arch/x86/include/asm/msr-index.h                | 13 ++++++++
 arch/x86/kernel/apic/apic.c                     | 44 ++++++++++++++++++++++---
 5 files changed, 65 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 426fa892d311..2bc11a61c4d0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3805,6 +3805,10 @@
 
 	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
 
+			NOTE: this parameter will be ignored on systems with the
+			LEGACY_XAPIC_DISABLED bit set in the
+			IA32_XAPIC_DISABLE_STATUS MSR.
+
 	nps_mtm_hs_ctr=	[KNL,ARC]
 			This parameter sets the maximum duration, in
 			cycles, each HW thread of the CTOP can run
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..159c025ebb03 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -448,6 +448,11 @@ config X86_X2APIC
 	  This allows 32-bit apic IDs (so it can support very large systems),
 	  and accesses the local apic via MSRs not via mmio.
 
+	  Some Intel systems circa 2022 and later are locked into x2APIC mode
+	  and can not fall back to the legacy APIC modes if SGX or TDX are
+	  enabled in the BIOS.  They will be unable to boot without enabling
+	  this option.
+
 	  If you don't know what to do here, say N.
 
 config X86_MPPARSE
@@ -1919,7 +1924,7 @@ endchoice
 
 config X86_SGX
 	bool "Software Guard eXtensions (SGX)"
-	depends on X86_64 && CPU_SUP_INTEL
+	depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
 	depends on CRYPTO=y
 	depends on CRYPTO_SHA256=y
 	select SRCU
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 8cbf623f0ecf..b472ef76826a 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -94,4 +94,6 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
 	return p1 & p2;
 }
 
+extern u64 x86_read_arch_cap_msr(void);
+
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6674bdb096f3..1e086b37a307 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -155,6 +155,11 @@
 						 * Return Stack Buffer Predictions.
 						 */
 
+#define ARCH_CAP_XAPIC_DISABLE		BIT(21)	/*
+						 * IA32_XAPIC_DISABLE_STATUS MSR
+						 * supported
+						 */
+
 #define MSR_IA32_FLUSH_CMD		0x0000010b
 #define L1D_FLUSH			BIT(0)	/*
 						 * Writeback and invalidate the
@@ -1054,4 +1059,12 @@
 #define MSR_IA32_HW_FEEDBACK_PTR        0x17d0
 #define MSR_IA32_HW_FEEDBACK_CONFIG     0x17d1
 
+/* x2APIC locked status */
+#define MSR_IA32_XAPIC_DISABLE_STATUS	0xBD
+#define LEGACY_XAPIC_DISABLED		BIT(0) /*
+						* x2APIC mode is locked and
+						* disabling x2APIC will cause
+						* a #GP
+						*/
+
 #endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6d303d1d276c..c6876d3ea4b1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,6 +61,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include <asm/irq_regs.h>
+#include <asm/cpu.h>
 
 unsigned int num_processors;
 
@@ -1751,11 +1752,26 @@ EXPORT_SYMBOL_GPL(x2apic_mode);
 
 enum {
 	X2APIC_OFF,
-	X2APIC_ON,
 	X2APIC_DISABLED,
+	/* All states below here have X2APIC enabled */
+	X2APIC_ON,
+	X2APIC_ON_LOCKED
 };
 static int x2apic_state;
 
+static bool x2apic_hw_locked(void)
+{
+	u64 ia32_cap;
+	u64 msr;
+
+	ia32_cap = x86_read_arch_cap_msr();
+	if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) {
+		rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
+		return (msr & LEGACY_XAPIC_DISABLED);
+	}
+	return false;
+}
+
 static void __x2apic_disable(void)
 {
 	u64 msr;
@@ -1793,6 +1809,10 @@ static int __init setup_nox2apic(char *str)
 				apicid);
 			return 0;
 		}
+		if (x2apic_hw_locked()) {
+			pr_warn("APIC locked in x2apic mode, can't disable\n");
+			return 0;
+		}
 		pr_warn("x2apic already enabled.\n");
 		__x2apic_disable();
 	}
@@ -1807,10 +1827,18 @@ early_param("nox2apic", setup_nox2apic);
 void x2apic_setup(void)
 {
 	/*
-	 * If x2apic is not in ON state, disable it if already enabled
+	 * Try to make the AP's APIC state match that of the BSP,  but if the
+	 * BSP is unlocked and the AP is locked then there is a state mismatch.
+	 * Warn about the mismatch in case a GP fault occurs due to a locked AP
+	 * trying to be turned off.
+	 */
+	if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked())
+		pr_warn("x2apic lock mismatch between BSP and AP.\n");
+	/*
+	 * If x2apic is not in ON or LOCKED state, disable it if already enabled
 	 * from BIOS.
 	 */
-	if (x2apic_state != X2APIC_ON) {
+	if (x2apic_state < X2APIC_ON) {
 		__x2apic_disable();
 		return;
 	}
@@ -1831,6 +1859,11 @@ static __init void x2apic_disable(void)
 	if (x2apic_id >= 255)
 		panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
 
+	if (x2apic_hw_locked()) {
+		pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id);
+		return;
+	}
+
 	__x2apic_disable();
 	register_lapic_address(mp_lapic_addr);
 }
@@ -1889,7 +1922,10 @@ void __init check_x2apic(void)
 	if (x2apic_enabled()) {
 		pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
 		x2apic_mode = 1;
-		x2apic_state = X2APIC_ON;
+		if (x2apic_hw_locked())
+			x2apic_state = X2APIC_ON_LOCKED;
+		else
+			x2apic_state = X2APIC_ON;
 	} else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
 		x2apic_state = X2APIC_DISABLED;
 	}
-- 
cgit v1.2.3


From 7987448ffc34f41590bae5b28cd23ca0b8c93291 Mon Sep 17 00:00:00 2001
From: Stephen Kitt <steve@sk2.org>
Date: Wed, 13 Jul 2022 18:08:40 +0200
Subject: x86/Kconfig: Specify idle=poll instead of no-hlt

Commit 27be45700021 ("x86 idle: remove 32-bit-only "no-hlt" parameter,
hlt_works_ok flag") removed no-hlt, but CONFIG_APM still refers to
it. Suggest "idle=poll" instead, based on the commit message:

> If a user wants to avoid HLT, then "idle=poll"
> is much more useful, as it avoids invocation of HLT
> in idle, while "no-hlt" failed to do so.

Signed-off-by: Stephen Kitt <steve@sk2.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220713160840.1577569-1-steve@sk2.org
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..83b163a7d870 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2569,7 +2569,7 @@ menuconfig APM
 
 	  1) make sure that you have enough swap space and that it is
 	  enabled.
-	  2) pass the "no-hlt" option to the kernel
+	  2) pass the "idle=poll" option to the kernel
 	  3) switch on floating point emulation in the kernel and pass
 	  the "no387" option to the kernel
 	  4) pass the "floppy=nodma" option to the kernel
-- 
cgit v1.2.3


From 278d933e12f163369d1f18e8e3e5c49d5e77f233 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Tue, 16 Aug 2022 12:04:34 -0700
Subject: Bluetooth: Normalize HCI_OP_READ_ENC_KEY_SIZE cmdcmplt

The HCI_OP_READ_ENC_KEY_SIZE command is converted from using the
deprecated hci_request mechanism to use hci_send_cmd, with an
accompanying hci_cc_read_enc_key_size to handle it's return response.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 92 +++++++++++++++++++++++------------------------
 1 file changed, 45 insertions(+), 47 deletions(-)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 1906822a061b..0ed944aaed94 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -712,6 +712,47 @@ static u8 hci_cc_read_local_version(struct hci_dev *hdev, void *data,
 	return rp->status;
 }
 
+static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data,
+				   struct sk_buff *skb)
+{
+	struct hci_rp_read_enc_key_size *rp = data;
+	struct hci_conn *conn;
+	u16 handle;
+	u8 status = rp->status;
+
+	bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+	handle = le16_to_cpu(rp->handle);
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!conn) {
+		status = 0xFF;
+		goto done;
+	}
+
+	/* While unexpected, the read_enc_key_size command may fail. The most
+	 * secure approach is to then assume the key size is 0 to force a
+	 * disconnection.
+	 */
+	if (status) {
+		bt_dev_err(hdev, "failed to read key size for handle %u",
+			   handle);
+		conn->enc_key_size = 0;
+	} else {
+		conn->enc_key_size = rp->key_size;
+		status = 0;
+	}
+
+	hci_encrypt_cfm(conn, 0);
+
+done:
+	hci_dev_unlock(hdev);
+
+	return status;
+}
+
 static u8 hci_cc_read_local_commands(struct hci_dev *hdev, void *data,
 				     struct sk_buff *skb)
 {
@@ -3534,47 +3575,6 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
-static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status,
-				       u16 opcode, struct sk_buff *skb)
-{
-	const struct hci_rp_read_enc_key_size *rp;
-	struct hci_conn *conn;
-	u16 handle;
-
-	BT_DBG("%s status 0x%02x", hdev->name, status);
-
-	if (!skb || skb->len < sizeof(*rp)) {
-		bt_dev_err(hdev, "invalid read key size response");
-		return;
-	}
-
-	rp = (void *)skb->data;
-	handle = le16_to_cpu(rp->handle);
-
-	hci_dev_lock(hdev);
-
-	conn = hci_conn_hash_lookup_handle(hdev, handle);
-	if (!conn)
-		goto unlock;
-
-	/* While unexpected, the read_enc_key_size command may fail. The most
-	 * secure approach is to then assume the key size is 0 to force a
-	 * disconnection.
-	 */
-	if (rp->status) {
-		bt_dev_err(hdev, "failed to read key size for handle %u",
-			   handle);
-		conn->enc_key_size = 0;
-	} else {
-		conn->enc_key_size = rp->key_size;
-	}
-
-	hci_encrypt_cfm(conn, 0);
-
-unlock:
-	hci_dev_unlock(hdev);
-}
-
 static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
 				   struct sk_buff *skb)
 {
@@ -3639,7 +3639,6 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
 	/* Try reading the encryption key size for encrypted ACL links */
 	if (!ev->status && ev->encrypt && conn->type == ACL_LINK) {
 		struct hci_cp_read_enc_key_size cp;
-		struct hci_request req;
 
 		/* Only send HCI_Read_Encryption_Key_Size if the
 		 * controller really supports it. If it doesn't, assume
@@ -3650,12 +3649,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
 			goto notify;
 		}
 
-		hci_req_init(&req, hdev);
-
 		cp.handle = cpu_to_le16(conn->handle);
-		hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp);
-
-		if (hci_req_run_skb(&req, read_enc_key_size_complete)) {
+		if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE,
+				 sizeof(cp), &cp)) {
 			bt_dev_err(hdev, "sending read key size failed");
 			conn->enc_key_size = HCI_LINK_KEY_SIZE;
 			goto notify;
@@ -4037,6 +4033,8 @@ static const struct hci_cc {
 	       sizeof(struct hci_rp_read_local_amp_info)),
 	HCI_CC(HCI_OP_READ_CLOCK, hci_cc_read_clock,
 	       sizeof(struct hci_rp_read_clock)),
+	HCI_CC(HCI_OP_READ_ENC_KEY_SIZE, hci_cc_read_enc_key_size,
+	       sizeof(struct hci_rp_read_enc_key_size)),
 	HCI_CC(HCI_OP_READ_INQ_RSP_TX_POWER, hci_cc_read_inq_rsp_tx_power,
 	       sizeof(struct hci_rp_read_inq_rsp_tx_power)),
 	HCI_CC(HCI_OP_READ_DEF_ERR_DATA_REPORTING,
-- 
cgit v1.2.3


From 1a942de092c0b96216864fedcb4d8822ce3fc12e Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Tue, 16 Aug 2022 09:41:20 -0700
Subject: Bluetooth: Move hci_abort_conn to hci_conn.c

hci_abort_conn() is a wrapper around a number of DISCONNECT and
CREATE_CONN_CANCEL commands that was being invoked from hci_request
request queues, which are now deprecated. There are two versions:
hci_abort_conn() which can be invoked from the hci_event thread, and
hci_abort_conn_sync() which can be invoked within a hci_sync cmd chain.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_conn.c         | 76 ++++++++++++++++++++++++++++++++
 net/bluetooth/hci_request.c      | 93 ----------------------------------------
 net/bluetooth/hci_request.h      |  1 -
 net/bluetooth/mgmt.c             | 15 ++++++-
 5 files changed, 91 insertions(+), 95 deletions(-)

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index e7862903187d..932153e68864 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -2075,6 +2075,7 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);
 void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
 				  bdaddr_t *bdaddr, u8 addr_type);
 
+int hci_abort_conn(struct hci_conn *conn, u8 reason);
 u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
 		      u16 to_multiplier);
 void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 337e74d0f8b1..7a59c4487050 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -2760,3 +2760,79 @@ u32 hci_conn_get_phy(struct hci_conn *conn)
 
 	return phys;
 }
+
+int hci_abort_conn(struct hci_conn *conn, u8 reason)
+{
+	int r = 0;
+
+	switch (conn->state) {
+	case BT_CONNECTED:
+	case BT_CONFIG:
+		if (conn->type == AMP_LINK) {
+			struct hci_cp_disconn_phy_link cp;
+
+			cp.phy_handle = HCI_PHY_HANDLE(conn->handle);
+			cp.reason = reason;
+			r = hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK,
+					 sizeof(cp), &cp);
+		} else {
+			struct hci_cp_disconnect dc;
+
+			dc.handle = cpu_to_le16(conn->handle);
+			dc.reason = reason;
+			r = hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT,
+					 sizeof(dc), &dc);
+		}
+
+		conn->state = BT_DISCONN;
+
+		break;
+	case BT_CONNECT:
+		if (conn->type == LE_LINK) {
+			if (test_bit(HCI_CONN_SCANNING, &conn->flags))
+				break;
+			r = hci_send_cmd(conn->hdev,
+					 HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL);
+		} else if (conn->type == ACL_LINK) {
+			if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2)
+				break;
+			r = hci_send_cmd(conn->hdev,
+					 HCI_OP_CREATE_CONN_CANCEL,
+					 6, &conn->dst);
+		}
+		break;
+	case BT_CONNECT2:
+		if (conn->type == ACL_LINK) {
+			struct hci_cp_reject_conn_req rej;
+
+			bacpy(&rej.bdaddr, &conn->dst);
+			rej.reason = reason;
+
+			r = hci_send_cmd(conn->hdev,
+					 HCI_OP_REJECT_CONN_REQ,
+					 sizeof(rej), &rej);
+		} else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
+			struct hci_cp_reject_sync_conn_req rej;
+
+			bacpy(&rej.bdaddr, &conn->dst);
+
+			/* SCO rejection has its own limited set of
+			 * allowed error values (0x0D-0x0F) which isn't
+			 * compatible with most values passed to this
+			 * function. To be safe hard-code one of the
+			 * values that's suitable for SCO.
+			 */
+			rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
+
+			r = hci_send_cmd(conn->hdev,
+					 HCI_OP_REJECT_SYNC_CONN_REQ,
+					 sizeof(rej), &rej);
+		}
+		break;
+	default:
+		conn->state = BT_CLOSED;
+		break;
+	}
+
+	return r;
+}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 2e19a271d7a1..5a0296a4352e 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -909,99 +909,6 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
 	hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
 }
 
-static void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
-			     u8 reason)
-{
-	switch (conn->state) {
-	case BT_CONNECTED:
-	case BT_CONFIG:
-		if (conn->type == AMP_LINK) {
-			struct hci_cp_disconn_phy_link cp;
-
-			cp.phy_handle = HCI_PHY_HANDLE(conn->handle);
-			cp.reason = reason;
-			hci_req_add(req, HCI_OP_DISCONN_PHY_LINK, sizeof(cp),
-				    &cp);
-		} else {
-			struct hci_cp_disconnect dc;
-
-			dc.handle = cpu_to_le16(conn->handle);
-			dc.reason = reason;
-			hci_req_add(req, HCI_OP_DISCONNECT, sizeof(dc), &dc);
-		}
-
-		conn->state = BT_DISCONN;
-
-		break;
-	case BT_CONNECT:
-		if (conn->type == LE_LINK) {
-			if (test_bit(HCI_CONN_SCANNING, &conn->flags))
-				break;
-			hci_req_add(req, HCI_OP_LE_CREATE_CONN_CANCEL,
-				    0, NULL);
-		} else if (conn->type == ACL_LINK) {
-			if (req->hdev->hci_ver < BLUETOOTH_VER_1_2)
-				break;
-			hci_req_add(req, HCI_OP_CREATE_CONN_CANCEL,
-				    6, &conn->dst);
-		}
-		break;
-	case BT_CONNECT2:
-		if (conn->type == ACL_LINK) {
-			struct hci_cp_reject_conn_req rej;
-
-			bacpy(&rej.bdaddr, &conn->dst);
-			rej.reason = reason;
-
-			hci_req_add(req, HCI_OP_REJECT_CONN_REQ,
-				    sizeof(rej), &rej);
-		} else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
-			struct hci_cp_reject_sync_conn_req rej;
-
-			bacpy(&rej.bdaddr, &conn->dst);
-
-			/* SCO rejection has its own limited set of
-			 * allowed error values (0x0D-0x0F) which isn't
-			 * compatible with most values passed to this
-			 * function. To be safe hard-code one of the
-			 * values that's suitable for SCO.
-			 */
-			rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
-
-			hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
-				    sizeof(rej), &rej);
-		}
-		break;
-	default:
-		conn->state = BT_CLOSED;
-		break;
-	}
-}
-
-static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
-	if (status)
-		bt_dev_dbg(hdev, "Failed to abort connection: status 0x%2.2x", status);
-}
-
-int hci_abort_conn(struct hci_conn *conn, u8 reason)
-{
-	struct hci_request req;
-	int err;
-
-	hci_req_init(&req, conn->hdev);
-
-	__hci_abort_conn(&req, conn, reason);
-
-	err = hci_req_run(&req, abort_conn_complete);
-	if (err && err != -ENODATA) {
-		bt_dev_err(conn->hdev, "failed to run HCI request: err %d", err);
-		return err;
-	}
-
-	return 0;
-}
-
 void hci_request_setup(struct hci_dev *hdev)
 {
 	INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 7e1de871fca4..b9c5a9823837 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -73,6 +73,5 @@ void hci_req_add_le_passive_scan(struct hci_request *req);
 
 void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next);
 
-int hci_abort_conn(struct hci_conn *conn, u8 reason);
 void hci_request_setup(struct hci_dev *hdev);
 void hci_request_cancel_all(struct hci_dev *hdev);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 77e7aa63c0c0..8d70f4a709d4 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -3185,6 +3185,18 @@ unlock:
 	return err;
 }
 
+static int abort_conn_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_conn *conn;
+	u16 handle = PTR_ERR(data);
+
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!conn)
+		return 0;
+
+	return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM);
+}
+
 static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
 			      u16 len)
 {
@@ -3235,7 +3247,8 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
 					      le_addr_type(addr->type));
 
 	if (conn->conn_reason == CONN_REASON_PAIR_DEVICE)
-		hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+		hci_cmd_sync_queue(hdev, abort_conn_sync, ERR_PTR(conn->handle),
+				   NULL);
 
 unlock:
 	hci_dev_unlock(hdev);
-- 
cgit v1.2.3


From c4ba5800217b9efa710b27fbe638930830a95d19 Mon Sep 17 00:00:00 2001
From: Max Chou <max.chou@realtek.com>
Date: Mon, 29 Aug 2022 19:45:07 +0800
Subject: Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3592

Add the support ID(0x13D3, 0x3592) to usb_device_id table for
Realtek RTL8852C.

The device info from /sys/kernel/debug/usb/devices as below.

T:  Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#=  2 Spd=12   MxCh= 0
D:  Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3592 Rev= 0.00
S:  Manufacturer=Realtek
S:  Product=Bluetooth Radio
S:  SerialNumber=00e04c000001
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Max Chou <max.chou@realtek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 30dd443f395f..7dd46b4183f5 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -438,6 +438,8 @@ static const struct usb_device_id blacklist_table[] = {
 						     BTUSB_WIDEBAND_SPEECH },
 	{ USB_DEVICE(0x13d3, 0x3586), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
+	{ USB_DEVICE(0x13d3, 0x3592), .driver_info = BTUSB_REALTEK |
+						     BTUSB_WIDEBAND_SPEECH },
 
 	/* Realtek Bluetooth devices */
 	{ USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01),
-- 
cgit v1.2.3


From d8daa3991652a5713061f23276dc91fb846dc0c0 Mon Sep 17 00:00:00 2001
From: Szabolcs Sipos <labuwx@balfug.com>
Date: Fri, 26 Aug 2022 00:42:07 +0200
Subject: Bluetooth: btusb: RTL8761BUV consistent naming

Based on photos available from the FCC, all currently supported RTL8761B
series USB dongles have the same chip: RTL8761BUV.

rtl8761bu is often used to refer to this chip.
rtl8761b sometimes refers to this chip, and other times to its
UART variant (RTL8761BTV).

+----------------+---------+---------+---------------+
|     Dongle     | USB VID | USB PID |    FCC ID     |
+----------------+---------+---------+---------------+
| ASUS USB-BT500 | 0x0b05  | 0x190e  | MSQ-USBBTJB00 |
| TP-Link UB500  | 0x2357  | 0x0604  | 2AXJ4UB500    |
| EDUP EP-B3519  |         |         | 2AHRD-EPB3519 |
| EDUP EP-B3536  |         |         | 2AHRDEP-B3536 |
| UGREEN CM390   |         |         | 2AQI5-CM390   |
+----------------+---------+---------+---------------+

Signed-off-by: Szabolcs Sipos <labuwx@balfug.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 7dd46b4183f5..007d8533c4bc 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -521,17 +521,13 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x0bda, 0xb009), .driver_info = BTUSB_REALTEK },
 	{ USB_DEVICE(0x2ff8, 0xb011), .driver_info = BTUSB_REALTEK },
 
-	/* Additional Realtek 8761B Bluetooth devices */
+	/* Additional Realtek 8761BUV Bluetooth devices */
 	{ USB_DEVICE(0x2357, 0x0604), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
-
-	/* Additional Realtek 8761BU Bluetooth devices */
 	{ USB_DEVICE(0x0b05, 0x190e), .driver_info = BTUSB_REALTEK |
 	  					     BTUSB_WIDEBAND_SPEECH },
 	{ USB_DEVICE(0x2550, 0x8761), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
-
-	/* Additional Realtek 8761BUV Bluetooth devices */
 	{ USB_DEVICE(0x0bda, 0x8771), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
 
-- 
cgit v1.2.3


From c7577014b74c5369490715015db096182f1a2a23 Mon Sep 17 00:00:00 2001
From: Szabolcs Sipos <labuwx@balfug.com>
Date: Fri, 26 Aug 2022 00:42:08 +0200
Subject: Bluetooth: btusb: Add RTL8761BUV device (Edimax BT-8500)

T:  Bus=01 Lev=01 Prnt=01 Port=01 Cnt=01 Dev#=  2 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=7392 ProdID=c611 Rev= 2.00
S:  Manufacturer=Realtek
S:  Product=Edimax Bluetooth Adapter
S:  SerialNumber=XXXXXXXXXXXX
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Szabolcs Sipos <labuwx@balfug.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 007d8533c4bc..87d0a0ac5cf0 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -530,6 +530,8 @@ static const struct usb_device_id blacklist_table[] = {
 						     BTUSB_WIDEBAND_SPEECH },
 	{ USB_DEVICE(0x0bda, 0x8771), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
+	{ USB_DEVICE(0x7392, 0xc611), .driver_info = BTUSB_REALTEK |
+						     BTUSB_WIDEBAND_SPEECH },
 
 	/* Additional Realtek 8821AE Bluetooth devices */
 	{ USB_DEVICE(0x0b05, 0x17dc), .driver_info = BTUSB_REALTEK },
-- 
cgit v1.2.3


From a0476f6a2cac012c0aee4dc981a53e1414cea069 Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Tue, 16 Aug 2022 16:35:22 -0500
Subject: Bluetooth: btusb: Add BT device 0cb8:c549 from RTW8852AE to tables

A new Bluetooth device for the Realtek RTW8852AE has been noted.

This device has the following lsusb output:

Bus 001 Device 003: ID 0cb8:c549 Opticis Co., Ltd Bluetooth Radio

The pertinent part of /sys/kernel/debug/usb/devices is as follows:

T: Bus=01 Lev=01 Prnt=01 Port=03 Cnt=02 Dev#= 3 Spd=12 MxCh= 0
D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1
P: Vendor=0cb8 ProdID=c549 Rev= 0.00
S: Manufacturer=Realtek
S: Product=Bluetooth Radio
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms
E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms
E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms
E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms
I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms
E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms
I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms
E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms
I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms
E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms
I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms
E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms
I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms
E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms

Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 87d0a0ac5cf0..7e87139fc991 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -426,6 +426,8 @@ static const struct usb_device_id blacklist_table[] = {
 						     BTUSB_WIDEBAND_SPEECH },
 	{ USB_DEVICE(0x04ca, 0x4006), .driver_info = BTUSB_REALTEK |
 						     BTUSB_WIDEBAND_SPEECH },
+	{ USB_DEVICE(0x0cb8, 0xc549), .driver_info = BTUSB_REALTEK |
+						     BTUSB_WIDEBAND_SPEECH },
 
 	/* Realtek 8852CE Bluetooth devices */
 	{ USB_DEVICE(0x04ca, 0x4007), .driver_info = BTUSB_REALTEK |
-- 
cgit v1.2.3


From 6726d552a6912e88cf63fe2bda87b2efa0efc7d0 Mon Sep 17 00:00:00 2001
From: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Date: Fri, 17 Jun 2022 13:22:54 +0100
Subject: clk: ingenic-tcu: Properly enable registers before accessing timers

Access to registers is guarded by ingenic_tcu_{enable,disable}_regs()
so the stop bit can be cleared before accessing a timer channel, but
those functions did not clear the stop bit on SoCs with a global TCU
clock gate.

Testing on the X1000 has revealed that the stop bits must be cleared
_and_ the global TCU clock must be ungated to access timer registers.
This appears to be the norm on Ingenic SoCs, and is specified in the
documentation for the X1000 and numerous JZ47xx SoCs.

If the stop bit isn't cleared, register writes don't take effect and
the system can be left in a broken state, eg. the watchdog timer may
not run.

The bug probably went unnoticed because stop bits are zeroed when
the SoC is reset, and the kernel does not set them unless a timer
gets disabled at runtime. However, it is possible that a bootloader
or a previous kernel (if using kexec) leaves the stop bits set and
we should not rely on them being cleared.

Fixing this is easy: have ingenic_tcu_{enable,disable}_regs() always
clear the stop bit, regardless of the presence of a global TCU gate.

Reviewed-by: Paul Cercueil <paul@crapouillou.net>
Tested-by: Paul Cercueil <paul@crapouillou.net>
Fixes: 4f89e4b8f121 ("clk: ingenic: Add driver for the TCU clocks")
Cc: stable@vger.kernel.org
Signed-off-by: Aidan MacDonald <aidanmacdonald.0x0@gmail.com>
Link: https://lore.kernel.org/r/20220617122254.738900-1-aidanmacdonald.0x0@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/ingenic/tcu.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/clk/ingenic/tcu.c b/drivers/clk/ingenic/tcu.c
index 201bf6e6b6e0..d5544cbc5c48 100644
--- a/drivers/clk/ingenic/tcu.c
+++ b/drivers/clk/ingenic/tcu.c
@@ -101,15 +101,11 @@ static bool ingenic_tcu_enable_regs(struct clk_hw *hw)
 	bool enabled = false;
 
 	/*
-	 * If the SoC has no global TCU clock, we must ungate the channel's
-	 * clock to be able to access its registers.
-	 * If we have a TCU clock, it will be enabled automatically as it has
-	 * been attached to the regmap.
+	 * According to the programming manual, a timer channel's registers can
+	 * only be accessed when the channel's stop bit is clear.
 	 */
-	if (!tcu->clk) {
-		enabled = !!ingenic_tcu_is_enabled(hw);
-		regmap_write(tcu->map, TCU_REG_TSCR, BIT(info->gate_bit));
-	}
+	enabled = !!ingenic_tcu_is_enabled(hw);
+	regmap_write(tcu->map, TCU_REG_TSCR, BIT(info->gate_bit));
 
 	return enabled;
 }
@@ -120,8 +116,7 @@ static void ingenic_tcu_disable_regs(struct clk_hw *hw)
 	const struct ingenic_tcu_clk_info *info = tcu_clk->info;
 	struct ingenic_tcu *tcu = tcu_clk->tcu;
 
-	if (!tcu->clk)
-		regmap_write(tcu->map, TCU_REG_TSSR, BIT(info->gate_bit));
+	regmap_write(tcu->map, TCU_REG_TSSR, BIT(info->gate_bit));
 }
 
 static u8 ingenic_tcu_get_parent(struct clk_hw *hw)
-- 
cgit v1.2.3


From 9a1043d43a9ab75d28b8ec54512ea13ec33bc910 Mon Sep 17 00:00:00 2001
From: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Date: Mon, 22 Aug 2022 22:07:22 +0300
Subject: EDAC/mc: Replace spaces with tabs in memtype flags definition

Currently, the memory type macros are partly defined with multiple
spaces between the macro name and its definition. Replace the spaces
with tabs as the kernel coding style requires.

Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220822190730.27277-13-Sergey.Semin@baikalelectronics.ru
---
 include/linux/edac.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/linux/edac.h b/include/linux/edac.h
index e730b3468719..fa4bda2a70f6 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -231,21 +231,21 @@ enum mem_type {
 #define MEM_FLAG_DDR		BIT(MEM_DDR)
 #define MEM_FLAG_RDDR		BIT(MEM_RDDR)
 #define MEM_FLAG_RMBS		BIT(MEM_RMBS)
-#define MEM_FLAG_DDR2           BIT(MEM_DDR2)
-#define MEM_FLAG_FB_DDR2        BIT(MEM_FB_DDR2)
-#define MEM_FLAG_RDDR2          BIT(MEM_RDDR2)
-#define MEM_FLAG_XDR            BIT(MEM_XDR)
-#define MEM_FLAG_DDR3           BIT(MEM_DDR3)
-#define MEM_FLAG_RDDR3          BIT(MEM_RDDR3)
-#define MEM_FLAG_LPDDR3         BIT(MEM_LPDDR3)
-#define MEM_FLAG_DDR4           BIT(MEM_DDR4)
-#define MEM_FLAG_RDDR4          BIT(MEM_RDDR4)
-#define MEM_FLAG_LRDDR4         BIT(MEM_LRDDR4)
-#define MEM_FLAG_LPDDR4         BIT(MEM_LPDDR4)
-#define MEM_FLAG_DDR5           BIT(MEM_DDR5)
-#define MEM_FLAG_RDDR5          BIT(MEM_RDDR5)
-#define MEM_FLAG_LRDDR5         BIT(MEM_LRDDR5)
-#define MEM_FLAG_NVDIMM         BIT(MEM_NVDIMM)
+#define MEM_FLAG_DDR2		BIT(MEM_DDR2)
+#define MEM_FLAG_FB_DDR2	BIT(MEM_FB_DDR2)
+#define MEM_FLAG_RDDR2		BIT(MEM_RDDR2)
+#define MEM_FLAG_XDR		BIT(MEM_XDR)
+#define MEM_FLAG_DDR3		BIT(MEM_DDR3)
+#define MEM_FLAG_RDDR3		BIT(MEM_RDDR3)
+#define MEM_FLAG_LPDDR3		BIT(MEM_LPDDR3)
+#define MEM_FLAG_DDR4		BIT(MEM_DDR4)
+#define MEM_FLAG_RDDR4		BIT(MEM_RDDR4)
+#define MEM_FLAG_LRDDR4		BIT(MEM_LRDDR4)
+#define MEM_FLAG_LPDDR4		BIT(MEM_LPDDR4)
+#define MEM_FLAG_DDR5		BIT(MEM_DDR5)
+#define MEM_FLAG_RDDR5		BIT(MEM_RDDR5)
+#define MEM_FLAG_LRDDR5		BIT(MEM_LRDDR5)
+#define MEM_FLAG_NVDIMM		BIT(MEM_NVDIMM)
 #define MEM_FLAG_WIO2		BIT(MEM_WIO2)
 #define MEM_FLAG_HBM2		BIT(MEM_HBM2)
 
-- 
cgit v1.2.3


From ddf958f39741a1dda74020ac8a0f226cb5aa017b Mon Sep 17 00:00:00 2001
From: ye xingchen <ye.xingchen@zte.com.cn>
Date: Wed, 31 Aug 2022 03:32:13 +0000
Subject: cpufreq: tegra194: Remove the unneeded result variable

Return the value returned by smp_call_function_single() directly instead
of storing it in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn>
[ Viresh: Minor update to commit log ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/tegra194-cpufreq.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c
index 1216046cf4c2..7e143c06972e 100644
--- a/drivers/cpufreq/tegra194-cpufreq.c
+++ b/drivers/cpufreq/tegra194-cpufreq.c
@@ -314,11 +314,7 @@ static void tegra194_get_cpu_ndiv_sysreg(void *ndiv)
 
 static int tegra194_get_cpu_ndiv(u32 cpu, u32 cpuid, u32 clusterid, u64 *ndiv)
 {
-	int ret;
-
-	ret = smp_call_function_single(cpu, tegra194_get_cpu_ndiv_sysreg, &ndiv, true);
-
-	return ret;
+	return smp_call_function_single(cpu, tegra194_get_cpu_ndiv_sysreg, &ndiv, true);
 }
 
 static void tegra194_set_cpu_ndiv_sysreg(void *data)
-- 
cgit v1.2.3


From 93df19476535a4ed871bc5eae719b6d67cf3abc3 Mon Sep 17 00:00:00 2001
From: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Date: Mon, 22 Aug 2022 22:07:23 +0300
Subject: EDAC/mc: Drop duplicated dimm->nr_pages debug printout

The duplicated edac_dbg()-based dimm->nr_pages print was introduced in

  6e84d359b2be ("edac_mc: Cleanup per-dimm_info debug messages").

The duplicated line can be found even in the commit message text:

  [ 1011.380101] EDAC DEBUG: edac_mc_dump_dimm:   dimm->nr_pages = 0x40000
  [ 1011.380103] EDAC DEBUG: edac_mc_dump_dimm:   dimm->grain = 8
  [ 1011.380104] EDAC DEBUG: edac_mc_dump_dimm:   dimm->nr_pages = 0x40000

Drop the second edac_dbg() call.

  [ bp: Massage commit message. ]

Signed-off-by: Serge Semin <Sergey.Semin@baikalelectronics.ru>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220822190730.27277-14-Sergey.Semin@baikalelectronics.ru
---
 drivers/edac/edac_mc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index eb58644bb019..6faeb2ab3960 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -103,7 +103,6 @@ static void edac_mc_dump_dimm(struct dimm_info *dimm)
 	edac_dbg(4, "  dimm->label = '%s'\n", dimm->label);
 	edac_dbg(4, "  dimm->nr_pages = 0x%x\n", dimm->nr_pages);
 	edac_dbg(4, "  dimm->grain = %d\n", dimm->grain);
-	edac_dbg(4, "  dimm->nr_pages = 0x%x\n", dimm->nr_pages);
 }
 
 static void edac_mc_dump_csrow(struct csrow_info *csrow)
-- 
cgit v1.2.3


From 8a04d2fc700f717104bfb95b0f6694e448a4537f Mon Sep 17 00:00:00 2001
From: Khalid Masum <khalid.masum.92@gmail.com>
Date: Thu, 1 Sep 2022 13:12:10 +0600
Subject: xfrm: Update ipcomp_scratches with NULL when freed

Currently if ipcomp_alloc_scratches() fails to allocate memory
ipcomp_scratches holds obsolete address. So when we try to free the
percpu scratches using ipcomp_free_scratches() it tries to vfree non
existent vm area. Described below:

static void * __percpu *ipcomp_alloc_scratches(void)
{
        ...
        scratches = alloc_percpu(void *);
        if (!scratches)
                return NULL;
ipcomp_scratches does not know about this allocation failure.
Therefore holding the old obsolete address.
        ...
}

So when we free,

static void ipcomp_free_scratches(void)
{
        ...
        scratches = ipcomp_scratches;
Assigning obsolete address from ipcomp_scratches

        if (!scratches)
                return;

        for_each_possible_cpu(i)
               vfree(*per_cpu_ptr(scratches, i));
Trying to free non existent page, causing warning: trying to vfree
existent vm area.
        ...
}

Fix this breakage by updating ipcomp_scrtches with NULL when scratches
is freed

Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Reported-by: syzbot+5ec9bb042ddfe9644773@syzkaller.appspotmail.com
Tested-by: syzbot+5ec9bb042ddfe9644773@syzkaller.appspotmail.com
Signed-off-by: Khalid Masum <khalid.masum.92@gmail.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_ipcomp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index cb40ff0ff28d..92ad336a83ab 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -203,6 +203,7 @@ static void ipcomp_free_scratches(void)
 		vfree(*per_cpu_ptr(scratches, i));
 
 	free_percpu(scratches);
+	ipcomp_scratches = NULL;
 }
 
 static void * __percpu *ipcomp_alloc_scratches(void)
-- 
cgit v1.2.3


From 24919fdea6f8b31d7cdf32ac291bc5dd0b023878 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Wed, 31 Aug 2022 07:27:02 -0700
Subject: perf/x86/intel: Fix unchecked MSR access error for Alder Lake N

For some Alder Lake N machine, the below unchecked MSR access error may be
triggered.

[ 0.088017] rcu: Hierarchical SRCU implementation.
[ 0.088017] unchecked MSR access error: WRMSR to 0x38f (tried to write
0x0001000f0000003f) at rIP: 0xffffffffb5684de8 (native_write_msr+0x8/0x30)
[ 0.088017] Call Trace:
[ 0.088017] <TASK>
[ 0.088017] __intel_pmu_enable_all.constprop.46+0x4a/0xa0

The Alder Lake N only has e-cores. The X86_FEATURE_HYBRID_CPU flag is
not set. The perf cannot retrieve the correct CPU type via
get_this_hybrid_cpu_type(). The model specific get_hybrid_cpu_type() is
hardcode to p-core. The wrong CPU type is given to the PMU of the
Alder Lake N.

Since Alder Lake N isn't in fact a hybrid CPU, remove ALDERLAKE_N from
the rest of {ALDER,RAPTOP}LAKE and create a non-hybrid PMU setup.

The differences between Gracemont and the previous Tremont are,
- Number of GP counters
- Load and store latency Events
- PEBS event_constraints
- Instruction Latency support
- Data source encoding
- Memory access latency encoding

Fixes: c2a960f7c574 ("perf/x86: Add new Alder Lake and Raptor Lake support")
Reported-by: Jianfeng Gao <jianfeng.gao@intel.com>
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220831142702.153110-1-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c | 40 +++++++++++++++++++++++++++++++++++++++-
 arch/x86/events/intel/ds.c   |  9 +++++++--
 arch/x86/events/perf_event.h |  2 ++
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index cb98a05ee743..e8eb9365d0b9 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2102,6 +2102,15 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };
 
+EVENT_ATTR_STR(mem-loads,	mem_ld_grt,	"event=0xd0,umask=0x5,ldlat=3");
+EVENT_ATTR_STR(mem-stores,	mem_st_grt,	"event=0xd0,umask=0x6");
+
+static struct attribute *grt_mem_attrs[] = {
+	EVENT_PTR(mem_ld_grt),
+	EVENT_PTR(mem_st_grt),
+	NULL
+};
+
 static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
@@ -5974,6 +5983,36 @@ __init int intel_pmu_init(void)
 		name = "Tremont";
 		break;
 
+	case INTEL_FAM6_ALDERLAKE_N:
+		x86_pmu.mid_ack = true;
+		memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
+		       sizeof(hw_cache_extra_regs));
+		hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+
+		x86_pmu.event_constraints = intel_slm_event_constraints;
+		x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_grt_extra_regs;
+
+		x86_pmu.pebs_aliases = NULL;
+		x86_pmu.pebs_prec_dist = true;
+		x86_pmu.pebs_block = true;
+		x86_pmu.lbr_pt_coexist = true;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+
+		intel_pmu_pebs_data_source_grt();
+		x86_pmu.pebs_latency_data = adl_latency_data_small;
+		x86_pmu.get_event_constraints = tnt_get_event_constraints;
+		x86_pmu.limit_period = spr_limit_period;
+		td_attr = tnt_events_attrs;
+		mem_attr = grt_mem_attrs;
+		extra_attr = nhm_format_attr;
+		pr_cont("Gracemont events, ");
+		name = "gracemont";
+		break;
+
 	case INTEL_FAM6_WESTMERE:
 	case INTEL_FAM6_WESTMERE_EP:
 	case INTEL_FAM6_WESTMERE_EX:
@@ -6316,7 +6355,6 @@ __init int intel_pmu_init(void)
 
 	case INTEL_FAM6_ALDERLAKE:
 	case INTEL_FAM6_ALDERLAKE_L:
-	case INTEL_FAM6_ALDERLAKE_N:
 	case INTEL_FAM6_RAPTORLAKE:
 	case INTEL_FAM6_RAPTORLAKE_P:
 		/*
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index de1f55d51784..ac973c6f82ad 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -110,13 +110,18 @@ void __init intel_pmu_pebs_data_source_skl(bool pmem)
 	__intel_pmu_pebs_data_source_skl(pmem, pebs_data_source);
 }
 
-static void __init intel_pmu_pebs_data_source_grt(u64 *data_source)
+static void __init __intel_pmu_pebs_data_source_grt(u64 *data_source)
 {
 	data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
 	data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
 	data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
 }
 
+void __init intel_pmu_pebs_data_source_grt(void)
+{
+	__intel_pmu_pebs_data_source_grt(pebs_data_source);
+}
+
 void __init intel_pmu_pebs_data_source_adl(void)
 {
 	u64 *data_source;
@@ -127,7 +132,7 @@ void __init intel_pmu_pebs_data_source_adl(void)
 
 	data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
 	memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
-	intel_pmu_pebs_data_source_grt(data_source);
+	__intel_pmu_pebs_data_source_grt(data_source);
 }
 
 static u64 precise_store_data(u64 status)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ba3d24a6a4ec..266143abcbd8 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1516,6 +1516,8 @@ void intel_pmu_pebs_data_source_skl(bool pmem);
 
 void intel_pmu_pebs_data_source_adl(void);
 
+void intel_pmu_pebs_data_source_grt(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
-- 
cgit v1.2.3


From f2aeea57504cbbc58da3c59b939fc16150087648 Mon Sep 17 00:00:00 2001
From: Like Xu <likexu@tencent.com>
Date: Wed, 31 Aug 2022 11:35:24 +0800
Subject: perf/x86/core: Completely disable guest PEBS via guest's global_ctrl

When a guest PEBS counter is cross-mapped by a host counter, software
will remove the corresponding bit in the arr[global_ctrl].guest and
expect hardware to perform a change of state "from enable to disable"
via the msr_slot[] switch during the vmx transaction.

The real world is that if user adjust the counter overflow value small
enough, it still opens a tiny race window for the previously PEBS-enabled
counter to write cross-mapped PEBS records into the guest's PEBS buffer,
when arr[global_ctrl].guest has been prioritised (switch_msr_special stuff)
to switch into the enabled state, while the arr[pebs_enable].guest has not.

Close this window by clearing invalid bits in the arr[global_ctrl].guest.

Fixes: 854250329c02 ("KVM: x86/pmu: Disable guest PEBS temporarily in two rare situations")
Signed-off-by: Like Xu <likexu@tencent.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220831033524.58561-1-likexu@tencent.com
---
 arch/x86/events/intel/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e8eb9365d0b9..c20d8cd47c48 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4061,8 +4061,9 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data)
 		/* Disable guest PEBS if host PEBS is enabled. */
 		arr[pebs_enable].guest = 0;
 	} else {
-		/* Disable guest PEBS for cross-mapped PEBS counters. */
+		/* Disable guest PEBS thoroughly for cross-mapped PEBS counters. */
 		arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask;
+		arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask;
 		/* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */
 		arr[global_ctrl].guest |= arr[pebs_enable].guest;
 	}
-- 
cgit v1.2.3


From 48297a22a39adcde7a3ba52b913c5aaa9a990364 Mon Sep 17 00:00:00 2001
From: Zqiang <qiang1.zhang@intel.com>
Date: Sun, 31 Jul 2022 18:53:56 +0800
Subject: rcutorture: Use the barrier operation specified by cur_ops

The rcutorture_oom_notify() function unconditionally invokes
rcu_barrier(), which is OK when the rcutorture.torture_type value is
"rcu", but unhelpful otherwise.  The purpose of these barrier calls is to
wait for all outstanding callback-flooding callbacks to be invoked before
cleaning up their data.  Using the wrong barrier function therefore
risks arbitrary memory corruption.  Thus, this commit changes these
rcu_barrier() calls into cur_ops->cb_barrier() to make things work when
torturing non-vanilla flavors of RCU.

Signed-off-by: Zqiang <qiang1.zhang@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d8e1b270a065..08b7b59d5d05 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2600,12 +2600,12 @@ static int rcutorture_oom_notify(struct notifier_block *self,
 	for (i = 0; i < fwd_progress; i++)
 		ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
 	pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
-	rcu_barrier();
+	cur_ops->cb_barrier();
 	ncbs = 0;
 	for (i = 0; i < fwd_progress; i++)
 		ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
 	pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs);
-	rcu_barrier();
+	cur_ops->cb_barrier();
 	ncbs = 0;
 	for (i = 0; i < fwd_progress; i++)
 		ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]);
-- 
cgit v1.2.3


From fd3f106677bac70437dc12e76c827294ed495a44 Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Thu, 11 Aug 2022 08:49:07 +0800
Subject: Bluetooth: btusb: mediatek: fix WMT failure during runtime suspend

WMT cmd/event doesn't follow up the generic HCI cmd/event handling, it
needs constantly polling control pipe until the host received the WMT
event, thus, we should require to specifically acquire PM counter on the
USB to prevent the interface from entering auto suspended while WMT
cmd/event in progress.

Fixes: a1c49c434e15 ("Bluetooth: btusb: Add protocol support for MediaTek MT7668U USB devices")
Co-developed-by: Jing Cai <jing.cai@mediatek.com>
Signed-off-by: Jing Cai <jing.cai@mediatek.com>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 7e87139fc991..ef7726a3e701 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -2482,15 +2482,29 @@ static int btusb_mtk_hci_wmt_sync(struct hci_dev *hdev,
 
 	set_bit(BTUSB_TX_WAIT_VND_EVT, &data->flags);
 
+	/* WMT cmd/event doesn't follow up the generic HCI cmd/event handling,
+	 * it needs constantly polling control pipe until the host received the
+	 * WMT event, thus, we should require to specifically acquire PM counter
+	 * on the USB to prevent the interface from entering auto suspended
+	 * while WMT cmd/event in progress.
+	 */
+	err = usb_autopm_get_interface(data->intf);
+	if (err < 0)
+		goto err_free_wc;
+
 	err = __hci_cmd_send(hdev, 0xfc6f, hlen, wc);
 
 	if (err < 0) {
 		clear_bit(BTUSB_TX_WAIT_VND_EVT, &data->flags);
+		usb_autopm_put_interface(data->intf);
 		goto err_free_wc;
 	}
 
 	/* Submit control IN URB on demand to process the WMT event */
 	err = btusb_mtk_submit_wmt_recv_urb(hdev);
+
+	usb_autopm_put_interface(data->intf);
+
 	if (err < 0)
 		goto err_free_wc;
 
-- 
cgit v1.2.3


From 7fce8d6eccbc31a561d07c79f359ad09f0424347 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Mon, 29 Aug 2022 18:10:30 +0000
Subject: x86/microcode: Print previous version of microcode after reload

Print both old and new versions of microcode after a reload is complete
because knowing the previous microcode version is sometimes important
from a debugging perspective.

  [ bp: Massage commit message. ]

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/r/20220829181030.722891-1-ashok.raj@intel.com
---
 arch/x86/kernel/cpu/microcode/core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ad57e0e4d674..6a41cee242f6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -491,7 +491,7 @@ wait_for_siblings:
  */
 static int microcode_reload_late(void)
 {
-	int ret;
+	int old = boot_cpu_data.microcode, ret;
 
 	pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
 	pr_err("You should switch to early loading, if possible.\n");
@@ -503,7 +503,8 @@ static int microcode_reload_late(void)
 	if (ret == 0)
 		microcode_check();
 
-	pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
+	pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n",
+		old, boot_cpu_data.microcode);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 96f1c52992e5a91207aabf6cf8d1b0f3e6479a97 Mon Sep 17 00:00:00 2001
From: Jilin Yuan <yuanjilin@cdjrlc.com>
Date: Mon, 22 Aug 2022 20:32:39 +0800
Subject: thermal: Drop duplicate words from comments

Drop duplicate words from two kerneldoc comments in the thermal
subsystem.

Signed-off-by: Jilin Yuan <yuanjilin@cdjrlc.com>
[ rjw: Subject edits and changelog rewrite ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/cpufreq_cooling.c | 2 +-
 drivers/thermal/thermal_of.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b76293cc989c..b40ff133db83 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -475,7 +475,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 
 /**
  * __cpufreq_cooling_register - helper function to create cpufreq cooling device
- * @np: a valid struct device_node to the cooling device device tree node
+ * @np: a valid struct device_node to the cooling device tree node
  * @policy: cpufreq policy
  * Normally this should be same as cpufreq policy->related_cpus.
  * @em: Energy Model of the cpufreq policy
diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c
index 802c30b72a92..3aa9e2024a2f 100644
--- a/drivers/thermal/thermal_of.c
+++ b/drivers/thermal/thermal_of.c
@@ -630,7 +630,7 @@ EXPORT_SYMBOL_GPL(devm_thermal_zone_of_sensor_register);
 /**
  * devm_thermal_zone_of_sensor_unregister - Resource managed version of
  *				thermal_zone_of_sensor_unregister().
- * @dev: Device for which which resource was allocated.
+ * @dev: Device for which resource was allocated.
  * @tzd: a pointer to struct thermal_zone_device where the sensor is registered.
  *
  * This function removes the sensor callbacks and private data from the
-- 
cgit v1.2.3


From cb3e7d624c3ff34a300587929c82af7364cf5c09 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 24 Aug 2022 20:36:30 -0500
Subject: PM: wakeup: Add extra debugging statement for multiple active IRQs

Since commit cb1f65c1e1424 ("PM: s2idle: ACPI: Fix wakeup interrupts
handling") was introduced the kernel can now handle multiple
simultaneous interrupts during wakeup.  Ths uncovered some existing
subtle firmware bugs where multiple IRQs are unintentionally active.

To help with fixing those bugs add an extra message when PM debugging
is enabled that can show the individual IRQs triggered as if a variety
are fired they'll potentially be lost as /sys/power/pm_wakeup_irq only
contains the first one that triggered the wakeup after resume is
complete but all may be needed to demonstrate the whole picture.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=215770
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
[ rjw: Added empty line after if () ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index e3befa2c1b66..7cc0c0cf8eaa 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -944,6 +944,8 @@ void pm_system_irq_wakeup(unsigned int irq_number)
 	else
 		irq_number = 0;
 
+	pm_pr_dbg("Triggering wakeup from IRQ %d\n", irq_number);
+
 	raw_spin_unlock_irqrestore(&wakeup_irq_lock, flags);
 
 	if (irq_number)
-- 
cgit v1.2.3


From cff895277c8558221ba180aefe26799dcb4eec86 Mon Sep 17 00:00:00 2001
From: Xuewen Yan <xuewen.yan@unisoc.com>
Date: Thu, 25 Aug 2022 19:40:17 +0800
Subject: thermal: cpufreq_cooling: Check the policy first in
 cpufreq_cooling_register()

Since the policy needs to be accessed first when obtaining cpu devices,
first check whether the policy is legal before this.

Fixes: 5130802ddbb1 ("thermal: cpu_cooling: Switch to QoS requests for freq limits")
Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/cpufreq_cooling.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b76293cc989c..7838b6e2dba5 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -501,17 +501,17 @@ __cpufreq_cooling_register(struct device_node *np,
 	struct thermal_cooling_device_ops *cooling_ops;
 	char *name;
 
+	if (IS_ERR_OR_NULL(policy)) {
+		pr_err("%s: cpufreq policy isn't valid: %p\n", __func__, policy);
+		return ERR_PTR(-EINVAL);
+	}
+
 	dev = get_cpu_device(policy->cpu);
 	if (unlikely(!dev)) {
 		pr_warn("No cpu device for cpu %d\n", policy->cpu);
 		return ERR_PTR(-ENODEV);
 	}
 
-	if (IS_ERR_OR_NULL(policy)) {
-		pr_err("%s: cpufreq policy isn't valid: %p\n", __func__, policy);
-		return ERR_PTR(-EINVAL);
-	}
-
 	i = cpufreq_table_count_valid_entries(policy);
 	if (!i) {
 		pr_debug("%s: CPUFreq table not found or has no valid entries\n",
-- 
cgit v1.2.3


From 06f7c08751939f33d79712590c2b21d02aba61b6 Mon Sep 17 00:00:00 2001
From: Yu Liao <liaoyu15@huawei.com>
Date: Mon, 29 Aug 2022 09:15:34 +0800
Subject: cpuidle: Remove redundant check in cpuidle_switch_governor()

gov has already been NULL checked at the beginning of
cpuidle_switch_governor, so remove redundant check.

While at it, use pr_info() instead printk() to address the following
checkpatch warning:

WARNING: Prefer [subsystem eg: netdev]_info([subsystem]dev, ... then
dev_info(dev, ... then pr_info(...  to printk(KERN_INFO ...

Signed-off-by: Yu Liao <liaoyu15@huawei.com>
[ rjw: Subject and changelog edits, added empty line after if () ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governor.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index 29acaf48e575..0d0f9751ff8f 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -63,12 +63,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov)
 
 	cpuidle_curr_governor = gov;
 
-	if (gov) {
-		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
-			cpuidle_enable_device(dev);
-		cpuidle_install_idle_handler();
-		printk(KERN_INFO "cpuidle: using governor %s\n", gov->name);
-	}
+	list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
+		cpuidle_enable_device(dev);
+
+	cpuidle_install_idle_handler();
+	pr_info("cpuidle: using governor %s\n", gov->name);
 
 	return 0;
 }
-- 
cgit v1.2.3


From bedadcfb011fef55273bd686e8893fdd8911dcdb Mon Sep 17 00:00:00 2001
From: Perry Yuan <Perry.Yuan@amd.com>
Date: Tue, 30 Aug 2022 09:56:00 +0800
Subject: cpufreq: amd-pstate: Fix initial highest_perf value

To avoid some new AMD processors use wrong highest perf when amd pstate
driver loaded, this fix will query the highest perf from MSR register
MSR_AMD_CPPC_CAP1 and cppc_acpi interface firstly, then compare with the
highest perf value got by calling amd_get_highest_perf() function.

The lower value will be the correct highest perf we need to use.
Otherwise the CPU max MHz will be incorrect if the
amd_get_highest_perf() did not cover the new process family and model ID.

Like this lscpu info, the max frequency is incorrect.

Vendor ID:               AuthenticAMD
    Socket(s):           1
    Stepping:            2
    CPU max MHz:         5410.0000
    CPU min MHz:         400.0000
    BogoMIPS:            5600.54

Fixes: 3743d55b289c2 (x86, sched: Fix the AMD CPPC maximum performance value on certain AMD Ryzen generations)
Acked-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 33fbd6a94c43..bf0ed8e54fdf 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -152,6 +152,7 @@ static inline int amd_pstate_enable(bool enable)
 static int pstate_init_perf(struct amd_cpudata *cpudata)
 {
 	u64 cap1;
+	u32 highest_perf;
 
 	int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
 				     &cap1);
@@ -163,7 +164,11 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
 	 *
 	 * CPPC entry doesn't indicate the highest performance in some ASICs.
 	 */
-	WRITE_ONCE(cpudata->highest_perf, amd_get_highest_perf());
+	highest_perf = amd_get_highest_perf();
+	if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1))
+		highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
+
+	WRITE_ONCE(cpudata->highest_perf, highest_perf);
 
 	WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
@@ -175,12 +180,17 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
 static int cppc_init_perf(struct amd_cpudata *cpudata)
 {
 	struct cppc_perf_caps cppc_perf;
+	u32 highest_perf;
 
 	int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
 	if (ret)
 		return ret;
 
-	WRITE_ONCE(cpudata->highest_perf, amd_get_highest_perf());
+	highest_perf = amd_get_highest_perf();
+	if (highest_perf > cppc_perf.highest_perf)
+		highest_perf = cppc_perf.highest_perf;
+
+	WRITE_ONCE(cpudata->highest_perf, highest_perf);
 
 	WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf,
-- 
cgit v1.2.3


From 0d7a23b5f8e162bf2c5caab06f5df4aee2619073 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 30 Aug 2022 11:01:34 +0800
Subject: powercap: intel_rapl: Add support for RAPTORLAKE_S

Add intel_rapl support for RAPTORLAKE_S platform, which behaves the same
as RAPTORLAKE and RAPTORLAKE_P platforms.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 21d624f9f5fb..6812c1ca3100 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1110,6 +1110,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,		&rapl_defaults_core),
 	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,		&rapl_defaults_core),
 	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,        &rapl_defaults_core),
+	X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,	&rapl_defaults_core),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&rapl_defaults_spr_server),
 	X86_MATCH_INTEL_FAM6_MODEL(LAKEFIELD,		&rapl_defaults_core),
 
-- 
cgit v1.2.3


From e9a7c526c29b0ae60c888b335bd6cf6e2ee80154 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 30 Aug 2022 20:33:37 +0200
Subject: thermal: int340x_thermal: Consolidate priv->data_vault checks

It is sufficient to check priv->data_vault once in the error code path
of int3400_thermal_probe(), so do that.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel/int340x_thermal/int3400_thermal.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
index 365489bf4b8c..db8a6f63657d 100644
--- a/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
+++ b/drivers/thermal/intel/int340x_thermal/int3400_thermal.c
@@ -614,9 +614,8 @@ static int int3400_thermal_probe(struct platform_device *pdev)
 
 free_sysfs:
 	cleanup_odvp(priv);
-	if (priv->data_vault) {
-		if (!ZERO_OR_NULL_PTR(priv->data_vault))
-			sysfs_remove_group(&pdev->dev.kobj, &data_attribute_group);
+	if (!ZERO_OR_NULL_PTR(priv->data_vault)) {
+		sysfs_remove_group(&pdev->dev.kobj, &data_attribute_group);
 		kfree(priv->data_vault);
 	}
 free_uuid:
-- 
cgit v1.2.3


From 37096428962d125591144876db6ae037096e39ed Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmtrmonakhov@yandex-team.ru>
Date: Wed, 24 Aug 2022 16:01:13 +0300
Subject: ACPI: APEI: Add BERT error log footer

Print total number of records found during BERT log parsing.
This also simplify dmesg parser implementation for BERT events.

Signed-off-by: Dmitry Monakhov <dmtrmonakhov@yandex-team.ru>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/apei/bert.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c
index 45973aa6e06d..c23eb75866d0 100644
--- a/drivers/acpi/apei/bert.c
+++ b/drivers/acpi/apei/bert.c
@@ -90,6 +90,9 @@ static void __init bert_print_all(struct acpi_bert_region *region,
 
 	if (skipped)
 		pr_info(HW_ERR "Skipped %d error records\n", skipped);
+
+	if (printed + skipped)
+		pr_info("Total records found: %d\n", printed + skipped);
 }
 
 static int __init setup_bert_disable(char *str)
-- 
cgit v1.2.3


From 574160b8548deff8b80b174f03201e94ab8431e2 Mon Sep 17 00:00:00 2001
From: Arvid Norlander <lkml@vorpal.se>
Date: Wed, 24 Aug 2022 20:49:50 +0200
Subject: ACPI: video: Add Toshiba Satellite/Portege Z830 quirk

Toshiba Satellite Z830 needs the quirk video_disable_backlight_sysfs_if
for proper backlight control after suspend/resume cycles.

Toshiba Portege Z830 is simply the same laptop rebranded for certain
markets (I looked through the manual to other language sections to confirm
this) and thus also needs this quirk.

Thanks to Hans de Goede for suggesting this fix.

Link: https://www.spinics.net/lists/platform-driver-x86/msg34394.html
Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Arvid Norlander <lkml@vorpal.se>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Tested-by: Arvid Norlander <lkml@vorpal.se>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_video.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c
index 5cbe2196176d..2a4990733cf0 100644
--- a/drivers/acpi/acpi_video.c
+++ b/drivers/acpi/acpi_video.c
@@ -496,6 +496,22 @@ static const struct dmi_system_id video_dmi_table[] = {
 		DMI_MATCH(DMI_PRODUCT_NAME, "SATELLITE R830"),
 		},
 	},
+	{
+	 .callback = video_disable_backlight_sysfs_if,
+	 .ident = "Toshiba Satellite Z830",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "SATELLITE Z830"),
+		},
+	},
+	{
+	 .callback = video_disable_backlight_sysfs_if,
+	 .ident = "Toshiba Portege Z830",
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
+		DMI_MATCH(DMI_PRODUCT_NAME, "PORTEGE Z830"),
+		},
+	},
 	/*
 	 * Some machine's _DOD IDs don't have bit 31(Device ID Scheme) set
 	 * but the IDs actually follow the Device ID Scheme.
-- 
cgit v1.2.3


From b3c0e38bdb52abd581da70d7048a173f8cf8d327 Mon Sep 17 00:00:00 2001
From: ye xingchen <ye.xingchen@zte.com.cn>
Date: Thu, 25 Aug 2022 07:27:44 +0000
Subject: ACPI: EC: Drop unneeded result variable from ec_write()

Return the acpi_ec_write() return value directly instead of storing it
in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ec.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index c95e535035a0..9b42628cf21b 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -917,14 +917,10 @@ EXPORT_SYMBOL(ec_read);
 
 int ec_write(u8 addr, u8 val)
 {
-	int err;
-
 	if (!first_ec)
 		return -ENODEV;
 
-	err = acpi_ec_write(first_ec, addr, val);
-
-	return err;
+	return acpi_ec_write(first_ec, addr, val);
 }
 EXPORT_SYMBOL(ec_write);
 
-- 
cgit v1.2.3


From 40083734d9f94713cedf2c386e4a668dfc6774a9 Mon Sep 17 00:00:00 2001
From: Shi junming <junming@nfschina.com>
Date: Thu, 25 Aug 2022 17:03:47 +0800
Subject: ACPI: tools: pfrut: Do not initialize ret in main()

The initialization is unnecessary, because ret is always assigned a new
value before reading it.

Signed-off-by: Shi junming <junming@nfschina.com>
[ rjw: Subject edits, new changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 tools/power/acpi/tools/pfrut/pfrut.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/power/acpi/tools/pfrut/pfrut.c b/tools/power/acpi/tools/pfrut/pfrut.c
index d79c335594b2..52aa0351533c 100644
--- a/tools/power/acpi/tools/pfrut/pfrut.c
+++ b/tools/power/acpi/tools/pfrut/pfrut.c
@@ -190,7 +190,7 @@ int main(int argc, char *argv[])
 	void *addr_map_capsule;
 	struct stat st;
 	char *log_buf;
-	int ret = 0;
+	int ret;
 
 	if (getuid() != 0) {
 		printf("Please run the tool as root - Exiting.\n");
-- 
cgit v1.2.3


From e6e8c6c2380d88606c991b5855881769297b5653 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Fri, 26 Aug 2022 15:14:57 +0200
Subject: ACPI: docs: enumeration: Fix a few typos and wording mistakes

"sturct" -> "struct"
"similar than with" -> "similar to"
Missing comma, "it" and "to"

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/firmware-guide/acpi/enumeration.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst
index dbb03022b127..b9dc0c603f36 100644
--- a/Documentation/firmware-guide/acpi/enumeration.rst
+++ b/Documentation/firmware-guide/acpi/enumeration.rst
@@ -21,7 +21,7 @@ possible we decided to do following:
   - Devices behind real busses where there is a connector resource
     are represented as struct spi_device or struct i2c_device. Note
     that standard UARTs are not busses so there is no struct uart_device,
-    although some of them may be represented by sturct serdev_device.
+    although some of them may be represented by struct serdev_device.
 
 As both ACPI and Device Tree represent a tree of devices (and their
 resources) this implementation follows the Device Tree way as much as
@@ -205,7 +205,7 @@ Here is what the ACPI namespace for a SPI slave might look like::
 		}
 		...
 
-The SPI device drivers only need to add ACPI IDs in a similar way than with
+The SPI device drivers only need to add ACPI IDs in a similar way to
 the platform device drivers. Below is an example where we add ACPI support
 to at25 SPI eeprom driver (this is meant for the above ACPI snippet)::
 
@@ -362,7 +362,7 @@ These GPIO numbers are controller relative and path "\\_SB.PCI0.GPI0"
 specifies the path to the controller. In order to use these GPIOs in Linux
 we need to translate them to the corresponding Linux GPIO descriptors.
 
-There is a standard GPIO API for that and is documented in
+There is a standard GPIO API for that and it is documented in
 Documentation/admin-guide/gpio/.
 
 In the above example we can get the corresponding two GPIO descriptors with
@@ -538,8 +538,8 @@ information.
 PCI hierarchy representation
 ============================
 
-Sometimes could be useful to enumerate a PCI device, knowing its position on the
-PCI bus.
+Sometimes it could be useful to enumerate a PCI device, knowing its position on
+the PCI bus.
 
 For example, some systems use PCI devices soldered directly on the mother board,
 in a fixed position (ethernet, Wi-Fi, serial ports, etc.). In this conditions it
@@ -550,7 +550,7 @@ To identify a PCI device, a complete hierarchical description is required, from
 the chipset root port to the final device, through all the intermediate
 bridges/switches of the board.
 
-For example, let us assume to have a system with a PCIe serial port, an
+For example, let's assume we have a system with a PCIe serial port, an
 Exar XR17V3521, soldered on the main board. This UART chip also includes
 16 GPIOs and we want to add the property ``gpio-line-names`` [1] to these pins.
 In this case, the ``lspci`` output for this component is::
@@ -593,8 +593,8 @@ of the chipset bridge (also called "root port") with address::
 
 	Bus: 0 - Device: 14 - Function: 1
 
-To find this information is necessary disassemble the BIOS ACPI tables, in
-particular the DSDT (see also [2])::
+To find this information, it is necessary to disassemble the BIOS ACPI tables,
+in particular the DSDT (see also [2])::
 
 	mkdir ~/tables/
 	cd ~/tables/
-- 
cgit v1.2.3


From 1cd43acf0b61fc93aaede45cdc7d61b5ddcb54a4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 26 Aug 2022 20:16:34 +0300
Subject: ACPI: bus: Drop kernel doc annotation from acpi_bus_notify()

The description for acpi_bus_notify() is quite far from what
kernel doc expects. It complains about this:

  Function parameter or member 'handle' not described in 'acpi_bus_notify'
  Function parameter or member 'type' not described in 'acpi_bus_notify'
  Function parameter or member 'data' not described in 'acpi_bus_notify'

Fix this by dropping kernel doc annotation.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 661a63ea3248..6e2dad6ff757 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -456,7 +456,7 @@ out_free:
                              Notification Handling
    -------------------------------------------------------------------------- */
 
-/**
+/*
  * acpi_bus_notify
  * ---------------
  * Callback for all 'system-level' device notifications (values 0x00-0x7F).
-- 
cgit v1.2.3


From fe79e392cf08a38ca0e25ffadbe35e45d5651989 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 26 Aug 2022 20:16:35 +0300
Subject: ACPI: bus: Refactor ACPI matching functions for better readability

With temporary variables for OF and ACPI IDs, it's easier to read
the code. No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 6e2dad6ff757..a55d7313fd4a 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -925,12 +925,13 @@ static const void *acpi_of_device_get_match_data(const struct device *dev)
 
 const void *acpi_device_get_match_data(const struct device *dev)
 {
+	const struct acpi_device_id *acpi_ids = dev->driver->acpi_match_table;
 	const struct acpi_device_id *match;
 
-	if (!dev->driver->acpi_match_table)
+	if (!acpi_ids)
 		return acpi_of_device_get_match_data(dev);
 
-	match = acpi_match_device(dev->driver->acpi_match_table, dev);
+	match = acpi_match_device(acpi_ids, dev);
 	if (!match)
 		return NULL;
 
@@ -948,14 +949,13 @@ EXPORT_SYMBOL(acpi_match_device_ids);
 bool acpi_driver_match_device(struct device *dev,
 			      const struct device_driver *drv)
 {
-	if (!drv->acpi_match_table)
-		return acpi_of_match_device(ACPI_COMPANION(dev),
-					    drv->of_match_table,
-					    NULL);
-
-	return __acpi_match_device(acpi_companion_match(dev),
-				   drv->acpi_match_table, drv->of_match_table,
-				   NULL, NULL);
+	const struct acpi_device_id *acpi_ids = drv->acpi_match_table;
+	const struct of_device_id *of_ids = drv->of_match_table;
+
+	if (!acpi_ids)
+		return acpi_of_match_device(ACPI_COMPANION(dev), of_ids, NULL);
+
+	return __acpi_match_device(acpi_companion_match(dev), acpi_ids, of_ids, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(acpi_driver_match_device);
 
-- 
cgit v1.2.3


From 98378956a407392109817278c6edd464252c7a83 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 29 Aug 2022 18:12:33 +0200
Subject: ACPI: property: Use acpi_dev_parent()

After introducing acpi_dev_parent() in commit 62fcb99bdf10 ("ACPI: Drop
parent field from struct acpi_device"), it is better to use it instead
of accessing the dev.parent field in struct acpi_device directly.

Modify acpi_node_get_parent() accordingly.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/acpi/property.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 3dab3ac9c4a1..f7c38481f42c 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -1270,10 +1270,11 @@ acpi_node_get_parent(const struct fwnode_handle *fwnode)
 		return to_acpi_data_node(fwnode)->parent;
 	}
 	if (is_acpi_device_node(fwnode)) {
-		struct device *dev = to_acpi_device_node(fwnode)->dev.parent;
+		struct acpi_device *parent;
 
-		if (dev)
-			return acpi_fwnode_handle(to_acpi_device(dev));
+		parent = acpi_dev_parent(to_acpi_device_node(fwnode));
+		if (parent)
+			return acpi_fwnode_handle(parent);
 	}
 
 	return NULL;
-- 
cgit v1.2.3


From f3bc9ca5285de3b3a149f18c60c05ab57b0a0a4e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 31 Aug 2022 17:03:24 +0300
Subject: ACPI: platform: Get rid of redundant 'else'

In the snippets like the following

	if (...)
		return / goto / break / continue ...;
	else
		...

the 'else' is redundant. Get rid of it.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_platform.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index de3cbf152dee..f09903ec950e 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -113,9 +113,9 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
 
 	INIT_LIST_HEAD(&resource_list);
 	count = acpi_dev_get_resources(adev, &resource_list, NULL, NULL);
-	if (count < 0) {
+	if (count < 0)
 		return NULL;
-	} else if (count > 0) {
+	if (count > 0) {
 		resources = kcalloc(count, sizeof(struct resource),
 				    GFP_KERNEL);
 		if (!resources) {
-- 
cgit v1.2.3


From 1d190148cc22495a01668417243121ad0ad5df01 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 31 Aug 2022 17:03:25 +0300
Subject: ACPI: platform: Remove redundant print on -ENOMEM

We rely on somebody else to print enough information on memory
allocation failures. So remove the log in
the acpi_create_platform_device() when return -ENOMEM.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_platform.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index f09903ec950e..3a4d3d7772aa 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -119,7 +119,6 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
 		resources = kcalloc(count, sizeof(struct resource),
 				    GFP_KERNEL);
 		if (!resources) {
-			dev_err(&adev->dev, "No memory for resources\n");
 			acpi_dev_free_resource_list(&resource_list);
 			return ERR_PTR(-ENOMEM);
 		}
-- 
cgit v1.2.3


From 895a4d6ce17b377316ebe7e7090efc1ce9ffd1fe Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 31 Aug 2022 17:03:26 +0300
Subject: ACPI: platform: Use sizeof(*pointer) instead of sizeof(type)

It is preferred to use sizeof(*pointer) instead of sizeof(type).
The type of the variable can change and one needs not change
the former (unlike the latter). No functional change intended.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_platform.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index 3a4d3d7772aa..5e757b53476a 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -116,8 +116,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
 	if (count < 0)
 		return NULL;
 	if (count > 0) {
-		resources = kcalloc(count, sizeof(struct resource),
-				    GFP_KERNEL);
+		resources = kcalloc(count, sizeof(*resources), GFP_KERNEL);
 		if (!resources) {
 			acpi_dev_free_resource_list(&resource_list);
 			return ERR_PTR(-ENOMEM);
-- 
cgit v1.2.3


From 1902d158bc86370c29bbb7d5f0e4b9a86a6c6c48 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 31 Aug 2022 17:03:27 +0300
Subject: ACPI: platform: Sort forbidden_id_list[] in ascending order

For easier maintenance, sort the forbidden_id_list[] table rows in
ascending order with respect to the device ID field.

While at it, use an empty row as the list terminator, which is more
usual in the kernel.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_platform.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index 5e757b53476a..176cc6ee14d8 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -20,13 +20,13 @@
 #include "internal.h"
 
 static const struct acpi_device_id forbidden_id_list[] = {
+	{"ACPI0009", 0},	/* IOxAPIC */
+	{"ACPI000A", 0},	/* IOAPIC */
 	{"PNP0000",  0},	/* PIC */
 	{"PNP0100",  0},	/* Timer */
 	{"PNP0200",  0},	/* AT DMA Controller */
-	{"ACPI0009", 0},	/* IOxAPIC */
-	{"ACPI000A", 0},	/* IOAPIC */
 	{"SMB0001",  0},	/* ACPI SMBUS virtual device */
-	{"", 0},
+	{ }
 };
 
 static struct platform_device *acpi_platform_device_find_by_companion(struct acpi_device *adev)
-- 
cgit v1.2.3


From 2814108cbf54cd4737cd13e17291b21590472dca Mon Sep 17 00:00:00 2001
From: John Garry <john.garry@huawei.com>
Date: Thu, 1 Sep 2022 18:04:11 +0800
Subject: ACPI: platform: Use PLATFORM_DEVID_NONE in
 acpi_create_platform_device()

Instead of hardcoding the value for the id, use PLATFORM_DEVID_NONE.

Signed-off-by: John Garry <john.garry@huawei.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_platform.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index 176cc6ee14d8..ebd18c716b50 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -138,7 +138,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev,
 	pdevinfo.parent = adev->parent ?
 		acpi_get_first_physical_node(adev->parent) : NULL;
 	pdevinfo.name = dev_name(&adev->dev);
-	pdevinfo.id = -1;
+	pdevinfo.id = PLATFORM_DEVID_NONE;
 	pdevinfo.res = resources;
 	pdevinfo.num_res = count;
 	pdevinfo.fwnode = acpi_fwnode_handle(adev);
-- 
cgit v1.2.3


From a1cf1fd62ae71b585b35f78f8bcf6b1bc9de2c40 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Fri, 2 Sep 2022 04:15:55 +0200
Subject: ACPI: property: Silence missing-declarations warning in apple.c

Silence an annoying message emitted for W=1 builds:

drivers/acpi/x86/apple.c:30:6: warning: no previous declaration for 'acpi_extract_apple_properties' [-Wmissing-declarations]

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/apple.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/acpi/x86/apple.c b/drivers/acpi/x86/apple.c
index c285c91a5e9c..8812ecd03d55 100644
--- a/drivers/acpi/x86/apple.c
+++ b/drivers/acpi/x86/apple.c
@@ -8,6 +8,7 @@
 #include <linux/bitmap.h>
 #include <linux/platform_data/x86/apple.h>
 #include <linux/uuid.h>
+#include "../internal.h"
 
 /* Apple _DSM device properties GUID */
 static const guid_t apple_prp_guid =
-- 
cgit v1.2.3


From 6b959ba22d34ca793ffdb15b5715457c78e38b1a Mon Sep 17 00:00:00 2001
From: Yang Jihong <yangjihong1@huawei.com>
Date: Fri, 2 Sep 2022 16:29:18 +0800
Subject: perf/core: Fix reentry problem in perf_output_read_group()

perf_output_read_group may respond to IPI request of other cores and invoke
__perf_install_in_context function. As a result, hwc configuration is modified.
causing inconsistency and unexpected consequences.

Interrupts are not disabled when perf_output_read_group reads PMU counter.
In this case, IPI request may be received from other cores.
As a result, PMU configuration is modified and an error occurs when
reading PMU counter:

		     CPU0                                         CPU1
						      __se_sys_perf_event_open
							perf_install_in_context
  perf_output_read_group                                  smp_call_function_single
    for_each_sibling_event(sub, leader) {                   generic_exec_single
      if ((sub != event) &&                                   remote_function
	  (sub->state == PERF_EVENT_STATE_ACTIVE))                    |
  <enter IPI handler: __perf_install_in_context>   <----RAISE IPI-----+
  __perf_install_in_context
    ctx_resched
      event_sched_out
	armpmu_del
	  ...
	  hwc->idx = -1; // event->hwc.idx is set to -1
  ...
  <exit IPI>
	      sub->pmu->read(sub);
		armpmu_read
		  armv8pmu_read_counter
		    armv8pmu_read_hw_counter
		      int idx = event->hw.idx; // idx = -1
		      u64 val = armv8pmu_read_evcntr(idx);
			u32 counter = ARMV8_IDX_TO_COUNTER(idx); // invalid counter = 30
			read_pmevcntrn(counter) // undefined instruction

Signed-off-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20220902082918.179248-1-yangjihong1@huawei.com
---
 kernel/events/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2621fd24ad26..ff4bffc502c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6893,9 +6893,16 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 {
 	struct perf_event *leader = event->group_leader, *sub;
 	u64 read_format = event->attr.read_format;
+	unsigned long flags;
 	u64 values[6];
 	int n = 0;
 
+	/*
+	 * Disabling interrupts avoids all counter scheduling
+	 * (context switches, timer based rotation and IPIs).
+	 */
+	local_irq_save(flags);
+
 	values[n++] = 1 + leader->nr_siblings;
 
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -6931,6 +6938,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 
 		__output_copy(handle, values, n * sizeof(u64));
 	}
+
+	local_irq_restore(flags);
 }
 
 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
-- 
cgit v1.2.3


From b338d91703fae6f6afd67f3f75caa3b8f36ddef3 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Thu, 1 Sep 2022 12:19:13 -0700
Subject: Bluetooth: Implement support for Mesh

The patch adds state bits, storage and HCI command chains for sending
and receiving Bluetooth Mesh advertising packets, and delivery to
requesting user space processes. It specifically creates 4 new MGMT
commands and 2 new MGMT events:

MGMT_OP_SET_MESH_RECEIVER - Sets passive scan parameters and a list of
AD Types which will trigger Mesh Packet Received events

MGMT_OP_MESH_READ_FEATURES - Returns information on how many outbound
Mesh packets can be simultaneously queued, and what the currently queued
handles are.

MGMT_OP_MESH_SEND - Command to queue a specific outbound Mesh packet,
with the number of times it should be sent, and the BD Addr to use.
Discrete advertisments are added to the ADV Instance list.

MGMT_OP_MESH_SEND_CANCEL - Command to cancel a prior outbound message
request.

MGMT_EV_MESH_DEVICE_FOUND - Event to deliver entire received Mesh
Advertisement packet, along with timing information.

MGMT_EV_MESH_PACKET_CMPLT - Event to indicate that an outbound packet is
no longer queued for delivery.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |   1 +
 include/net/bluetooth/hci.h       |   3 +
 include/net/bluetooth/hci_core.h  |  16 +-
 include/net/bluetooth/mgmt.h      |  52 +++++
 net/bluetooth/hci_core.c          |  13 +-
 net/bluetooth/hci_event.c         |  61 +++--
 net/bluetooth/hci_sock.c          |   1 +
 net/bluetooth/hci_sync.c          |  87 ++++++-
 net/bluetooth/mgmt.c              | 480 +++++++++++++++++++++++++++++++++++++-
 net/bluetooth/mgmt_util.c         |  74 ++++++
 net/bluetooth/mgmt_util.h         |  18 ++
 11 files changed, 760 insertions(+), 46 deletions(-)

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index e72f3b247b5e..bcc5a4cd2c17 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -627,6 +627,7 @@ static inline bool iso_enabled(void)
 
 int mgmt_init(void);
 void mgmt_exit(void);
+void mgmt_cleanup(struct sock *sk);
 
 void bt_sock_reclassify_lock(struct sock *sk, int proto);
 
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index cf29511b25a8..b3ade687531f 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -354,6 +354,9 @@ enum {
 	HCI_LE_SIMULTANEOUS_ROLES,
 	HCI_CMD_DRAIN_WORKQUEUE,
 
+	HCI_MESH,
+	HCI_MESH_SENDING,
+
 	__HCI_NUM_FLAGS,
 };
 
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 932153e68864..c54bc71254af 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -238,6 +238,7 @@ struct adv_info {
 	bool	enabled;
 	bool	pending;
 	bool	periodic;
+	__u8	mesh;
 	__u8	instance;
 	__u32	flags;
 	__u16	timeout;
@@ -372,6 +373,8 @@ struct hci_dev {
 	__u8		le_resolv_list_size;
 	__u8		le_num_of_adv_sets;
 	__u8		le_states[8];
+	__u8		mesh_ad_types[16];
+	__u8		mesh_send_ref;
 	__u8		commands[64];
 	__u8		hci_ver;
 	__u16		hci_rev;
@@ -511,6 +514,7 @@ struct hci_dev {
 	struct list_head	cmd_sync_work_list;
 	struct mutex		cmd_sync_work_lock;
 	struct work_struct	cmd_sync_cancel_work;
+	struct work_struct	reenable_adv_work;
 
 	__u16			discov_timeout;
 	struct delayed_work	discov_off;
@@ -561,6 +565,7 @@ struct hci_dev {
 
 	struct hci_conn_hash	conn_hash;
 
+	struct list_head	mesh_pending;
 	struct list_head	mgmt_pending;
 	struct list_head	reject_list;
 	struct list_head	accept_list;
@@ -614,6 +619,8 @@ struct hci_dev {
 	struct delayed_work	rpa_expired;
 	bdaddr_t		rpa;
 
+	struct delayed_work	mesh_send_done;
+
 	enum {
 		INTERLEAVE_SCAN_NONE,
 		INTERLEAVE_SCAN_NO_FILTER,
@@ -1576,7 +1583,8 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
 				      u32 flags, u16 adv_data_len, u8 *adv_data,
 				      u16 scan_rsp_len, u8 *scan_rsp_data,
 				      u16 timeout, u16 duration, s8 tx_power,
-				      u32 min_interval, u32 max_interval);
+				      u32 min_interval, u32 max_interval,
+				      u8 mesh_handle);
 struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance,
 				      u32 flags, u8 data_len, u8 *data,
 				      u32 min_interval, u32 max_interval);
@@ -1997,6 +2005,9 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c);
 #define DISCOV_LE_FAST_ADV_INT_MAX	0x00F0	/* 150 msec */
 #define DISCOV_LE_PER_ADV_INT_MIN	0x00A0	/* 200 msec */
 #define DISCOV_LE_PER_ADV_INT_MAX	0x00A0	/* 200 msec */
+#define DISCOV_LE_ADV_MESH_MIN		0x00A0  /* 100 msec */
+#define DISCOV_LE_ADV_MESH_MAX		0x00A0  /* 100 msec */
+#define INTERVAL_TO_MS(x)		(((x) * 10) / 0x10)
 
 #define NAME_RESOLVE_DURATION		msecs_to_jiffies(10240)	/* 10.24 sec */
 
@@ -2048,7 +2059,8 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status);
 void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status);
 void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
 		       u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
-		       u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len);
+		       u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+		       u64 instant);
 void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
 		      u8 addr_type, s8 rssi, u8 *name, u8 name_len);
 void mgmt_discovering(struct hci_dev *hdev, u8 discovering);
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index 7c1ad0f6fcec..743f6f59dff8 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -837,6 +837,42 @@ struct mgmt_cp_add_adv_patterns_monitor_rssi {
 	struct mgmt_adv_pattern patterns[];
 } __packed;
 #define MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE	8
+#define MGMT_OP_SET_MESH_RECEIVER		0x0057
+struct mgmt_cp_set_mesh {
+	__u8   enable;
+	__le16 window;
+	__le16 period;
+	__u8   num_ad_types;
+	__u8   ad_types[];
+} __packed;
+#define MGMT_SET_MESH_RECEIVER_SIZE	6
+
+#define MGMT_OP_MESH_READ_FEATURES	0x0058
+#define MGMT_MESH_READ_FEATURES_SIZE	0
+#define MESH_HANDLES_MAX	3
+struct mgmt_rp_mesh_read_features {
+	__le16	index;
+	__u8   max_handles;
+	__u8   used_handles;
+	__u8   handles[MESH_HANDLES_MAX];
+} __packed;
+
+#define MGMT_OP_MESH_SEND		0x0059
+struct mgmt_cp_mesh_send {
+	struct mgmt_addr_info addr;
+	__le64  instant;
+	__le16  delay;
+	__u8   cnt;
+	__u8   adv_data_len;
+	__u8   adv_data[];
+} __packed;
+#define MGMT_MESH_SEND_SIZE		19
+
+#define MGMT_OP_MESH_SEND_CANCEL	0x005A
+struct mgmt_cp_mesh_send_cancel {
+	__u8  handle;
+} __packed;
+#define MGMT_MESH_SEND_CANCEL_SIZE	1
 
 #define MGMT_EV_CMD_COMPLETE		0x0001
 struct mgmt_ev_cmd_complete {
@@ -1120,3 +1156,19 @@ struct mgmt_ev_adv_monitor_device_lost {
 	__le16 monitor_handle;
 	struct mgmt_addr_info addr;
 } __packed;
+
+#define MGMT_EV_MESH_DEVICE_FOUND	0x0031
+struct mgmt_ev_mesh_device_found {
+	struct mgmt_addr_info addr;
+	__s8	rssi;
+	__le64	instant;
+	__le32	flags;
+	__le16	eir_len;
+	__u8	eir[];
+} __packed;
+
+
+#define MGMT_EV_MESH_PACKET_CMPLT		0x0032
+struct mgmt_ev_mesh_pkt_cmplt {
+	__u8	handle;
+} __packed;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 9d2c33f6b065..3803e54f23c0 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1706,7 +1706,8 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
 				      u32 flags, u16 adv_data_len, u8 *adv_data,
 				      u16 scan_rsp_len, u8 *scan_rsp_data,
 				      u16 timeout, u16 duration, s8 tx_power,
-				      u32 min_interval, u32 max_interval)
+				      u32 min_interval, u32 max_interval,
+				      u8 mesh_handle)
 {
 	struct adv_info *adv;
 
@@ -1717,7 +1718,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
 		memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data));
 	} else {
 		if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets ||
-		    instance < 1 || instance > hdev->le_num_of_adv_sets)
+		    instance < 1 || instance > hdev->le_num_of_adv_sets + 1)
 			return ERR_PTR(-EOVERFLOW);
 
 		adv = kzalloc(sizeof(*adv), GFP_KERNEL);
@@ -1734,6 +1735,11 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
 	adv->min_interval = min_interval;
 	adv->max_interval = max_interval;
 	adv->tx_power = tx_power;
+	/* Defining a mesh_handle changes the timing units to ms,
+	 * rather than seconds, and ties the instance to the requested
+	 * mesh_tx queue.
+	 */
+	adv->mesh = mesh_handle;
 
 	hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data,
 				  scan_rsp_len, scan_rsp_data);
@@ -1762,7 +1768,7 @@ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance,
 
 	adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL,
 				   0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE,
-				   min_interval, max_interval);
+				   min_interval, max_interval, 0);
 	if (IS_ERR(adv))
 		return adv;
 
@@ -2486,6 +2492,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	mutex_init(&hdev->lock);
 	mutex_init(&hdev->req_lock);
 
+	INIT_LIST_HEAD(&hdev->mesh_pending);
 	INIT_LIST_HEAD(&hdev->mgmt_pending);
 	INIT_LIST_HEAD(&hdev->reject_list);
 	INIT_LIST_HEAD(&hdev->accept_list);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 0ed944aaed94..5acb6fa6d676 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1756,6 +1756,8 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
 		hci_dev_set_flag(hdev, HCI_LE_SCAN);
 		if (hdev->le_scan_type == LE_SCAN_ACTIVE)
 			clear_pending_adv_report(hdev);
+		if (hci_dev_test_flag(hdev, HCI_MESH))
+			hci_discovery_set_state(hdev, DISCOVERY_FINDING);
 		break;
 
 	case LE_SCAN_DISABLE:
@@ -1770,7 +1772,7 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
 					  d->last_adv_addr_type, NULL,
 					  d->last_adv_rssi, d->last_adv_flags,
 					  d->last_adv_data,
-					  d->last_adv_data_len, NULL, 0);
+					  d->last_adv_data_len, NULL, 0, 0);
 		}
 
 		/* Cancel this timer so that we don't try to disable scanning
@@ -1786,6 +1788,9 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
 		 */
 		if (hci_dev_test_and_clear_flag(hdev, HCI_LE_SCAN_INTERRUPTED))
 			hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+		else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) &&
+			 hdev->discovery.state == DISCOVERY_FINDING)
+			queue_work(hdev->workqueue, &hdev->reenable_adv_work);
 
 		break;
 
@@ -3112,7 +3117,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata,
 
 		mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
 				  info->dev_class, HCI_RSSI_INVALID,
-				  flags, NULL, 0, NULL, 0);
+				  flags, NULL, 0, NULL, 0, 0);
 	}
 
 	hci_dev_unlock(hdev);
@@ -4827,7 +4832,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
 
 			mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
 					  info->dev_class, info->rssi,
-					  flags, NULL, 0, NULL, 0);
+					  flags, NULL, 0, NULL, 0, 0);
 		}
 	} else if (skb->len == array_size(ev->num,
 					  sizeof(struct inquiry_info_rssi))) {
@@ -4858,7 +4863,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
 
 			mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
 					  info->dev_class, info->rssi,
-					  flags, NULL, 0, NULL, 0);
+					  flags, NULL, 0, NULL, 0, 0);
 		}
 	} else {
 		bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
@@ -5114,7 +5119,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, void *edata,
 
 		mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
 				  info->dev_class, info->rssi,
-				  flags, info->data, eir_len, NULL, 0);
+				  flags, info->data, eir_len, NULL, 0, 0);
 	}
 
 	hci_dev_unlock(hdev);
@@ -6170,7 +6175,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
 static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 			       u8 bdaddr_type, bdaddr_t *direct_addr,
 			       u8 direct_addr_type, s8 rssi, u8 *data, u8 len,
-			       bool ext_adv)
+			       bool ext_adv, bool ctl_time, u64 instant)
 {
 	struct discovery_state *d = &hdev->discovery;
 	struct smp_irk *irk;
@@ -6218,7 +6223,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 	 * important to see if the address is matching the local
 	 * controller address.
 	 */
-	if (direct_addr) {
+	if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr) {
 		direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type,
 						  &bdaddr_resolved);
 
@@ -6266,6 +6271,18 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 		conn->le_adv_data_len = len;
 	}
 
+	if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND)
+		flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
+	else
+		flags = 0;
+
+	/* All scan results should be sent up for Mesh systems */
+	if (hci_dev_test_flag(hdev, HCI_MESH)) {
+		mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+				  rssi, flags, data, len, NULL, 0, instant);
+		return;
+	}
+
 	/* Passive scanning shouldn't trigger any device found events,
 	 * except for devices marked as CONN_REPORT for which we do send
 	 * device found events, or advertisement monitoring requested.
@@ -6279,12 +6296,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 		    idr_is_empty(&hdev->adv_monitors_idr))
 			return;
 
-		if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND)
-			flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
-		else
-			flags = 0;
 		mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
-				  rssi, flags, data, len, NULL, 0);
+				  rssi, flags, data, len, NULL, 0, 0);
 		return;
 	}
 
@@ -6303,11 +6316,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 	 * and just sends a scan response event, then it is marked as
 	 * not connectable as well.
 	 */
-	if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND ||
-	    type == LE_ADV_SCAN_RSP)
+	if (type == LE_ADV_SCAN_RSP)
 		flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
-	else
-		flags = 0;
 
 	/* If there's nothing pending either store the data from this
 	 * event or send an immediate device found event if the data
@@ -6324,7 +6334,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 		}
 
 		mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
-				  rssi, flags, data, len, NULL, 0);
+				  rssi, flags, data, len, NULL, 0, 0);
 		return;
 	}
 
@@ -6343,7 +6353,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 					  d->last_adv_addr_type, NULL,
 					  d->last_adv_rssi, d->last_adv_flags,
 					  d->last_adv_data,
-					  d->last_adv_data_len, NULL, 0);
+					  d->last_adv_data_len, NULL, 0, 0);
 
 		/* If the new report will trigger a SCAN_REQ store it for
 		 * later merging.
@@ -6360,7 +6370,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 		 */
 		clear_pending_adv_report(hdev);
 		mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
-				  rssi, flags, data, len, NULL, 0);
+				  rssi, flags, data, len, NULL, 0, 0);
 		return;
 	}
 
@@ -6370,7 +6380,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
 	 */
 	mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
 			  d->last_adv_addr_type, NULL, rssi, d->last_adv_flags,
-			  d->last_adv_data, d->last_adv_data_len, data, len);
+			  d->last_adv_data, d->last_adv_data_len, data, len, 0);
 	clear_pending_adv_report(hdev);
 }
 
@@ -6378,6 +6388,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data,
 				  struct sk_buff *skb)
 {
 	struct hci_ev_le_advertising_report *ev = data;
+	u64 instant = jiffies;
 
 	if (!ev->num)
 		return;
@@ -6402,7 +6413,8 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data,
 			rssi = info->data[info->length];
 			process_adv_report(hdev, info->type, &info->bdaddr,
 					   info->bdaddr_type, NULL, 0, rssi,
-					   info->data, info->length, false);
+					   info->data, info->length, false,
+					   false, instant);
 		} else {
 			bt_dev_err(hdev, "Dropping invalid advertising data");
 		}
@@ -6459,6 +6471,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
 				      struct sk_buff *skb)
 {
 	struct hci_ev_le_ext_adv_report *ev = data;
+	u64 instant = jiffies;
 
 	if (!ev->num)
 		return;
@@ -6485,7 +6498,8 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
 			process_adv_report(hdev, legacy_evt_type, &info->bdaddr,
 					   info->bdaddr_type, NULL, 0,
 					   info->rssi, info->data, info->length,
-					   !(evt_type & LE_EXT_ADV_LEGACY_PDU));
+					   !(evt_type & LE_EXT_ADV_LEGACY_PDU),
+					   false, instant);
 		}
 	}
 
@@ -6708,6 +6722,7 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data,
 					 struct sk_buff *skb)
 {
 	struct hci_ev_le_direct_adv_report *ev = data;
+	u64 instant = jiffies;
 	int i;
 
 	if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_DIRECT_ADV_REPORT,
@@ -6725,7 +6740,7 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data,
 		process_adv_report(hdev, info->type, &info->bdaddr,
 				   info->bdaddr_type, &info->direct_addr,
 				   info->direct_addr_type, info->rssi, NULL, 0,
-				   false);
+				   false, false, instant);
 	}
 
 	hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 0d015d4a8e41..b2a33a05c93e 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -2065,6 +2065,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
 
 static void hci_sock_destruct(struct sock *sk)
 {
+	mgmt_cleanup(sk);
 	skb_queue_purge(&sk->sk_receive_queue);
 	skb_queue_purge(&sk->sk_write_queue);
 }
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index e08c0503027d..fa433896ddc7 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -246,7 +246,7 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
 	skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk);
 	if (IS_ERR(skb)) {
 		bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode,
-			   PTR_ERR(skb));
+				PTR_ERR(skb));
 		return PTR_ERR(skb);
 	}
 
@@ -465,6 +465,48 @@ unlock:
 	hci_dev_unlock(hdev);
 }
 
+static int reenable_adv_sync(struct hci_dev *hdev, void *data)
+{
+	bt_dev_dbg(hdev, "");
+
+	if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+	    list_empty(&hdev->adv_instances))
+		return 0;
+
+	if (hdev->cur_adv_instance) {
+		return hci_schedule_adv_instance_sync(hdev,
+						      hdev->cur_adv_instance,
+						      true);
+	} else {
+		if (ext_adv_capable(hdev)) {
+			hci_start_ext_adv_sync(hdev, 0x00);
+		} else {
+			hci_update_adv_data_sync(hdev, 0x00);
+			hci_update_scan_rsp_data_sync(hdev, 0x00);
+			hci_enable_advertising_sync(hdev);
+		}
+	}
+
+	return 0;
+}
+
+static void reenable_adv(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    reenable_adv_work);
+	int status;
+
+	bt_dev_dbg(hdev, "");
+
+	hci_dev_lock(hdev);
+
+	status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL);
+	if (status)
+		bt_dev_err(hdev, "failed to reenable ADV: %d", status);
+
+	hci_dev_unlock(hdev);
+}
+
 static void cancel_adv_timeout(struct hci_dev *hdev)
 {
 	if (hdev->adv_instance_timeout) {
@@ -587,6 +629,7 @@ void hci_cmd_sync_init(struct hci_dev *hdev)
 	mutex_init(&hdev->cmd_sync_work_lock);
 
 	INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work);
+	INIT_WORK(&hdev->reenable_adv_work, reenable_adv);
 	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable);
 	INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart);
 	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
@@ -597,6 +640,7 @@ void hci_cmd_sync_clear(struct hci_dev *hdev)
 	struct hci_cmd_sync_work_entry *entry, *tmp;
 
 	cancel_work_sync(&hdev->cmd_sync_work);
+	cancel_work_sync(&hdev->reenable_adv_work);
 
 	list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) {
 		if (entry->destroy)
@@ -1746,10 +1790,13 @@ static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk)
 static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force)
 {
 	struct adv_info *adv, *n;
+	int err = 0;
 
 	if (ext_adv_capable(hdev))
 		/* Remove all existing sets */
-		return hci_clear_adv_sets_sync(hdev, sk);
+		err = hci_clear_adv_sets_sync(hdev, sk);
+	if (ext_adv_capable(hdev))
+		return err;
 
 	/* This is safe as long as there is no command send while the lock is
 	 * held.
@@ -1777,11 +1824,13 @@ static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force)
 static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance,
 			       struct sock *sk)
 {
-	int err;
+	int err = 0;
 
 	/* If we use extended advertising, instance has to be removed first. */
 	if (ext_adv_capable(hdev))
-		return hci_remove_ext_adv_instance_sync(hdev, instance, sk);
+		err = hci_remove_ext_adv_instance_sync(hdev, instance, sk);
+	if (ext_adv_capable(hdev))
+		return err;
 
 	/* This is safe as long as there is no command send while the lock is
 	 * held.
@@ -1880,13 +1929,16 @@ int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type)
 int hci_disable_advertising_sync(struct hci_dev *hdev)
 {
 	u8 enable = 0x00;
+	int err = 0;
 
 	/* If controller is not advertising we are done. */
 	if (!hci_dev_test_flag(hdev, HCI_LE_ADV))
 		return 0;
 
 	if (ext_adv_capable(hdev))
-		return hci_disable_ext_adv_instance_sync(hdev, 0x00);
+		err = hci_disable_ext_adv_instance_sync(hdev, 0x00);
+	if (ext_adv_capable(hdev))
+		return err;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
 				     sizeof(enable), &enable, HCI_CMD_TIMEOUT);
@@ -1899,7 +1951,11 @@ static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val,
 
 	memset(&cp, 0, sizeof(cp));
 	cp.enable = val;
-	cp.filter_dup = filter_dup;
+
+	if (hci_dev_test_flag(hdev, HCI_MESH))
+		cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
+	else
+		cp.filter_dup = filter_dup;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
@@ -1915,7 +1971,11 @@ static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
 
 	memset(&cp, 0, sizeof(cp));
 	cp.enable = val;
-	cp.filter_dup = filter_dup;
+
+	if (val && hci_dev_test_flag(hdev, HCI_MESH))
+		cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
+	else
+		cp.filter_dup = filter_dup;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE,
 				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
@@ -2554,6 +2614,7 @@ static int hci_passive_scan_sync(struct hci_dev *hdev)
 	u8 own_addr_type;
 	u8 filter_policy;
 	u16 window, interval;
+	u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE;
 	int err;
 
 	if (hdev->scanning_paused) {
@@ -2616,11 +2677,16 @@ static int hci_passive_scan_sync(struct hci_dev *hdev)
 		interval = hdev->le_scan_interval;
 	}
 
+	/* Disable all filtering for Mesh */
+	if (hci_dev_test_flag(hdev, HCI_MESH)) {
+		filter_policy = 0;
+		filter_dups = LE_SCAN_FILTER_DUP_DISABLE;
+	}
+
 	bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy);
 
 	return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window,
-				   own_addr_type, filter_policy,
-				   LE_SCAN_FILTER_DUP_ENABLE);
+				   own_addr_type, filter_policy, filter_dups);
 }
 
 /* This function controls the passive scanning based on hdev->pend_le_conns
@@ -2670,7 +2736,8 @@ int hci_update_passive_scan_sync(struct hci_dev *hdev)
 	bt_dev_dbg(hdev, "ADV monitoring is %s",
 		   hci_is_adv_monitoring(hdev) ? "on" : "off");
 
-	if (list_empty(&hdev->pend_le_conns) &&
+	if (!hci_dev_test_flag(hdev, HCI_MESH) &&
+	    list_empty(&hdev->pend_le_conns) &&
 	    list_empty(&hdev->pend_le_reports) &&
 	    !hci_is_adv_monitoring(hdev) &&
 	    !hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 8d70f4a709d4..e1c404ac8ce6 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -129,6 +129,10 @@ static const u16 mgmt_commands[] = {
 	MGMT_OP_ADD_EXT_ADV_PARAMS,
 	MGMT_OP_ADD_EXT_ADV_DATA,
 	MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI,
+	MGMT_OP_SET_MESH_RECEIVER,
+	MGMT_OP_MESH_READ_FEATURES,
+	MGMT_OP_MESH_SEND,
+	MGMT_OP_MESH_SEND_CANCEL,
 };
 
 static const u16 mgmt_events[] = {
@@ -1048,6 +1052,63 @@ static void discov_off(struct work_struct *work)
 	hci_dev_unlock(hdev);
 }
 
+static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev);
+
+static void mesh_send_complete(struct hci_dev *hdev,
+			       struct mgmt_mesh_tx *mesh_tx, bool silent)
+{
+	u8 handle = mesh_tx->handle;
+
+	if (!silent)
+		mgmt_event(MGMT_EV_MESH_PACKET_CMPLT, hdev, &handle,
+			   sizeof(handle), NULL);
+
+	mgmt_mesh_remove(mesh_tx);
+}
+
+static int mesh_send_done_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
+	hci_disable_advertising_sync(hdev);
+	mesh_tx = mgmt_mesh_next(hdev, NULL);
+
+	if (mesh_tx)
+		mesh_send_complete(hdev, mesh_tx, false);
+
+	return 0;
+}
+
+static int mesh_send_sync(struct hci_dev *hdev, void *data);
+static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err);
+static void mesh_next(struct hci_dev *hdev, void *data, int err)
+{
+	struct mgmt_mesh_tx *mesh_tx = mgmt_mesh_next(hdev, NULL);
+
+	if (!mesh_tx)
+		return;
+
+	err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
+				 mesh_send_start_complete);
+
+	if (err < 0)
+		mesh_send_complete(hdev, mesh_tx, false);
+	else
+		hci_dev_set_flag(hdev, HCI_MESH_SENDING);
+}
+
+static void mesh_send_done(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    mesh_send_done.work);
+
+	if (!hci_dev_test_flag(hdev, HCI_MESH_SENDING))
+		return;
+
+	hci_cmd_sync_queue(hdev, mesh_send_done_sync, NULL, mesh_next);
+}
+
 static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
 {
 	if (hci_dev_test_and_set_flag(hdev, HCI_MGMT))
@@ -1058,6 +1119,7 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
 	INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
 	INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off);
 	INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired);
+	INIT_DELAYED_WORK(&hdev->mesh_send_done, mesh_send_done);
 
 	/* Non-mgmt controlled devices get this bit set
 	 * implicitly so that pairing works for them, however
@@ -2094,6 +2156,306 @@ static int set_le_sync(struct hci_dev *hdev, void *data)
 	return err;
 }
 
+static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	u8 status = mgmt_status(err);
+	struct sock *sk = cmd->sk;
+
+	if (status) {
+		mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev,
+				     cmd_status_rsp, &status);
+		return;
+	}
+
+	mgmt_pending_remove(cmd);
+	mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0);
+}
+
+static int set_mesh_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_set_mesh *cp = cmd->param;
+	size_t len = cmd->param_len;
+
+	memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types));
+
+	if (cp->enable)
+		hci_dev_set_flag(hdev, HCI_MESH);
+	else
+		hci_dev_clear_flag(hdev, HCI_MESH);
+
+	len -= sizeof(*cp);
+
+	/* If filters don't fit, forward all adv pkts */
+	if (len <= sizeof(hdev->mesh_ad_types))
+		memcpy(hdev->mesh_ad_types, cp->ad_types, len);
+
+	hci_update_passive_scan_sync(hdev);
+	return 0;
+}
+
+static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+	struct mgmt_cp_set_mesh *cp = data;
+	struct mgmt_pending_cmd *cmd;
+	int err = 0;
+
+	bt_dev_dbg(hdev, "sock %p", sk);
+
+	if (!lmp_le_capable(hdev))
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+				       MGMT_STATUS_NOT_SUPPORTED);
+
+	if (cp->enable != 0x00 && cp->enable != 0x01)
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	hci_dev_lock(hdev);
+
+	cmd = mgmt_pending_add(sk, MGMT_OP_SET_MESH_RECEIVER, hdev, data, len);
+	if (!cmd)
+		err = -ENOMEM;
+	else
+		err = hci_cmd_sync_queue(hdev, set_mesh_sync, cmd,
+					 set_mesh_complete);
+
+	if (err < 0) {
+		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+				      MGMT_STATUS_FAILED);
+
+		if (cmd)
+			mgmt_pending_remove(cmd);
+	}
+
+	hci_dev_unlock(hdev);
+	return err;
+}
+
+static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct mgmt_mesh_tx *mesh_tx = data;
+	struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
+	unsigned long mesh_send_interval;
+	u8 mgmt_err = mgmt_status(err);
+
+	/* Report any errors here, but don't report completion */
+
+	if (mgmt_err) {
+		hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
+		/* Send Complete Error Code for handle */
+		mesh_send_complete(hdev, mesh_tx, false);
+		return;
+	}
+
+	mesh_send_interval = msecs_to_jiffies((send->cnt) * 25);
+	queue_delayed_work(hdev->req_workqueue, &hdev->mesh_send_done,
+			   mesh_send_interval);
+}
+
+static int mesh_send_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_mesh_tx *mesh_tx = data;
+	struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
+	struct adv_info *adv, *next_instance;
+	u8 instance = hdev->le_num_of_adv_sets + 1;
+	u16 timeout, duration;
+	int err = 0;
+
+	if (hdev->le_num_of_adv_sets <= hdev->adv_instance_cnt)
+		return MGMT_STATUS_BUSY;
+
+	timeout = 1000;
+	duration = send->cnt * INTERVAL_TO_MS(hdev->le_adv_max_interval);
+	adv = hci_add_adv_instance(hdev, instance, 0,
+				   send->adv_data_len, send->adv_data,
+				   0, NULL,
+				   timeout, duration,
+				   HCI_ADV_TX_POWER_NO_PREFERENCE,
+				   hdev->le_adv_min_interval,
+				   hdev->le_adv_max_interval,
+				   mesh_tx->handle);
+
+	if (!IS_ERR(adv))
+		mesh_tx->instance = instance;
+	else
+		err = PTR_ERR(adv);
+
+	if (hdev->cur_adv_instance == instance) {
+		/* If the currently advertised instance is being changed then
+		 * cancel the current advertising and schedule the next
+		 * instance. If there is only one instance then the overridden
+		 * advertising data will be visible right away.
+		 */
+		cancel_adv_timeout(hdev);
+
+		next_instance = hci_get_next_instance(hdev, instance);
+		if (next_instance)
+			instance = next_instance->instance;
+		else
+			instance = 0;
+	} else if (hdev->adv_instance_timeout) {
+		/* Immediately advertise the new instance if no other, or
+		 * let it go naturally from queue if ADV is already happening
+		 */
+		instance = 0;
+	}
+
+	if (instance)
+		return hci_schedule_adv_instance_sync(hdev, instance, true);
+
+	return err;
+}
+
+static void send_count(struct mgmt_mesh_tx *mesh_tx, void *data)
+{
+	struct mgmt_rp_mesh_read_features *rp = data;
+
+	if (rp->used_handles >= rp->max_handles)
+		return;
+
+	rp->handles[rp->used_handles++] = mesh_tx->handle;
+}
+
+static int mesh_features(struct sock *sk, struct hci_dev *hdev,
+			 void *data, u16 len)
+{
+	struct mgmt_rp_mesh_read_features rp;
+
+	if (!lmp_le_capable(hdev))
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES,
+				       MGMT_STATUS_NOT_SUPPORTED);
+
+	memset(&rp, 0, sizeof(rp));
+	rp.index = cpu_to_le16(hdev->id);
+	if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+		rp.max_handles = MESH_HANDLES_MAX;
+
+	hci_dev_lock(hdev);
+
+	if (rp.max_handles)
+		mgmt_mesh_foreach(hdev, send_count, &rp, sk);
+
+	mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES, 0, &rp,
+			  rp.used_handles + sizeof(rp) - MESH_HANDLES_MAX);
+
+	hci_dev_unlock(hdev);
+	return 0;
+}
+
+static int send_cancel(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_mesh_send_cancel *cancel = (void *)cmd->param;
+	struct mgmt_mesh_tx *mesh_tx;
+
+	if (!cancel->handle) {
+		do {
+			mesh_tx = mgmt_mesh_next(hdev, cmd->sk);
+
+			if (mesh_tx)
+				mesh_send_complete(hdev, mesh_tx, false);
+		} while (mesh_tx);
+	} else {
+		mesh_tx = mgmt_mesh_find(hdev, cancel->handle);
+
+		if (mesh_tx && mesh_tx->sk == cmd->sk)
+			mesh_send_complete(hdev, mesh_tx, false);
+	}
+
+	mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+			  0, NULL, 0);
+	mgmt_pending_free(cmd);
+
+	return 0;
+}
+
+static int mesh_send_cancel(struct sock *sk, struct hci_dev *hdev,
+			    void *data, u16 len)
+{
+	struct mgmt_pending_cmd *cmd;
+	int err;
+
+	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+				       MGMT_STATUS_REJECTED);
+
+	hci_dev_lock(hdev);
+	cmd = mgmt_pending_new(sk, MGMT_OP_MESH_SEND_CANCEL, hdev, data, len);
+	if (!cmd)
+		err = -ENOMEM;
+	else
+		err = hci_cmd_sync_queue(hdev, send_cancel, cmd, NULL);
+
+	if (err < 0) {
+		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+				      MGMT_STATUS_FAILED);
+
+		if (cmd)
+			mgmt_pending_free(cmd);
+	}
+
+	hci_dev_unlock(hdev);
+	return err;
+}
+
+static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+	struct mgmt_cp_mesh_send *send = data;
+	struct mgmt_rp_mesh_read_features rp;
+	bool sending;
+	int err = 0;
+
+	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) ||
+	    len <= MGMT_MESH_SEND_SIZE ||
+	    len > (MGMT_MESH_SEND_SIZE + 31))
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+				       MGMT_STATUS_REJECTED);
+
+	hci_dev_lock(hdev);
+
+	memset(&rp, 0, sizeof(rp));
+	rp.max_handles = MESH_HANDLES_MAX;
+
+	mgmt_mesh_foreach(hdev, send_count, &rp, sk);
+
+	if (rp.max_handles <= rp.used_handles) {
+		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+				      MGMT_STATUS_BUSY);
+		goto done;
+	}
+
+	sending = hci_dev_test_flag(hdev, HCI_MESH_SENDING);
+	mesh_tx = mgmt_mesh_add(sk, hdev, send, len);
+
+	if (!mesh_tx)
+		err = -ENOMEM;
+	else if (!sending)
+		err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
+					 mesh_send_start_complete);
+
+	if (err < 0) {
+		bt_dev_err(hdev, "Send Mesh Failed %d", err);
+		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+				      MGMT_STATUS_FAILED);
+
+		if (mesh_tx) {
+			if (sending)
+				mgmt_mesh_remove(mesh_tx);
+		}
+	} else {
+		hci_dev_set_flag(hdev, HCI_MESH_SENDING);
+
+		mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_SEND, 0,
+				  &mesh_tx->handle, 1);
+	}
+
+done:
+	hci_dev_unlock(hdev);
+	return err;
+}
+
 static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 {
 	struct mgmt_mode *cp = data;
@@ -5993,6 +6355,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
 	if (!hdev_is_powered(hdev) ||
 	    (val == hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
 	     (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) ||
+	    hci_dev_test_flag(hdev, HCI_MESH) ||
 	    hci_conn_num(hdev, LE_LINK) > 0 ||
 	    (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
 	     hdev->le_scan_type == LE_SCAN_ACTIVE)) {
@@ -7921,8 +8284,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
 	/* In extended adv TX_POWER returned from Set Adv Param
 	 * will be always valid.
 	 */
-	if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) ||
-	    ext_adv_capable(hdev))
+	if (hdev->adv_tx_power != HCI_TX_POWER_INVALID || ext_adv_capable(hdev))
 		flags |= MGMT_ADV_FLAG_TX_POWER;
 
 	if (ext_adv_capable(hdev)) {
@@ -7975,8 +8337,14 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
 
 	instance = rp->instance;
 	list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
-		*instance = adv_instance->instance;
-		instance++;
+		/* Only instances 1-le_num_of_adv_sets are externally visible */
+		if (adv_instance->instance <= hdev->adv_instance_cnt) {
+			*instance = adv_instance->instance;
+			instance++;
+		} else {
+			rp->num_instances--;
+			rp_len--;
+		}
 	}
 
 	hci_dev_unlock(hdev);
@@ -8238,7 +8606,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
 				   timeout, duration,
 				   HCI_ADV_TX_POWER_NO_PREFERENCE,
 				   hdev->le_adv_min_interval,
-				   hdev->le_adv_max_interval);
+				   hdev->le_adv_max_interval, 0);
 	if (IS_ERR(adv)) {
 		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
 				      MGMT_STATUS_FAILED);
@@ -8442,7 +8810,7 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
 	/* Create advertising instance with no advertising or response data */
 	adv = hci_add_adv_instance(hdev, cp->instance, flags, 0, NULL, 0, NULL,
 				   timeout, duration, tx_power, min_interval,
-				   max_interval);
+				   max_interval, 0);
 
 	if (IS_ERR(adv)) {
 		err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
@@ -8888,8 +9256,13 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
 	{ add_ext_adv_data,        MGMT_ADD_EXT_ADV_DATA_SIZE,
 						HCI_MGMT_VAR_LEN },
 	{ add_adv_patterns_monitor_rssi,
-				   MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE,
+				   MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE },
+	{ set_mesh,                MGMT_SET_MESH_RECEIVER_SIZE,
 						HCI_MGMT_VAR_LEN },
+	{ mesh_features,           MGMT_MESH_READ_FEATURES_SIZE },
+	{ mesh_send,               MGMT_MESH_SEND_SIZE,
+						HCI_MGMT_VAR_LEN },
+	{ mesh_send_cancel,        MGMT_MESH_SEND_CANCEL_SIZE },
 };
 
 void mgmt_index_added(struct hci_dev *hdev)
@@ -9829,14 +10202,86 @@ static void mgmt_adv_monitor_device_found(struct hci_dev *hdev,
 		kfree_skb(skb);
 }
 
+static void mesh_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+			      u8 addr_type, s8 rssi, u32 flags, u8 *eir,
+			      u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+			      u64 instant)
+{
+	struct sk_buff *skb;
+	struct mgmt_ev_mesh_device_found *ev;
+	int i, j;
+
+	if (!hdev->mesh_ad_types[0])
+		goto accepted;
+
+	/* Scan for requested AD types */
+	if (eir_len > 0) {
+		for (i = 0; i + 1 < eir_len; i += eir[i] + 1) {
+			for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) {
+				if (!hdev->mesh_ad_types[j])
+					break;
+
+				if (hdev->mesh_ad_types[j] == eir[i + 1])
+					goto accepted;
+			}
+		}
+	}
+
+	if (scan_rsp_len > 0) {
+		for (i = 0; i + 1 < scan_rsp_len; i += scan_rsp[i] + 1) {
+			for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) {
+				if (!hdev->mesh_ad_types[j])
+					break;
+
+				if (hdev->mesh_ad_types[j] == scan_rsp[i + 1])
+					goto accepted;
+			}
+		}
+	}
+
+	return;
+
+accepted:
+	skb = mgmt_alloc_skb(hdev, MGMT_EV_MESH_DEVICE_FOUND,
+			     sizeof(*ev) + eir_len + scan_rsp_len);
+	if (!skb)
+		return;
+
+	ev = skb_put(skb, sizeof(*ev));
+
+	bacpy(&ev->addr.bdaddr, bdaddr);
+	ev->addr.type = link_to_bdaddr(LE_LINK, addr_type);
+	ev->rssi = rssi;
+	ev->flags = cpu_to_le32(flags);
+	ev->instant = cpu_to_le64(instant);
+
+	if (eir_len > 0)
+		/* Copy EIR or advertising data into event */
+		skb_put_data(skb, eir, eir_len);
+
+	if (scan_rsp_len > 0)
+		/* Append scan response data to event */
+		skb_put_data(skb, scan_rsp, scan_rsp_len);
+
+	ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
+
+	mgmt_event_skb(skb, NULL);
+}
+
 void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
 		       u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
-		       u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
+		       u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+		       u64 instant)
 {
 	struct sk_buff *skb;
 	struct mgmt_ev_device_found *ev;
 	bool report_device = hci_discovery_active(hdev);
 
+	if (hci_dev_test_flag(hdev, HCI_MESH) && link_type == LE_LINK)
+		mesh_device_found(hdev, bdaddr, addr_type, rssi, flags,
+				  eir, eir_len, scan_rsp, scan_rsp_len,
+				  instant);
+
 	/* Don't send events for a non-kernel initiated discovery. With
 	 * LE one exception is if we have pend_le_reports > 0 in which
 	 * case we're doing passive scanning and want these events.
@@ -9995,3 +10440,22 @@ void mgmt_exit(void)
 {
 	hci_mgmt_chan_unregister(&chan);
 }
+
+void mgmt_cleanup(struct sock *sk)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+	struct hci_dev *hdev;
+
+	read_lock(&hci_dev_list_lock);
+
+	list_for_each_entry(hdev, &hci_dev_list, list) {
+		do {
+			mesh_tx = mgmt_mesh_next(hdev, sk);
+
+			if (mesh_tx)
+				mesh_send_complete(hdev, mesh_tx, true);
+		} while (mesh_tx);
+	}
+
+	read_unlock(&hci_dev_list_lock);
+}
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index b69cfed62088..0115f783bde8 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -314,3 +314,77 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd)
 	list_del(&cmd->list);
 	mgmt_pending_free(cmd);
 }
+
+void mgmt_mesh_foreach(struct hci_dev *hdev,
+		       void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
+		       void *data, struct sock *sk)
+{
+	struct mgmt_mesh_tx *mesh_tx, *tmp;
+
+	list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) {
+		if (!sk || mesh_tx->sk == sk)
+			cb(mesh_tx, data);
+	}
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	if (list_empty(&hdev->mesh_pending))
+		return NULL;
+
+	list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) {
+		if (!sk || mesh_tx->sk == sk)
+			return mesh_tx;
+	}
+
+	return NULL;
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	if (list_empty(&hdev->mesh_pending))
+		return NULL;
+
+	list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) {
+		if (mesh_tx->handle == handle)
+			return mesh_tx;
+	}
+
+	return NULL;
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev,
+				   void *data, u16 len)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL);
+	if (!mesh_tx)
+		return NULL;
+
+	hdev->mesh_send_ref++;
+	if (!hdev->mesh_send_ref)
+		hdev->mesh_send_ref++;
+
+	mesh_tx->handle = hdev->mesh_send_ref;
+	mesh_tx->index = hdev->id;
+	memcpy(mesh_tx->param, data, len);
+	mesh_tx->param_len = len;
+	mesh_tx->sk = sk;
+	sock_hold(sk);
+
+	list_add_tail(&mesh_tx->list, &hdev->mesh_pending);
+
+	return mesh_tx;
+}
+
+void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx)
+{
+	list_del(&mesh_tx->list);
+	sock_put(mesh_tx->sk);
+	kfree(mesh_tx);
+}
diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h
index 98e40395a383..6a8b7e84293d 100644
--- a/net/bluetooth/mgmt_util.h
+++ b/net/bluetooth/mgmt_util.h
@@ -20,6 +20,16 @@
    SOFTWARE IS DISCLAIMED.
 */
 
+struct mgmt_mesh_tx {
+	struct list_head list;
+	int index;
+	size_t param_len;
+	struct sock *sk;
+	u8 handle;
+	u8 instance;
+	u8 param[sizeof(struct mgmt_cp_mesh_send) + 29];
+};
+
 struct mgmt_pending_cmd {
 	struct list_head list;
 	u16 opcode;
@@ -59,3 +69,11 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
 					  void *data, u16 len);
 void mgmt_pending_free(struct mgmt_pending_cmd *cmd);
 void mgmt_pending_remove(struct mgmt_pending_cmd *cmd);
+void mgmt_mesh_foreach(struct hci_dev *hdev,
+		       void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
+		       void *data, struct sock *sk);
+struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle);
+struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk);
+struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev,
+				   void *data, u16 len);
+void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx);
-- 
cgit v1.2.3


From af6bcc1921ff0b644d2d750c0e3a88623b7211f5 Mon Sep 17 00:00:00 2001
From: Brian Gix <brian.gix@intel.com>
Date: Thu, 1 Sep 2022 12:19:14 -0700
Subject: Bluetooth: Add experimental wrapper for MGMT based mesh

This introduces a "Mesh UUID" and an Experimental Feature bit to the
hdev mask, and depending all underlying Mesh functionality on it.

Signed-off-by: Brian Gix <brian.gix@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |   1 +
 net/bluetooth/mgmt.c        | 112 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index b3ade687531f..e004ba04a9ae 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -354,6 +354,7 @@ enum {
 	HCI_LE_SIMULTANEOUS_ROLES,
 	HCI_CMD_DRAIN_WORKQUEUE,
 
+	HCI_MESH_EXPERIMENTAL,
 	HCI_MESH,
 	HCI_MESH_SENDING,
 
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index e1c404ac8ce6..4c421ebac669 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -2203,7 +2203,8 @@ static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 
 	bt_dev_dbg(hdev, "sock %p", sk);
 
-	if (!lmp_le_capable(hdev))
+	if (!lmp_le_capable(hdev) ||
+	    !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
 				       MGMT_STATUS_NOT_SUPPORTED);
 
@@ -2322,7 +2323,8 @@ static int mesh_features(struct sock *sk, struct hci_dev *hdev,
 {
 	struct mgmt_rp_mesh_read_features rp;
 
-	if (!lmp_le_capable(hdev))
+	if (!lmp_le_capable(hdev) ||
+	    !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES,
 				       MGMT_STATUS_NOT_SUPPORTED);
 
@@ -2376,6 +2378,11 @@ static int mesh_send_cancel(struct sock *sk, struct hci_dev *hdev,
 	struct mgmt_pending_cmd *cmd;
 	int err;
 
+	if (!lmp_le_capable(hdev) ||
+	    !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+				       MGMT_STATUS_NOT_SUPPORTED);
+
 	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
 		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
 				       MGMT_STATUS_REJECTED);
@@ -2407,6 +2414,10 @@ static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
 	bool sending;
 	int err = 0;
 
+	if (!lmp_le_capable(hdev) ||
+	    !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+				       MGMT_STATUS_NOT_SUPPORTED);
 	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) ||
 	    len <= MGMT_MESH_SEND_SIZE ||
 	    len > (MGMT_MESH_SEND_SIZE + 31))
@@ -4365,17 +4376,30 @@ static const u8 iso_socket_uuid[16] = {
 	0x6a, 0x49, 0xe0, 0x05, 0x88, 0xf1, 0xba, 0x6f,
 };
 
+/* 2ce463d7-7a03-4d8d-bf05-5f24e8f36e76 */
+static const u8 mgmt_mesh_uuid[16] = {
+	0x76, 0x6e, 0xf3, 0xe8, 0x24, 0x5f, 0x05, 0xbf,
+	0x8d, 0x4d, 0x03, 0x7a, 0xd7, 0x63, 0xe4, 0x2c,
+};
+
 static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
 				  void *data, u16 data_len)
 {
-	char buf[122];   /* Enough space for 6 features: 2 + 20 * 6 */
-	struct mgmt_rp_read_exp_features_info *rp = (void *)buf;
+	struct mgmt_rp_read_exp_features_info *rp;
+	size_t len;
 	u16 idx = 0;
 	u32 flags;
+	int status;
 
 	bt_dev_dbg(hdev, "sock %p", sk);
 
-	memset(&buf, 0, sizeof(buf));
+	/* Enough space for 7 features */
+	len = sizeof(*rp) + (sizeof(rp->features[0]) * 7);
+	rp = kmalloc(len, GFP_KERNEL);
+	if (!rp)
+		return -ENOMEM;
+
+	memset(rp, 0, len);
 
 #ifdef CONFIG_BT_FEATURE_DEBUG
 	if (!hdev) {
@@ -4439,6 +4463,17 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
 		idx++;
 	}
 
+	if (hdev && lmp_le_capable(hdev)) {
+		if (hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+			flags = BIT(0);
+		else
+			flags = 0;
+
+		memcpy(rp->features[idx].uuid, mgmt_mesh_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
 	rp->feature_count = cpu_to_le16(idx);
 
 	/* After reading the experimental features information, enable
@@ -4446,9 +4481,12 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
 	 */
 	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
 
-	return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
-				 MGMT_OP_READ_EXP_FEATURES_INFO,
-				 0, rp, sizeof(*rp) + (20 * idx));
+	status = mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
+				   MGMT_OP_READ_EXP_FEATURES_INFO,
+				   0, rp, sizeof(*rp) + (20 * idx));
+
+	kfree(rp);
+	return status;
 }
 
 static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev,
@@ -4576,6 +4614,63 @@ static int set_debug_func(struct sock *sk, struct hci_dev *hdev,
 }
 #endif
 
+static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev,
+			      struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+	struct mgmt_rp_set_exp_feature rp;
+	bool val, changed;
+	int err;
+
+	/* Command requires to use the controller index */
+	if (!hdev)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Changes can only be made when controller is powered down */
+	if (hdev_is_powered(hdev))
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_REJECTED);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	val = !!cp->param[0];
+
+	if (val) {
+		changed = !hci_dev_test_and_set_flag(hdev,
+						     HCI_MESH_EXPERIMENTAL);
+	} else {
+		hci_dev_clear_flag(hdev, HCI_MESH);
+		changed = hci_dev_test_and_clear_flag(hdev,
+						      HCI_MESH_EXPERIMENTAL);
+	}
+
+	memcpy(rp.uuid, mgmt_mesh_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	err = mgmt_cmd_complete(sk, hdev->id,
+				MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, mgmt_mesh_uuid, val, sk);
+
+	return err;
+}
+
 static int set_rpa_resolution_func(struct sock *sk, struct hci_dev *hdev,
 				   struct mgmt_cp_set_exp_feature *cp,
 				   u16 data_len)
@@ -4891,6 +4986,7 @@ static const struct mgmt_exp_feature {
 #ifdef CONFIG_BT_FEATURE_DEBUG
 	EXP_FEAT(debug_uuid, set_debug_func),
 #endif
+	EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func),
 	EXP_FEAT(rpa_resolution_uuid, set_rpa_resolution_func),
 	EXP_FEAT(quality_report_uuid, set_quality_report_func),
 	EXP_FEAT(offload_codecs_uuid, set_offload_codec_func),
-- 
cgit v1.2.3


From c1631dbc00c1e432713396aaa10d8bd825822db0 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 1 Sep 2022 17:24:13 -0700
Subject: Bluetooth: hci_sync: Fix hci_read_buffer_size_sync

hci_read_buffer_size_sync shall not use HCI_OP_LE_READ_BUFFER_SIZE_V2
sinze that is LE specific, instead it is hci_le_read_buffer_size_sync
version that shall use it.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216382
Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index fa433896ddc7..41b6d19c70b0 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -3339,12 +3339,6 @@ static const struct hci_init_stage amp_init2[] = {
 /* Read Buffer Size (ACL mtu, max pkt, etc.) */
 static int hci_read_buffer_size_sync(struct hci_dev *hdev)
 {
-	/* Use Read LE Buffer Size V2 if supported */
-	if (hdev->commands[41] & 0x20)
-		return __hci_cmd_sync_status(hdev,
-					     HCI_OP_LE_READ_BUFFER_SIZE_V2,
-					     0, NULL, HCI_CMD_TIMEOUT);
-
 	return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE,
 				     0, NULL, HCI_CMD_TIMEOUT);
 }
@@ -3558,6 +3552,12 @@ static const struct hci_init_stage hci_init2[] = {
 /* Read LE Buffer Size */
 static int hci_le_read_buffer_size_sync(struct hci_dev *hdev)
 {
+	/* Use Read LE Buffer Size V2 if supported */
+	if (hdev->commands[41] & 0x20)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_BUFFER_SIZE_V2,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE,
 				     0, NULL, HCI_CMD_TIMEOUT);
 }
-- 
cgit v1.2.3


From 14db0b3c7b837f4edeb7c1794290c2f345c7f627 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Mon, 15 Aug 2022 16:50:51 -0700
Subject: fscrypt: stop using PG_error to track error status

As a step towards freeing the PG_error flag for other uses, change ext4
and f2fs to stop using PG_error to track decryption errors.  Instead, if
a decryption error occurs, just mark the whole bio as failed.  The
coarser granularity isn't really a problem since it isn't any worse than
what the block layer provides, and errors from a multi-page readahead
aren't reported to applications unless a single-page read fails too.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <chao@kernel.org> # for f2fs part
Link: https://lore.kernel.org/r/20220815235052.86545-2-ebiggers@kernel.org
---
 fs/crypto/bio.c         | 16 ++++++++++------
 fs/ext4/readpage.c      | 10 ++++++----
 fs/f2fs/data.c          | 18 ++++++++++--------
 include/linux/fscrypt.h |  5 +++--
 4 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 2217fe5ece6f..1b4403136d05 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -25,21 +25,25 @@
  * then this function isn't applicable.  This function may sleep, so it must be
  * called from a workqueue rather than from the bio's bi_end_io callback.
  *
- * This function sets PG_error on any pages that contain any blocks that failed
- * to be decrypted.  The filesystem must not mark such pages uptodate.
+ * Return: %true on success; %false on failure.  On failure, bio->bi_status is
+ *	   also set to an error status.
  */
-void fscrypt_decrypt_bio(struct bio *bio)
+bool fscrypt_decrypt_bio(struct bio *bio)
 {
 	struct bio_vec *bv;
 	struct bvec_iter_all iter_all;
 
 	bio_for_each_segment_all(bv, bio, iter_all) {
 		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
+		int err = fscrypt_decrypt_pagecache_blocks(page, bv->bv_len,
 							   bv->bv_offset);
-		if (ret)
-			SetPageError(page);
+
+		if (err) {
+			bio->bi_status = errno_to_blk_status(err);
+			return false;
+		}
 	}
+	return true;
 }
 EXPORT_SYMBOL(fscrypt_decrypt_bio);
 
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e02a5f14e021..3d21eae267fc 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -75,7 +75,7 @@ static void __read_end_io(struct bio *bio)
 	bio_for_each_segment_all(bv, bio, iter_all) {
 		page = bv->bv_page;
 
-		/* PG_error was set if any post_read step failed */
+		/* PG_error was set if verity failed. */
 		if (bio->bi_status || PageError(page)) {
 			ClearPageUptodate(page);
 			/* will re-read again later */
@@ -96,10 +96,12 @@ static void decrypt_work(struct work_struct *work)
 {
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);
+	struct bio *bio = ctx->bio;
 
-	fscrypt_decrypt_bio(ctx->bio);
-
-	bio_post_read_processing(ctx);
+	if (fscrypt_decrypt_bio(bio))
+		bio_post_read_processing(ctx);
+	else
+		__read_end_io(bio);
 }
 
 static void verity_work(struct work_struct *work)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aa3ccddfa037..93cc2ec51c2a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -139,7 +139,7 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
 			continue;
 		}
 
-		/* PG_error was set if decryption or verity failed. */
+		/* PG_error was set if verity failed. */
 		if (bio->bi_status || PageError(page)) {
 			ClearPageUptodate(page);
 			/* will re-read again later */
@@ -185,7 +185,7 @@ static void f2fs_verify_bio(struct work_struct *work)
 			struct page *page = bv->bv_page;
 
 			if (!f2fs_is_compressed_page(page) &&
-			    !PageError(page) && !fsverity_verify_page(page))
+			    !fsverity_verify_page(page))
 				SetPageError(page);
 		}
 	} else {
@@ -236,10 +236,9 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
 	bio_for_each_segment_all(bv, ctx->bio, iter_all) {
 		struct page *page = bv->bv_page;
 
-		/* PG_error was set if decryption failed. */
 		if (f2fs_is_compressed_page(page))
-			f2fs_end_read_compressed_page(page, PageError(page),
-						blkaddr, in_task);
+			f2fs_end_read_compressed_page(page, false, blkaddr,
+						      in_task);
 		else
 			all_compressed = false;
 
@@ -259,14 +258,17 @@ static void f2fs_post_read_work(struct work_struct *work)
 {
 	struct bio_post_read_ctx *ctx =
 		container_of(work, struct bio_post_read_ctx, work);
+	struct bio *bio = ctx->bio;
 
-	if (ctx->enabled_steps & STEP_DECRYPT)
-		fscrypt_decrypt_bio(ctx->bio);
+	if ((ctx->enabled_steps & STEP_DECRYPT) && !fscrypt_decrypt_bio(bio)) {
+		f2fs_finish_read_bio(bio, true);
+		return;
+	}
 
 	if (ctx->enabled_steps & STEP_DECOMPRESS)
 		f2fs_handle_step_decompress(ctx, true);
 
-	f2fs_verify_and_finish_bio(ctx->bio, true);
+	f2fs_verify_and_finish_bio(bio, true);
 }
 
 static void f2fs_read_end_io(struct bio *bio)
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index b95b8601b9c1..488fd8c8f8af 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -351,7 +351,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);
 int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags);
 
 /* bio.c */
-void fscrypt_decrypt_bio(struct bio *bio);
+bool fscrypt_decrypt_bio(struct bio *bio);
 int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
 			  sector_t pblk, unsigned int len);
 
@@ -644,8 +644,9 @@ static inline int fscrypt_d_revalidate(struct dentry *dentry,
 }
 
 /* bio.c */
-static inline void fscrypt_decrypt_bio(struct bio *bio)
+static inline bool fscrypt_decrypt_bio(struct bio *bio)
 {
+	return true;
 }
 
 static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
-- 
cgit v1.2.3


From 1e660f7ebe0ff6ac65ee0000280392d878630a67 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Tue, 6 Sep 2022 19:38:53 -0700
Subject: bpf: Replace __ksize with ksize.

__ksize() was made private. Use ksize() instead.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5cc952da7d41..20621f5407d8 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -610,7 +610,7 @@ void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
 	if (!ptr)
 		return;
 
-	idx = bpf_mem_cache_idx(__ksize(ptr - LLIST_NODE_SZ));
+	idx = bpf_mem_cache_idx(ksize(ptr - LLIST_NODE_SZ));
 	if (idx < 0)
 		return;
 
-- 
cgit v1.2.3


From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:26:46 -0700
Subject: bpf: Allow struct argument in trampoline based programs

Allow struct argument in trampoline based programs where
the struct size should be <= 16 bytes. In such cases, the argument
will be put into up to 2 registers for bpf, x86_64 and arm64
architectures.

To support arch-specific trampoline manipulation,
add arg_flags for additional struct information about arguments
in btf_func_model. Such information will be used in arch specific
function arch_prepare_bpf_trampoline() to prepare argument access
properly in trampoline.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h |  4 ++++
 kernel/bpf/btf.c    | 45 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9c1674973e03..4d32f125f4af 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type {
  */
 #define MAX_BPF_FUNC_REG_ARGS 5
 
+/* The argument is a structure. */
+#define BTF_FMODEL_STRUCT_ARG		BIT(0)
+
 struct btf_func_model {
 	u8 ret_size;
 	u8 nr_args;
 	u8 arg_size[MAX_BPF_FUNC_ARGS];
+	u8 arg_flags[MAX_BPF_FUNC_ARGS];
 };
 
 /* Restore arguments before returning from trampoline to let original function
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 903719b89238..ea94527e5d70 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5328,6 +5328,34 @@ static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
 	return btf_type_is_int(t);
 }
 
+static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
+			   int off)
+{
+	const struct btf_param *args;
+	const struct btf_type *t;
+	u32 offset = 0, nr_args;
+	int i;
+
+	if (!func_proto)
+		return off / 8;
+
+	nr_args = btf_type_vlen(func_proto);
+	args = (const struct btf_param *)(func_proto + 1);
+	for (i = 0; i < nr_args; i++) {
+		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+		offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+		if (off < offset)
+			return i;
+	}
+
+	t = btf_type_skip_modifiers(btf, func_proto->type, NULL);
+	offset += btf_type_is_ptr(t) ? 8 : roundup(t->size, 8);
+	if (off < offset)
+		return nr_args;
+
+	return nr_args + 1;
+}
+
 bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 		    const struct bpf_prog *prog,
 		    struct bpf_insn_access_aux *info)
@@ -5347,7 +5375,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			tname, off);
 		return false;
 	}
-	arg = off / 8;
+	arg = get_ctx_arg_idx(btf, t, off);
 	args = (const struct btf_param *)(t + 1);
 	/* if (t == NULL) Fall back to default BPF prog with
 	 * MAX_BPF_FUNC_REG_ARGS u64 arguments.
@@ -5417,7 +5445,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	/* skip modifiers */
 	while (btf_type_is_modifier(t))
 		t = btf_type_by_id(btf, t->type);
-	if (btf_type_is_small_int(t) || btf_is_any_enum(t))
+	if (btf_type_is_small_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
 		/* accessing a scalar */
 		return true;
 	if (!btf_type_is_ptr(t)) {
@@ -5881,7 +5909,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
 	if (btf_type_is_ptr(t))
 		/* kernel size of pointer. Not BPF's size of pointer*/
 		return sizeof(void *);
-	if (btf_type_is_int(t) || btf_is_any_enum(t))
+	if (btf_type_is_int(t) || btf_is_any_enum(t) || __btf_type_is_struct(t))
 		return t->size;
 	return -EINVAL;
 }
@@ -5901,8 +5929,10 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 		/* BTF function prototype doesn't match the verifier types.
 		 * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
 		 */
-		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
 			m->arg_size[i] = 8;
+			m->arg_flags[i] = 0;
+		}
 		m->ret_size = 8;
 		m->nr_args = MAX_BPF_FUNC_REG_ARGS;
 		return 0;
@@ -5916,7 +5946,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 		return -EINVAL;
 	}
 	ret = __get_type_size(btf, func->type, &t);
-	if (ret < 0) {
+	if (ret < 0 || __btf_type_is_struct(t)) {
 		bpf_log(log,
 			"The function %s return type %s is unsupported.\n",
 			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -5932,7 +5962,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			return -EINVAL;
 		}
 		ret = __get_type_size(btf, args[i].type, &t);
-		if (ret < 0) {
+
+		/* No support of struct argument size greater than 16 bytes */
+		if (ret < 0 || ret > 16) {
 			bpf_log(log,
 				"The function %s arg%d type %s is unsupported.\n",
 				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
@@ -5945,6 +5977,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			return -EINVAL;
 		}
 		m->arg_size[i] = ret;
+		m->arg_flags[i] = __btf_type_is_struct(t) ? BTF_FMODEL_STRUCT_ARG : 0;
 	}
 	m->nr_args = nargs;
 	return 0;
-- 
cgit v1.2.3


From a9c5ad31fbdc4dec6d266fe22e51de1ad6d1bcf2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:26:52 -0700
Subject: bpf: x86: Support in-register struct arguments in trampoline programs

In C, struct value can be passed as a function argument.
For small structs, struct value may be passed in
one or more registers. For trampoline based bpf programs,
this would cause complication since one-to-one mapping between
function argument and arch argument register is not valid
any more.

The latest llvm16 added bpf support to pass by values
for struct up to 16 bytes ([1]). This is also true for
x86_64 architecture where two registers will hold
the struct value if the struct size is >8 and <= 16.
This may not be true if one of struct member is 'double'
type but in current linux source code we don't have
such instance yet, so we assume all >8 && <= 16 struct
holds two general purpose argument registers.

Also change on-stack nr_args value to the number
of registers holding the arguments. This will
permit bpf_get_func_arg() helper to get all
argument values.

 [1] https://reviews.llvm.org/D132144

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152652.2078600-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 68 +++++++++++++++++++++++++++++++++------------
 1 file changed, 51 insertions(+), 17 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index c1f6c1c51d99..ae89f4143eb4 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1751,34 +1751,60 @@ emit_jmp:
 static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
 		      int stack_size)
 {
-	int i;
+	int i, j, arg_size, nr_regs;
 	/* Store function arguments to stack.
 	 * For a function that accepts two pointers the sequence will be:
 	 * mov QWORD PTR [rbp-0x10],rdi
 	 * mov QWORD PTR [rbp-0x8],rsi
 	 */
-	for (i = 0; i < min(nr_args, 6); i++)
-		emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
-			 BPF_REG_FP,
-			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
-			 -(stack_size - i * 8));
+	for (i = 0, j = 0; i < min(nr_args, 6); i++) {
+		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+			nr_regs = (m->arg_size[i] + 7) / 8;
+			arg_size = 8;
+		} else {
+			nr_regs = 1;
+			arg_size = m->arg_size[i];
+		}
+
+		while (nr_regs) {
+			emit_stx(prog, bytes_to_bpf_size(arg_size),
+				 BPF_REG_FP,
+				 j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
+				 -(stack_size - j * 8));
+			nr_regs--;
+			j++;
+		}
+	}
 }
 
 static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
 			 int stack_size)
 {
-	int i;
+	int i, j, arg_size, nr_regs;
 
 	/* Restore function arguments from stack.
 	 * For a function that accepts two pointers the sequence will be:
 	 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
 	 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
 	 */
-	for (i = 0; i < min(nr_args, 6); i++)
-		emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
-			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
-			 BPF_REG_FP,
-			 -(stack_size - i * 8));
+	for (i = 0, j = 0; i < min(nr_args, 6); i++) {
+		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+			nr_regs = (m->arg_size[i] + 7) / 8;
+			arg_size = 8;
+		} else {
+			nr_regs = 1;
+			arg_size = m->arg_size[i];
+		}
+
+		while (nr_regs) {
+			emit_ldx(prog, bytes_to_bpf_size(arg_size),
+				 j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
+				 BPF_REG_FP,
+				 -(stack_size - j * 8));
+			nr_regs--;
+			j++;
+		}
+	}
 }
 
 static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
@@ -2015,7 +2041,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 				struct bpf_tramp_links *tlinks,
 				void *orig_call)
 {
-	int ret, i, nr_args = m->nr_args;
+	int ret, i, nr_args = m->nr_args, extra_nregs = 0;
 	int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
 	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
 	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
@@ -2028,6 +2054,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	if (nr_args > 6)
 		return -ENOTSUPP;
 
+	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+			extra_nregs += (m->arg_size[i] + 7) / 8 - 1;
+	}
+	if (nr_args + extra_nregs > 6)
+		return -ENOTSUPP;
+	stack_size += extra_nregs * 8;
+
 	/* Generated trampoline stack layout:
 	 *
 	 * RBP + 8         [ return address  ]
@@ -2040,7 +2074,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	 *                 [ ...             ]
 	 * RBP - regs_off  [ reg_arg1        ]  program's ctx pointer
 	 *
-	 * RBP - args_off  [ args count      ]  always
+	 * RBP - args_off  [ arg regs count  ]  always
 	 *
 	 * RBP - ip_off    [ traced function ]  BPF_TRAMP_F_IP_ARG flag
 	 *
@@ -2083,11 +2117,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
 	EMIT1(0x53);		 /* push rbx */
 
-	/* Store number of arguments of the traced function:
-	 *   mov rax, nr_args
+	/* Store number of argument registers of the traced function:
+	 *   mov rax, nr_args + extra_nregs
 	 *   mov QWORD PTR [rbp - args_off], rax
 	 */
-	emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+	emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
 	emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
 
 	if (flags & BPF_TRAMP_F_IP_ARG) {
-- 
cgit v1.2.3


From 27ed9353aec9de4277b3389c9f2b04beb6ab7622 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:26:57 -0700
Subject: bpf: Update descriptions for helpers bpf_get_func_arg[_cnt]()

Now instead of the number of arguments, the number of registers
holding argument values are stored in trampoline. Update
the description of bpf_get_func_arg[_cnt]() helpers. Previous
programs without struct arguments should continue to work
as usual.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152657.2078805-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 9 +++++----
 tools/include/uapi/linux/bpf.h | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 793103b10eab..3df78c56c1bf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5079,12 +5079,12 @@ union bpf_attr {
  *
  * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
  *	Description
- *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		Get **n**-th argument register (zero based) of the traced function (for tracing programs)
  *		returned in **value**.
  *
  *	Return
  *		0 on success.
- *		**-EINVAL** if n >= arguments count of traced function.
+ *		**-EINVAL** if n >= argument register count of traced function.
  *
  * long bpf_get_func_ret(void *ctx, u64 *value)
  *	Description
@@ -5097,10 +5097,11 @@ union bpf_attr {
  *
  * long bpf_get_func_arg_cnt(void *ctx)
  *	Description
- *		Get number of arguments of the traced function (for tracing programs).
+ *		Get number of registers of the traced function (for tracing programs) where
+ *		function arguments are stored in these registers.
  *
  *	Return
- *		The number of arguments of the traced function.
+ *		The number of argument registers of the traced function.
  *
  * int bpf_get_retval(void)
  *	Description
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 793103b10eab..3df78c56c1bf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5079,12 +5079,12 @@ union bpf_attr {
  *
  * long bpf_get_func_arg(void *ctx, u32 n, u64 *value)
  *	Description
- *		Get **n**-th argument (zero based) of the traced function (for tracing programs)
+ *		Get **n**-th argument register (zero based) of the traced function (for tracing programs)
  *		returned in **value**.
  *
  *	Return
  *		0 on success.
- *		**-EINVAL** if n >= arguments count of traced function.
+ *		**-EINVAL** if n >= argument register count of traced function.
  *
  * long bpf_get_func_ret(void *ctx, u64 *value)
  *	Description
@@ -5097,10 +5097,11 @@ union bpf_attr {
  *
  * long bpf_get_func_arg_cnt(void *ctx)
  *	Description
- *		Get number of arguments of the traced function (for tracing programs).
+ *		Get number of registers of the traced function (for tracing programs) where
+ *		function arguments are stored in these registers.
  *
  *	Return
- *		The number of arguments of the traced function.
+ *		The number of argument registers of the traced function.
  *
  * int bpf_get_retval(void)
  *	Description
-- 
cgit v1.2.3


From eb707dde264af5eb0271156d7fbd59133fa02cac Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:27:02 -0700
Subject: bpf: arm64: No support of struct argument in trampoline programs

ARM64 does not support struct argument for trampoline based
bpf programs yet.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152702.2079066-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 389623ae5a91..30f76178608b 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1970,7 +1970,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 				u32 flags, struct bpf_tramp_links *tlinks,
 				void *orig_call)
 {
-	int ret;
+	int i, ret;
 	int nargs = m->nr_args;
 	int max_insns = ((long)image_end - (long)image) / AARCH64_INSN_SIZE;
 	struct jit_ctx ctx = {
@@ -1982,6 +1982,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image,
 	if (nargs > 8)
 		return -ENOTSUPP;
 
+	/* don't support struct argument */
+	for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+		if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+			return -ENOTSUPP;
+	}
+
 	ret = prepare_trampoline(&ctx, im, tlinks, orig_call, nargs, flags);
 	if (ret < 0)
 		return ret;
-- 
cgit v1.2.3


From 34586d29f8dfc4ae30642c5b9a4db8a4a7af6869 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:27:07 -0700
Subject: libbpf: Add new BPF_PROG2 macro

To support struct arguments in trampoline based programs,
existing BPF_PROG doesn't work any more since
the type size is needed to find whether a parameter
takes one or two registers. So this patch added a new
BPF_PROG2 macro to support such trampoline programs.

The idea is suggested by Andrii. For example, if the
to-be-traced function has signature like
  typedef struct {
       void *x;
       int t;
  } sockptr;
  int blah(sockptr x, char y);

In the new BPF_PROG2 macro, the argument can be
represented as
  __bpf_prog_call(
     ({ union {
          struct { __u64 x, y; } ___z;
          sockptr x;
        } ___tmp = { .___z = { ctx[0], ctx[1] }};
        ___tmp.x;
     }),
     ({ union {
          struct { __u8 x; } ___z;
          char y;
        } ___tmp = { .___z = { ctx[2] }};
        ___tmp.y;
     }));
In the above, the values stored on the stack are properly
assigned to the actual argument type value by using 'union'
magic. Note that the macro also works even if no arguments
are with struct types.

Note that new BPF_PROG2 works for both llvm16 and pre-llvm16
compilers where llvm16 supports bpf target passing value
with struct up to 16 byte size and pre-llvm16 will pass
by reference by storing values on the stack. With static functions
with struct argument as always inline, the compiler is able
to optimize and remove additional stack saving of struct values.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152707.2079473-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/lib/bpf/bpf_tracing.h | 79 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 5fdb93da423b..8d4bdd18cb3d 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -438,6 +438,85 @@ typeof(name(0)) name(unsigned long long *ctx)				    \
 static __always_inline typeof(name(0))					    \
 ____##name(unsigned long long *ctx, ##args)
 
+#ifndef ____bpf_nth
+#define ____bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N
+#endif
+#ifndef ____bpf_narg
+#define ____bpf_narg(...) ____bpf_nth(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0)
+#endif
+
+#define BPF_REG_CNT(t) \
+	(__builtin_choose_expr(sizeof(t) == 1 || sizeof(t) == 2 || sizeof(t) == 4 || sizeof(t) == 8, 1,	\
+	 __builtin_choose_expr(sizeof(t) == 16, 2,							\
+			       (void)0)))
+
+#define ____bpf_reg_cnt0()			(0)
+#define ____bpf_reg_cnt1(t, x)			(____bpf_reg_cnt0() + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt2(t, x, args...)		(____bpf_reg_cnt1(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt3(t, x, args...)		(____bpf_reg_cnt2(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt4(t, x, args...)		(____bpf_reg_cnt3(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt5(t, x, args...)		(____bpf_reg_cnt4(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt6(t, x, args...)		(____bpf_reg_cnt5(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt7(t, x, args...)		(____bpf_reg_cnt6(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt8(t, x, args...)		(____bpf_reg_cnt7(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt9(t, x, args...)		(____bpf_reg_cnt8(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt10(t, x, args...)	(____bpf_reg_cnt9(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt11(t, x, args...)	(____bpf_reg_cnt10(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt12(t, x, args...)	(____bpf_reg_cnt11(args) + BPF_REG_CNT(t))
+#define ____bpf_reg_cnt(args...)	 ___bpf_apply(____bpf_reg_cnt, ____bpf_narg(args))(args)
+
+#define ____bpf_union_arg(t, x, n) \
+	__builtin_choose_expr(sizeof(t) == 1, ({ union { struct { __u8 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]}}; ___tmp.x; }), \
+	__builtin_choose_expr(sizeof(t) == 2, ({ union { struct { __u16 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \
+	__builtin_choose_expr(sizeof(t) == 4, ({ union { struct { __u32 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \
+	__builtin_choose_expr(sizeof(t) == 8, ({ union { struct { __u64 x; } ___z; t x; } ___tmp = {.___z = {ctx[n]} }; ___tmp.x; }), \
+	__builtin_choose_expr(sizeof(t) == 16, ({ union { struct { __u64 x, y; } ___z; t x; } ___tmp = {.___z = {ctx[n], ctx[n + 1]} }; ___tmp.x; }), \
+			      (void)0)))))
+
+#define ____bpf_ctx_arg0(n, args...)
+#define ____bpf_ctx_arg1(n, t, x)		, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt1(t, x))
+#define ____bpf_ctx_arg2(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt2(t, x, args)) ____bpf_ctx_arg1(n, args)
+#define ____bpf_ctx_arg3(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt3(t, x, args)) ____bpf_ctx_arg2(n, args)
+#define ____bpf_ctx_arg4(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt4(t, x, args)) ____bpf_ctx_arg3(n, args)
+#define ____bpf_ctx_arg5(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt5(t, x, args)) ____bpf_ctx_arg4(n, args)
+#define ____bpf_ctx_arg6(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt6(t, x, args)) ____bpf_ctx_arg5(n, args)
+#define ____bpf_ctx_arg7(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt7(t, x, args)) ____bpf_ctx_arg6(n, args)
+#define ____bpf_ctx_arg8(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt8(t, x, args)) ____bpf_ctx_arg7(n, args)
+#define ____bpf_ctx_arg9(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt9(t, x, args)) ____bpf_ctx_arg8(n, args)
+#define ____bpf_ctx_arg10(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt10(t, x, args)) ____bpf_ctx_arg9(n, args)
+#define ____bpf_ctx_arg11(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt11(t, x, args)) ____bpf_ctx_arg10(n, args)
+#define ____bpf_ctx_arg12(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt12(t, x, args)) ____bpf_ctx_arg11(n, args)
+#define ____bpf_ctx_arg(n, args...)	___bpf_apply(____bpf_ctx_arg, ____bpf_narg(args))(n, args)
+
+#define ____bpf_ctx_decl0()
+#define ____bpf_ctx_decl1(t, x)			, t x
+#define ____bpf_ctx_decl2(t, x, args...)	, t x ____bpf_ctx_decl1(args)
+#define ____bpf_ctx_decl3(t, x, args...)	, t x ____bpf_ctx_decl2(args)
+#define ____bpf_ctx_decl4(t, x, args...)	, t x ____bpf_ctx_decl3(args)
+#define ____bpf_ctx_decl5(t, x, args...)	, t x ____bpf_ctx_decl4(args)
+#define ____bpf_ctx_decl6(t, x, args...)	, t x ____bpf_ctx_decl5(args)
+#define ____bpf_ctx_decl7(t, x, args...)	, t x ____bpf_ctx_decl6(args)
+#define ____bpf_ctx_decl8(t, x, args...)	, t x ____bpf_ctx_decl7(args)
+#define ____bpf_ctx_decl9(t, x, args...)	, t x ____bpf_ctx_decl8(args)
+#define ____bpf_ctx_decl10(t, x, args...)	, t x ____bpf_ctx_decl9(args)
+#define ____bpf_ctx_decl11(t, x, args...)	, t x ____bpf_ctx_decl10(args)
+#define ____bpf_ctx_decl12(t, x, args...)	, t x ____bpf_ctx_decl11(args)
+#define ____bpf_ctx_decl(args...)	___bpf_apply(____bpf_ctx_decl, ____bpf_narg(args))(args)
+
+/*
+ * BPF_PROG2 can handle struct arguments.
+ */
+#define BPF_PROG2(name, args...)						\
+name(unsigned long long *ctx);							\
+static __always_inline typeof(name(0))						\
+____##name(unsigned long long *ctx ____bpf_ctx_decl(args));			\
+typeof(name(0)) name(unsigned long long *ctx)					\
+{										\
+	return ____##name(ctx ____bpf_ctx_arg(____bpf_reg_cnt(args), args));	\
+}										\
+static __always_inline typeof(name(0))						\
+____##name(unsigned long long *ctx ____bpf_ctx_decl(args))
+
 struct pt_regs;
 
 #define ___bpf_kprobe_args0()           ctx
-- 
cgit v1.2.3


From 1642a3945e223a922312fab2401ecdf58b3825b9 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:27:13 -0700
Subject: selftests/bpf: Add struct argument tests with fentry/fexit programs.

Add various struct argument tests with fentry/fexit programs.
Also add one test with a kernel func which does not have any
argument to test BPF_PROG2 macro in such situation.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152713.2080039-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c        |  48 +++++++++
 .../selftests/bpf/prog_tests/tracing_struct.c      |  63 +++++++++++
 tools/testing/selftests/bpf/progs/tracing_struct.c | 120 +++++++++++++++++++++
 3 files changed, 231 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/tracing_struct.c
 create mode 100644 tools/testing/selftests/bpf/progs/tracing_struct.c

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 792cb15bac40..a6021d6117b5 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -18,6 +18,46 @@ typedef int (*func_proto_typedef_nested1)(func_proto_typedef);
 typedef int (*func_proto_typedef_nested2)(func_proto_typedef_nested1);
 
 DEFINE_PER_CPU(int, bpf_testmod_ksym_percpu) = 123;
+long bpf_testmod_test_struct_arg_result;
+
+struct bpf_testmod_struct_arg_1 {
+	int a;
+};
+struct bpf_testmod_struct_arg_2 {
+	long a;
+	long b;
+};
+
+noinline int
+bpf_testmod_test_struct_arg_1(struct bpf_testmod_struct_arg_2 a, int b, int c) {
+	bpf_testmod_test_struct_arg_result = a.a + a.b  + b + c;
+	return bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_struct_arg_2(int a, struct bpf_testmod_struct_arg_2 b, int c) {
+	bpf_testmod_test_struct_arg_result = a + b.a + b.b + c;
+	return bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_struct_arg_3(int a, int b, struct bpf_testmod_struct_arg_2 c) {
+	bpf_testmod_test_struct_arg_result = a + b + c.a + c.b;
+	return bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_struct_arg_4(struct bpf_testmod_struct_arg_1 a, int b,
+			      int c, int d, struct bpf_testmod_struct_arg_2 e) {
+	bpf_testmod_test_struct_arg_result = a.a + b + c + d + e.a + e.b;
+	return bpf_testmod_test_struct_arg_result;
+}
+
+noinline int
+bpf_testmod_test_struct_arg_5(void) {
+	bpf_testmod_test_struct_arg_result = 1;
+	return bpf_testmod_test_struct_arg_result;
+}
 
 noinline void
 bpf_testmod_test_mod_kfunc(int i)
@@ -98,11 +138,19 @@ bpf_testmod_test_read(struct file *file, struct kobject *kobj,
 		.off = off,
 		.len = len,
 	};
+	struct bpf_testmod_struct_arg_1 struct_arg1 = {10};
+	struct bpf_testmod_struct_arg_2 struct_arg2 = {2, 3};
 	int i = 1;
 
 	while (bpf_testmod_return_ptr(i))
 		i++;
 
+	(void)bpf_testmod_test_struct_arg_1(struct_arg2, 1, 4);
+	(void)bpf_testmod_test_struct_arg_2(1, struct_arg2, 4);
+	(void)bpf_testmod_test_struct_arg_3(1, 4, struct_arg2);
+	(void)bpf_testmod_test_struct_arg_4(struct_arg1, 1, 2, 3, struct_arg2);
+	(void)bpf_testmod_test_struct_arg_5();
+
 	/* This is always true. Use the check to make sure the compiler
 	 * doesn't remove bpf_testmod_loop_test.
 	 */
diff --git a/tools/testing/selftests/bpf/prog_tests/tracing_struct.c b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c
new file mode 100644
index 000000000000..d5022b91d1e4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tracing_struct.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <test_progs.h>
+#include "tracing_struct.skel.h"
+
+static void test_fentry(void)
+{
+	struct tracing_struct *skel;
+	int err;
+
+	skel = tracing_struct__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "tracing_struct__open_and_load"))
+		return;
+
+	err = tracing_struct__attach(skel);
+	if (!ASSERT_OK(err, "tracing_struct__attach"))
+		return;
+
+	ASSERT_OK(trigger_module_test_read(256), "trigger_read");
+
+	ASSERT_EQ(skel->bss->t1_a_a, 2, "t1:a.a");
+	ASSERT_EQ(skel->bss->t1_a_b, 3, "t1:a.b");
+	ASSERT_EQ(skel->bss->t1_b, 1, "t1:b");
+	ASSERT_EQ(skel->bss->t1_c, 4, "t1:c");
+
+	ASSERT_EQ(skel->bss->t1_nregs, 4, "t1 nregs");
+	ASSERT_EQ(skel->bss->t1_reg0, 2, "t1 reg0");
+	ASSERT_EQ(skel->bss->t1_reg1, 3, "t1 reg1");
+	ASSERT_EQ(skel->bss->t1_reg2, 1, "t1 reg2");
+	ASSERT_EQ(skel->bss->t1_reg3, 4, "t1 reg3");
+	ASSERT_EQ(skel->bss->t1_ret, 10, "t1 ret");
+
+	ASSERT_EQ(skel->bss->t2_a, 1, "t2:a");
+	ASSERT_EQ(skel->bss->t2_b_a, 2, "t2:b.a");
+	ASSERT_EQ(skel->bss->t2_b_b, 3, "t2:b.b");
+	ASSERT_EQ(skel->bss->t2_c, 4, "t2:c");
+	ASSERT_EQ(skel->bss->t2_ret, 10, "t2 ret");
+
+	ASSERT_EQ(skel->bss->t3_a, 1, "t3:a");
+	ASSERT_EQ(skel->bss->t3_b, 4, "t3:b");
+	ASSERT_EQ(skel->bss->t3_c_a, 2, "t3:c.a");
+	ASSERT_EQ(skel->bss->t3_c_b, 3, "t3:c.b");
+	ASSERT_EQ(skel->bss->t3_ret, 10, "t3 ret");
+
+	ASSERT_EQ(skel->bss->t4_a_a, 10, "t4:a.a");
+	ASSERT_EQ(skel->bss->t4_b, 1, "t4:b");
+	ASSERT_EQ(skel->bss->t4_c, 2, "t4:c");
+	ASSERT_EQ(skel->bss->t4_d, 3, "t4:d");
+	ASSERT_EQ(skel->bss->t4_e_a, 2, "t4:e.a");
+	ASSERT_EQ(skel->bss->t4_e_b, 3, "t4:e.b");
+	ASSERT_EQ(skel->bss->t4_ret, 21, "t4 ret");
+
+	ASSERT_EQ(skel->bss->t5_ret, 1, "t5 ret");
+
+	tracing_struct__detach(skel);
+	tracing_struct__destroy(skel);
+}
+
+void test_tracing_struct(void)
+{
+	test_fentry();
+}
diff --git a/tools/testing/selftests/bpf/progs/tracing_struct.c b/tools/testing/selftests/bpf/progs/tracing_struct.c
new file mode 100644
index 000000000000..e718f0ebee7d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tracing_struct.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+struct bpf_testmod_struct_arg_1 {
+	int a;
+};
+struct bpf_testmod_struct_arg_2 {
+	long a;
+	long b;
+};
+
+long t1_a_a, t1_a_b, t1_b, t1_c, t1_ret, t1_nregs;
+__u64 t1_reg0, t1_reg1, t1_reg2, t1_reg3;
+long t2_a, t2_b_a, t2_b_b, t2_c, t2_ret;
+long t3_a, t3_b, t3_c_a, t3_c_b, t3_ret;
+long t4_a_a, t4_b, t4_c, t4_d, t4_e_a, t4_e_b, t4_ret;
+long t5_ret;
+
+SEC("fentry/bpf_testmod_test_struct_arg_1")
+int BPF_PROG2(test_struct_arg_1, struct bpf_testmod_struct_arg_2, a, int, b, int, c)
+{
+	t1_a_a = a.a;
+	t1_a_b = a.b;
+	t1_b = b;
+	t1_c = c;
+	return 0;
+}
+
+SEC("fexit/bpf_testmod_test_struct_arg_1")
+int BPF_PROG2(test_struct_arg_2, struct bpf_testmod_struct_arg_2, a, int, b, int, c, int, ret)
+{
+	t1_nregs =  bpf_get_func_arg_cnt(ctx);
+	/* a.a */
+	bpf_get_func_arg(ctx, 0, &t1_reg0);
+	/* a.b */
+	bpf_get_func_arg(ctx, 1, &t1_reg1);
+	/* b */
+	bpf_get_func_arg(ctx, 2, &t1_reg2);
+	t1_reg2 = (int)t1_reg2;
+	/* c */
+	bpf_get_func_arg(ctx, 3, &t1_reg3);
+	t1_reg3 = (int)t1_reg3;
+
+	t1_ret = ret;
+	return 0;
+}
+
+SEC("fentry/bpf_testmod_test_struct_arg_2")
+int BPF_PROG2(test_struct_arg_3, int, a, struct bpf_testmod_struct_arg_2, b, int, c)
+{
+	t2_a = a;
+	t2_b_a = b.a;
+	t2_b_b = b.b;
+	t2_c = c;
+	return 0;
+}
+
+SEC("fexit/bpf_testmod_test_struct_arg_2")
+int BPF_PROG2(test_struct_arg_4, int, a, struct bpf_testmod_struct_arg_2, b, int, c, int, ret)
+{
+	t2_ret = ret;
+	return 0;
+}
+
+SEC("fentry/bpf_testmod_test_struct_arg_3")
+int BPF_PROG2(test_struct_arg_5, int, a, int, b, struct bpf_testmod_struct_arg_2, c)
+{
+	t3_a = a;
+	t3_b = b;
+	t3_c_a = c.a;
+	t3_c_b = c.b;
+	return 0;
+}
+
+SEC("fexit/bpf_testmod_test_struct_arg_3")
+int BPF_PROG2(test_struct_arg_6, int, a, int, b, struct bpf_testmod_struct_arg_2, c, int, ret)
+{
+	t3_ret = ret;
+	return 0;
+}
+
+SEC("fentry/bpf_testmod_test_struct_arg_4")
+int BPF_PROG2(test_struct_arg_7, struct bpf_testmod_struct_arg_1, a, int, b,
+	     int, c, int, d, struct bpf_testmod_struct_arg_2, e)
+{
+	t4_a_a = a.a;
+	t4_b = b;
+	t4_c = c;
+	t4_d = d;
+	t4_e_a = e.a;
+	t4_e_b = e.b;
+	return 0;
+}
+
+SEC("fexit/bpf_testmod_test_struct_arg_4")
+int BPF_PROG2(test_struct_arg_8, struct bpf_testmod_struct_arg_1, a, int, b,
+	     int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret)
+{
+	t4_ret = ret;
+	return 0;
+}
+
+SEC("fentry/bpf_testmod_test_struct_arg_5")
+int BPF_PROG2(test_struct_arg_9)
+{
+	return 0;
+}
+
+SEC("fexit/bpf_testmod_test_struct_arg_5")
+int BPF_PROG2(test_struct_arg_10, int, ret)
+{
+	t5_ret = ret;
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From a7c2ca3a2f697044094475055b3fba3929b234e4 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:27:18 -0700
Subject: selftests/bpf: Use BPF_PROG2 for some fentry programs without struct
 arguments

Use BPF_PROG2 instead of BPF_PROG for programs in progs/timer.c
to test BPF_PROG2 for cases without struct arguments.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152718.2081091-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/progs/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/timer.c b/tools/testing/selftests/bpf/progs/timer.c
index 0053c5402173..acda5c9cea93 100644
--- a/tools/testing/selftests/bpf/progs/timer.c
+++ b/tools/testing/selftests/bpf/progs/timer.c
@@ -120,7 +120,7 @@ static int timer_cb1(void *map, int *key, struct bpf_timer *timer)
 }
 
 SEC("fentry/bpf_fentry_test1")
-int BPF_PROG(test1, int a)
+int BPF_PROG2(test1, int, a)
 {
 	struct bpf_timer *arr_timer, *lru_timer;
 	struct elem init = {};
@@ -236,7 +236,7 @@ int bpf_timer_test(void)
 }
 
 SEC("fentry/bpf_fentry_test2")
-int BPF_PROG(test2, int a, int b)
+int BPF_PROG2(test2, int, a, int, b)
 {
 	struct hmap_elem init = {}, *val;
 	int key = HTAB, key_malloc = HTAB_MALLOC;
-- 
cgit v1.2.3


From ae63c10fc241a94bb916da96d40c8810f9ad7f18 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 31 Aug 2022 08:27:23 -0700
Subject: selftests/bpf: Add tracing_struct test in DENYLIST.s390x

Add tracing_struct test in DENYLIST.s390x since s390x does not
support trampoline now.

Signed-off-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220831152723.2081551-1-yhs@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 18fbb6eab1e2..168c5b287b5c 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -70,3 +70,4 @@ setget_sockopt                           # attach unexpected error: -524
 cb_refs                                  # expected error message unexpected error: -524                               (trampoline)
 cgroup_hierarchical_stats                # JIT does not support calling kernel function                                (kfunc)
 htab_update                              # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
+tracing_struct                           # failed to auto-attach: -524                                                 (trampoline)
-- 
cgit v1.2.3


From 934f70d9d498a7fffd611edf48356698f1d72e91 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Tue, 6 Sep 2022 21:25:08 +0800
Subject: audit: remove selinux_audit_rule_update() declaration

selinux_audit_rule_update() has been renamed to audit_update_lsm_rules()
since commit d7a96f3a1ae2 ("Audit: internally use the new LSM audit
hooks"), so remove it.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/audit.h b/kernel/audit.h
index d6eb7b59c791..c57b008b9914 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -245,8 +245,6 @@ struct audit_netlink_list {
 
 int audit_send_list_thread(void *_dest);
 
-extern int selinux_audit_rule_update(void);
-
 extern struct mutex audit_filter_mutex;
 extern int audit_del_rule(struct audit_entry *entry);
 extern void audit_free_rule_rcu(struct rcu_head *head);
-- 
cgit v1.2.3


From 012ba1156e4a7b38062d109b818cb479a68c87ba Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:12:57 +0200
Subject: selftests/bpf: regroup and declare similar kfuncs selftests in an
 array

Similar to tools/testing/selftests/bpf/prog_tests/dynptr.c:
we declare an array of tests that we run one by one in a for loop.

Followup patches will add more similar-ish tests, so avoid a lot of copy
paste by grouping the declaration in an array.

For light skeletons, we have to rely on the offsetof() macro so we can
statically declare which program we are using.
In the libbpf case, we can rely on bpf_object__find_program_by_name().
So also change the Makefile to generate both light skeletons and normal
ones.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-2-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/Makefile               |  5 +-
 .../testing/selftests/bpf/prog_tests/kfunc_call.c  | 81 +++++++++++++++++-----
 2 files changed, 68 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index c10adecb5a73..6cd327f1f216 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -351,11 +351,12 @@ LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		test_subskeleton.skel.h test_subskeleton_lib.skel.h	\
 		test_usdt.skel.h
 
-LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \
+LSKELS := fentry_test.c fexit_test.c fexit_sleep.c \
 	test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \
 	map_ptr_kern.c core_kern.c core_kern_overflow.c
 # Generate both light skeleton and libbpf skeleton for these
-LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_subprog.c
+LSKELS_EXTRA := test_ksyms_module.c test_ksyms_weak.c kfunc_call_test.c \
+	kfunc_call_test_subprog.c
 SKEL_BLACKLIST += $$(LSKELS)
 
 test_static_linked.skel.h-deps := test_static_linked1.bpf.o test_static_linked2.bpf.o
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index eede7c304f86..9dfbe5355a2d 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2021 Facebook */
 #include <test_progs.h>
 #include <network_helpers.h>
+#include "kfunc_call_test.skel.h"
 #include "kfunc_call_test.lskel.h"
 #include "kfunc_call_test_subprog.skel.h"
 #include "kfunc_call_test_subprog.lskel.h"
@@ -9,9 +10,31 @@
 
 #include "cap_helpers.h"
 
-static void test_main(void)
+struct kfunc_test_params {
+	const char *prog_name;
+	unsigned long lskel_prog_desc_offset;
+	int retval;
+};
+
+#define TC_TEST(name, __retval) \
+	{ \
+	  .prog_name = #name, \
+	  .lskel_prog_desc_offset = offsetof(struct kfunc_call_test_lskel, progs.name), \
+	  .retval = __retval, \
+	}
+
+static struct kfunc_test_params kfunc_tests[] = {
+	TC_TEST(kfunc_call_test1, 12),
+	TC_TEST(kfunc_call_test2, 3),
+	TC_TEST(kfunc_call_test_ref_btf_id, 0),
+};
+
+static void verify_success(struct kfunc_test_params *param)
 {
-	struct kfunc_call_test_lskel *skel;
+	struct kfunc_call_test_lskel *lskel = NULL;
+	struct bpf_prog_desc *lskel_prog;
+	struct kfunc_call_test *skel;
+	struct bpf_program *prog;
 	int prog_fd, err;
 	LIBBPF_OPTS(bpf_test_run_opts, topts,
 		.data_in = &pkt_v4,
@@ -19,26 +42,53 @@ static void test_main(void)
 		.repeat = 1,
 	);
 
-	skel = kfunc_call_test_lskel__open_and_load();
+	/* first test with normal libbpf */
+	skel = kfunc_call_test__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "skel"))
 		return;
 
-	prog_fd = skel->progs.kfunc_call_test1.prog_fd;
-	err = bpf_prog_test_run_opts(prog_fd, &topts);
-	ASSERT_OK(err, "bpf_prog_test_run(test1)");
-	ASSERT_EQ(topts.retval, 12, "test1-retval");
+	prog = bpf_object__find_program_by_name(skel->obj, param->prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
 
-	prog_fd = skel->progs.kfunc_call_test2.prog_fd;
+	prog_fd = bpf_program__fd(prog);
 	err = bpf_prog_test_run_opts(prog_fd, &topts);
-	ASSERT_OK(err, "bpf_prog_test_run(test2)");
-	ASSERT_EQ(topts.retval, 3, "test2-retval");
+	if (!ASSERT_OK(err, param->prog_name))
+		goto cleanup;
+
+	if (!ASSERT_EQ(topts.retval, param->retval, "retval"))
+		goto cleanup;
+
+	/* second test with light skeletons */
+	lskel = kfunc_call_test_lskel__open_and_load();
+	if (!ASSERT_OK_PTR(lskel, "lskel"))
+		goto cleanup;
+
+	lskel_prog = (struct bpf_prog_desc *)((char *)lskel + param->lskel_prog_desc_offset);
 
-	prog_fd = skel->progs.kfunc_call_test_ref_btf_id.prog_fd;
+	prog_fd = lskel_prog->prog_fd;
 	err = bpf_prog_test_run_opts(prog_fd, &topts);
-	ASSERT_OK(err, "bpf_prog_test_run(test_ref_btf_id)");
-	ASSERT_EQ(topts.retval, 0, "test_ref_btf_id-retval");
+	if (!ASSERT_OK(err, param->prog_name))
+		goto cleanup;
+
+	ASSERT_EQ(topts.retval, param->retval, "retval");
+
+cleanup:
+	kfunc_call_test__destroy(skel);
+	if (lskel)
+		kfunc_call_test_lskel__destroy(lskel);
+}
+
+static void test_main(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(kfunc_tests); i++) {
+		if (!test__start_subtest(kfunc_tests[i].prog_name))
+			continue;
 
-	kfunc_call_test_lskel__destroy(skel);
+		verify_success(&kfunc_tests[i]);
+	}
 }
 
 static void test_subprog(void)
@@ -121,8 +171,7 @@ static void test_destructive(void)
 
 void test_kfunc_call(void)
 {
-	if (test__start_subtest("main"))
-		test_main();
+	test_main();
 
 	if (test__start_subtest("subprog"))
 		test_subprog();
-- 
cgit v1.2.3


From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:12:58 +0200
Subject: bpf: split btf_check_subprog_arg_match in two

btf_check_subprog_arg_match() was used twice in verifier.c:
- when checking for the type mismatches between a (sub)prog declaration
  and BTF
- when checking the call of a subprog to see if the provided arguments
  are correct and valid

This is problematic when we check if the first argument of a program
(pointer to ctx) is correctly accessed:
To be able to ensure we access a valid memory in the ctx, the verifier
assumes the pointer to context is not null.
This has the side effect of marking the program accessing the entire
context, even if the context is never dereferenced.

For example, by checking the context access with the current code, the
following eBPF program would fail with -EINVAL if the ctx is set to null
from the userspace:

```
SEC("syscall")
int prog(struct my_ctx *args) {
  return 0;
}
```

In that particular case, we do not want to actually check that the memory
is correct while checking for the BTF validity, but we just want to
ensure that the (sub)prog definition matches the BTF we have.

So split btf_check_subprog_arg_match() in two so we can actually check
for the memory used when in a call, and ignore that part when not.

Note that a further patch is in preparation to disentangled
btf_check_func_arg_match() from these two purposes, and so right now we
just add a new hack around that by adding a boolean to this function.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/btf.c      | 54 ++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/bpf/verifier.c |  2 +-
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4d32f125f4af..3cf161cfd396 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 struct bpf_reg_state;
 int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+			   struct bpf_reg_state *regs);
 int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      const struct btf *btf, u32 func_id,
 			      struct bpf_reg_state *regs,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ea94527e5d70..9291e2b2c950 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6203,7 +6203,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
 				    bool ptr_to_mem_ok,
-				    u32 kfunc_flags)
+				    u32 kfunc_flags,
+				    bool processing_call)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
 	bool rel = false, kptr_get = false, trusted_arg = false;
@@ -6389,7 +6390,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 					reg_ref_tname);
 				return -EINVAL;
 			}
-		} else if (ptr_to_mem_ok) {
+		} else if (ptr_to_mem_ok && processing_call) {
 			const struct btf_type *resolve_ret;
 			u32 type_size;
 
@@ -6464,7 +6465,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 	return rel ? ref_regno : 0;
 }
 
-/* Compare BTF of a function with given bpf_reg_state.
+/* Compare BTF of a function declaration with given bpf_reg_state.
  * Returns:
  * EFAULT - there is a verifier bug. Abort verification.
  * EINVAL - there is a type mismatch or BTF is not available.
@@ -6491,7 +6492,50 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 		return -EINVAL;
 
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0);
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false);
+
+	/* Compiler optimizations can remove arguments from static functions
+	 * or mismatched type can be passed into a global function.
+	 * In such cases mark the function as unreliable from BTF point of view.
+	 */
+	if (err)
+		prog->aux->func_info_aux[subprog].unreliable = true;
+	return err;
+}
+
+/* Compare BTF of a function call with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ *
+ * NOTE: the code is duplicated from btf_check_subprog_arg_match()
+ * because btf_check_func_arg_match() is still doing both. Once that
+ * function is split in 2, we can call from here btf_check_subprog_arg_match()
+ * first, and then treat the calling part in a new code path.
+ */
+int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
+			   struct bpf_reg_state *regs)
+{
+	struct bpf_prog *prog = env->prog;
+	struct btf *btf = prog->aux->btf;
+	bool is_global;
+	u32 btf_id;
+	int err;
+
+	if (!prog->aux->func_info)
+		return -EINVAL;
+
+	btf_id = prog->aux->func_info[subprog].type_id;
+	if (!btf_id)
+		return -EFAULT;
+
+	if (prog->aux->func_info_aux[subprog].unreliable)
+		return -EINVAL;
+
+	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true);
 
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
@@ -6507,7 +6551,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      struct bpf_reg_state *regs,
 			      u32 kfunc_flags)
 {
-	return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags);
+	return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true);
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 003f7ba19558..7d9a2e18ca8a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6629,7 +6629,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	func_info_aux = env->prog->aux->func_info_aux;
 	if (func_info_aux)
 		is_global = func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_subprog_arg_match(env, subprog, caller->regs);
+	err = btf_check_subprog_call(env, subprog, caller->regs);
 	if (err == -EFAULT)
 		return err;
 	if (is_global) {
-- 
cgit v1.2.3


From 15baa55ff5b00b81bcd9874b89cb8e0b0daaa13d Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:12:59 +0200
Subject: bpf/verifier: allow all functions to read user provided context

When a function was trying to access data from context in a syscall eBPF
program, the verifier was rejecting the call unless it was accessing the
first element.
This is because the syscall context is not known at compile time, and
so we need to check this when actually accessing it.

Check for the valid memory access if there is no convert_ctx callback,
and allow such situation to happen.

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-4-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7d9a2e18ca8a..3cfe60206de6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5233,6 +5233,25 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 				env,
 				regno, reg->off, access_size,
 				zero_size_allowed, ACCESS_HELPER, meta);
+	case PTR_TO_CTX:
+		/* in case the function doesn't know how to access the context,
+		 * (because we are in a program of type SYSCALL for example), we
+		 * can not statically check its size.
+		 * Dynamically check it now.
+		 */
+		if (!env->ops->convert_ctx_access) {
+			enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ;
+			int offset = access_size - 1;
+
+			/* Allow zero-byte read from PTR_TO_CTX */
+			if (access_size == 0)
+				return zero_size_allowed ? 0 : -EACCES;
+
+			return check_mem_access(env, env->insn_idx, regno, offset, BPF_B,
+						atype, -1, false);
+		}
+
+		fallthrough;
 	default: /* scalar_value or invalid ptr */
 		/* Allow zero-byte read from NULL, regardless of pointer type */
 		if (zero_size_allowed && access_size == 0 &&
-- 
cgit v1.2.3


From fb66223a244f252273995557b23e0fa53092e92c Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:13:00 +0200
Subject: selftests/bpf: add test for accessing ctx from syscall program type

We need to also export the kfunc set to the syscall program type,
and then add a couple of eBPF programs that are testing those calls.

The first one checks for valid access, and the second one is OK
from a static analysis point of view but fails at run time because
we are trying to access outside of the allocated memory.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-5-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/bpf/test_run.c                                 |   1 +
 .../testing/selftests/bpf/prog_tests/kfunc_call.c  | 143 ++++++++++++++++++++-
 .../testing/selftests/bpf/progs/kfunc_call_fail.c  |  39 ++++++
 .../testing/selftests/bpf/progs/kfunc_call_test.c  |  38 ++++++
 4 files changed, 214 insertions(+), 7 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/kfunc_call_fail.c

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 25d8ecf105aa..f16baf977a21 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -1634,6 +1634,7 @@ static int __init bpf_prog_test_run_init(void)
 
 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set);
 	return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc,
 						  ARRAY_SIZE(bpf_prog_test_dtor_kfunc),
 						  THIS_MODULE);
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index 9dfbe5355a2d..d5881c3331a8 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2021 Facebook */
 #include <test_progs.h>
 #include <network_helpers.h>
+#include "kfunc_call_fail.skel.h"
 #include "kfunc_call_test.skel.h"
 #include "kfunc_call_test.lskel.h"
 #include "kfunc_call_test_subprog.skel.h"
@@ -10,37 +11,96 @@
 
 #include "cap_helpers.h"
 
+static size_t log_buf_sz = 1048576; /* 1 MB */
+static char obj_log_buf[1048576];
+
+enum kfunc_test_type {
+	tc_test = 0,
+	syscall_test,
+	syscall_null_ctx_test,
+};
+
 struct kfunc_test_params {
 	const char *prog_name;
 	unsigned long lskel_prog_desc_offset;
 	int retval;
+	enum kfunc_test_type test_type;
+	const char *expected_err_msg;
 };
 
-#define TC_TEST(name, __retval) \
+#define __BPF_TEST_SUCCESS(name, __retval, type) \
 	{ \
 	  .prog_name = #name, \
 	  .lskel_prog_desc_offset = offsetof(struct kfunc_call_test_lskel, progs.name), \
 	  .retval = __retval, \
+	  .test_type = type, \
+	  .expected_err_msg = NULL, \
+	}
+
+#define __BPF_TEST_FAIL(name, __retval, type, error_msg) \
+	{ \
+	  .prog_name = #name, \
+	  .lskel_prog_desc_offset = 0 /* unused when test is failing */, \
+	  .retval = __retval, \
+	  .test_type = type, \
+	  .expected_err_msg = error_msg, \
 	}
 
+#define TC_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, tc_test)
+#define SYSCALL_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_test)
+#define SYSCALL_NULL_CTX_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_null_ctx_test)
+
+#define SYSCALL_NULL_CTX_FAIL(name, retval, error_msg) \
+	__BPF_TEST_FAIL(name, retval, syscall_null_ctx_test, error_msg)
+
 static struct kfunc_test_params kfunc_tests[] = {
+	/* failure cases:
+	 * if retval is 0 -> the program will fail to load and the error message is an error
+	 * if retval is not 0 -> the program can be loaded but running it will gives the
+	 *                       provided return value. The error message is thus the one
+	 *                       from a successful load
+	 */
+	SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_fail, -EINVAL, "processed 4 insns"),
+	SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_null_fail, -EINVAL, "processed 4 insns"),
+
+	/* success cases */
 	TC_TEST(kfunc_call_test1, 12),
 	TC_TEST(kfunc_call_test2, 3),
 	TC_TEST(kfunc_call_test_ref_btf_id, 0),
+	SYSCALL_TEST(kfunc_syscall_test, 0),
+	SYSCALL_NULL_CTX_TEST(kfunc_syscall_test_null, 0),
+};
+
+struct syscall_test_args {
+	__u8 data[16];
+	size_t size;
 };
 
 static void verify_success(struct kfunc_test_params *param)
 {
 	struct kfunc_call_test_lskel *lskel = NULL;
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
 	struct bpf_prog_desc *lskel_prog;
 	struct kfunc_call_test *skel;
 	struct bpf_program *prog;
 	int prog_fd, err;
-	LIBBPF_OPTS(bpf_test_run_opts, topts,
-		.data_in = &pkt_v4,
-		.data_size_in = sizeof(pkt_v4),
-		.repeat = 1,
-	);
+	struct syscall_test_args args = {
+		.size = 10,
+	};
+
+	switch (param->test_type) {
+	case syscall_test:
+		topts.ctx_in = &args;
+		topts.ctx_size_in = sizeof(args);
+		/* fallthrough */
+	case syscall_null_ctx_test:
+		break;
+	case tc_test:
+		topts.data_in = &pkt_v4;
+		topts.data_size_in = sizeof(pkt_v4);
+		topts.repeat = 1;
+		break;
+	}
 
 	/* first test with normal libbpf */
 	skel = kfunc_call_test__open_and_load();
@@ -79,6 +139,72 @@ cleanup:
 		kfunc_call_test_lskel__destroy(lskel);
 }
 
+static void verify_fail(struct kfunc_test_params *param)
+{
+	LIBBPF_OPTS(bpf_object_open_opts, opts);
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	struct bpf_program *prog;
+	struct kfunc_call_fail *skel;
+	int prog_fd, err;
+	struct syscall_test_args args = {
+		.size = 10,
+	};
+
+	opts.kernel_log_buf = obj_log_buf;
+	opts.kernel_log_size = log_buf_sz;
+	opts.kernel_log_level = 1;
+
+	switch (param->test_type) {
+	case syscall_test:
+		topts.ctx_in = &args;
+		topts.ctx_size_in = sizeof(args);
+		/* fallthrough */
+	case syscall_null_ctx_test:
+		break;
+	case tc_test:
+		topts.data_in = &pkt_v4;
+		topts.data_size_in = sizeof(pkt_v4);
+		break;
+		topts.repeat = 1;
+	}
+
+	skel = kfunc_call_fail__open_opts(&opts);
+	if (!ASSERT_OK_PTR(skel, "kfunc_call_fail__open_opts"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, param->prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	bpf_program__set_autoload(prog, true);
+
+	err = kfunc_call_fail__load(skel);
+	if (!param->retval) {
+		/* the verifier is supposed to complain and refuses to load */
+		if (!ASSERT_ERR(err, "unexpected load success"))
+			goto out_err;
+
+	} else {
+		/* the program is loaded but must dynamically fail */
+		if (!ASSERT_OK(err, "unexpected load error"))
+			goto out_err;
+
+		prog_fd = bpf_program__fd(prog);
+		err = bpf_prog_test_run_opts(prog_fd, &topts);
+		if (!ASSERT_EQ(err, param->retval, param->prog_name))
+			goto out_err;
+	}
+
+out_err:
+	if (!ASSERT_OK_PTR(strstr(obj_log_buf, param->expected_err_msg), "expected_err_msg")) {
+		fprintf(stderr, "Expected err_msg: %s\n", param->expected_err_msg);
+		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
+	}
+
+cleanup:
+	kfunc_call_fail__destroy(skel);
+}
+
 static void test_main(void)
 {
 	int i;
@@ -87,7 +213,10 @@ static void test_main(void)
 		if (!test__start_subtest(kfunc_tests[i].prog_name))
 			continue;
 
-		verify_success(&kfunc_tests[i]);
+		if (!kfunc_tests[i].expected_err_msg)
+			verify_success(&kfunc_tests[i]);
+		else
+			verify_fail(&kfunc_tests[i]);
 	}
 }
 
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
new file mode 100644
index 000000000000..4168027f2ab1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+
+extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym;
+
+struct syscall_test_args {
+	__u8 data[16];
+	size_t size;
+};
+
+SEC("?syscall")
+int kfunc_syscall_test_fail(struct syscall_test_args *args)
+{
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, sizeof(*args) + 1);
+
+	return 0;
+}
+
+SEC("?syscall")
+int kfunc_syscall_test_null_fail(struct syscall_test_args *args)
+{
+	/* Must be called with args as a NULL pointer
+	 * we do not check for it to have the verifier consider that
+	 * the pointer might not be null, and so we can load it.
+	 *
+	 * So the following can not be added:
+	 *
+	 * if (args)
+	 *      return -22;
+	 */
+
+	bpf_kfunc_call_test_mem_len_pass1(args, sizeof(*args));
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
index 5aecbb9fdc68..9e1914916f1d 100644
--- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -92,4 +92,42 @@ int kfunc_call_test_pass(struct __sk_buff *skb)
 	return 0;
 }
 
+struct syscall_test_args {
+	__u8 data[16];
+	size_t size;
+};
+
+SEC("syscall")
+int kfunc_syscall_test(struct syscall_test_args *args)
+{
+	const long size = args->size;
+
+	if (size > sizeof(args->data))
+		return -7; /* -E2BIG */
+
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, sizeof(args->data));
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, sizeof(*args));
+	bpf_kfunc_call_test_mem_len_pass1(&args->data, size);
+
+	return 0;
+}
+
+SEC("syscall")
+int kfunc_syscall_test_null(struct syscall_test_args *args)
+{
+	/* Must be called with args as a NULL pointer
+	 * we do not check for it to have the verifier consider that
+	 * the pointer might not be null, and so we can load it.
+	 *
+	 * So the following can not be added:
+	 *
+	 * if (args)
+	 *      return -22;
+	 */
+
+	bpf_kfunc_call_test_mem_len_pass1(args, 0);
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From f9b348185f4d684cc19e6bd9b87904823d5aa5ed Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:13:01 +0200
Subject: bpf/btf: bump BTF_KFUNC_SET_MAX_CNT

net/bpf/test_run.c is already presenting 20 kfuncs.
net/netfilter/nf_conntrack_bpf.c is also presenting an extra 10 kfuncs.

Given that all the kfuncs are regrouped into one unique set, having
only 2 space left prevent us to add more selftests.

Bump it to 256.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-6-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9291e2b2c950..2c2d8190ca4a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -208,7 +208,7 @@ enum btf_kfunc_hook {
 };
 
 enum {
-	BTF_KFUNC_SET_MAX_CNT = 32,
+	BTF_KFUNC_SET_MAX_CNT = 256,
 	BTF_DTOR_KFUNC_MAX_CNT = 256,
 };
 
-- 
cgit v1.2.3


From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:13:02 +0200
Subject: bpf/verifier: allow kfunc to return an allocated mem

For drivers (outside of network), the incoming data is not statically
defined in a struct. Most of the time the data buffer is kzalloc-ed
and thus we can not rely on eBPF and BTF to explore the data.

This commit allows to return an arbitrary memory, previously allocated by
the driver.
An interesting extra point is that the kfunc can mark the exported
memory region as read only or read/write.

So, when a kfunc is not returning a pointer to a struct but to a plain
type, we can consider it is a valid allocated memory assuming that:
- one of the arguments is either called rdonly_buf_size or
  rdwr_buf_size
- and this argument is a const from the caller point of view

We can then use this parameter as the size of the allocated memory.

The memory is either read-only or read-write based on the name
of the size parameter.

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   9 +++-
 include/linux/bpf_verifier.h |   2 +
 include/linux/btf.h          |  10 +++++
 kernel/bpf/btf.c             | 101 ++++++++++++++++++++++++++++++++++---------
 kernel/bpf/verifier.c        |  45 +++++++++++++------
 5 files changed, 133 insertions(+), 34 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3cf161cfd396..79883f883ff3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 			   const char *func_name,
 			   struct btf_func_model *m);
 
+struct bpf_kfunc_arg_meta {
+	u64 r0_size;
+	bool r0_rdonly;
+	int ref_obj_id;
+	u32 flags;
+};
+
 struct bpf_reg_state;
 int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 				struct bpf_reg_state *regs);
@@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      const struct btf *btf, u32 func_id,
 			      struct bpf_reg_state *regs,
-			      u32 kfunc_flags);
+			      struct bpf_kfunc_arg_meta *meta);
 int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			  struct bpf_reg_state *reg);
 int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 1fdddbf3546b..8fbc1d05281e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
 			    struct bpf_attach_target_info *tgt_info);
 void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab);
 
+int mark_chain_precision(struct bpf_verifier_env *env, int regno);
+
 #define BPF_BASE_TYPE_MASK	GENMASK(BPF_BASE_TYPE_BITS - 1, 0)
 
 /* extract base type from bpf_{arg, return, reg}_type. */
diff --git a/include/linux/btf.h b/include/linux/btf.h
index ad93c2d9cc1c..1fcc833a8690 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt
 }
 #endif
 
+static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t)
+{
+	if (!btf_type_is_ptr(t))
+		return false;
+
+	t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+	return btf_type_is_struct(t);
+}
+
 #endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2c2d8190ca4a..9d12212fcd61 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6199,11 +6199,36 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
 	return true;
 }
 
+static bool btf_is_kfunc_arg_mem_size(const struct btf *btf,
+				      const struct btf_param *arg,
+				      const struct bpf_reg_state *reg,
+				      const char *name)
+{
+	int len, target_len = strlen(name);
+	const struct btf_type *t;
+	const char *param_name;
+
+	t = btf_type_skip_modifiers(btf, arg->type, NULL);
+	if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+		return false;
+
+	param_name = btf_name_by_offset(btf, arg->name_off);
+	if (str_is_empty(param_name))
+		return false;
+	len = strlen(param_name);
+	if (len != target_len)
+		return false;
+	if (strcmp(param_name, name))
+		return false;
+
+	return true;
+}
+
 static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    const struct btf *btf, u32 func_id,
 				    struct bpf_reg_state *regs,
 				    bool ptr_to_mem_ok,
-				    u32 kfunc_flags,
+				    struct bpf_kfunc_arg_meta *kfunc_meta,
 				    bool processing_call)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
@@ -6241,12 +6266,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	if (is_kfunc) {
+	if (is_kfunc && kfunc_meta) {
 		/* Only kfunc can be release func */
-		rel = kfunc_flags & KF_RELEASE;
-		kptr_get = kfunc_flags & KF_KPTR_GET;
-		trusted_arg = kfunc_flags & KF_TRUSTED_ARGS;
-		sleepable = kfunc_flags & KF_SLEEPABLE;
+		rel = kfunc_meta->flags & KF_RELEASE;
+		kptr_get = kfunc_meta->flags & KF_KPTR_GET;
+		trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS;
+		sleepable = kfunc_meta->flags & KF_SLEEPABLE;
 	}
 
 	/* check that BTF function arguments match actual types that the
@@ -6259,6 +6284,38 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 
 		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
 		if (btf_type_is_scalar(t)) {
+			if (is_kfunc && kfunc_meta) {
+				bool is_buf_size = false;
+
+				/* check for any const scalar parameter of name "rdonly_buf_size"
+				 * or "rdwr_buf_size"
+				 */
+				if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
+							      "rdonly_buf_size")) {
+					kfunc_meta->r0_rdonly = true;
+					is_buf_size = true;
+				} else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
+								     "rdwr_buf_size"))
+					is_buf_size = true;
+
+				if (is_buf_size) {
+					if (kfunc_meta->r0_size) {
+						bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+						return -EINVAL;
+					}
+
+					if (!tnum_is_const(reg->var_off)) {
+						bpf_log(log, "R%d is not a const\n", regno);
+						return -EINVAL;
+					}
+
+					kfunc_meta->r0_size = reg->var_off.value;
+					ret = mark_chain_precision(env, regno);
+					if (ret)
+						return ret;
+				}
+			}
+
 			if (reg->type == SCALAR_VALUE)
 				continue;
 			bpf_log(log, "R%d is not a scalar\n", regno);
@@ -6289,6 +6346,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		if (ret < 0)
 			return ret;
 
+		if (is_kfunc && reg->ref_obj_id) {
+			/* Ensure only one argument is referenced PTR_TO_BTF_ID */
+			if (ref_obj_id) {
+				bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+					regno, reg->ref_obj_id, ref_obj_id);
+				return -EFAULT;
+			}
+			ref_regno = regno;
+			ref_obj_id = reg->ref_obj_id;
+		}
+
 		/* kptr_get is only true for kfunc */
 		if (i == 0 && kptr_get) {
 			struct bpf_map_value_off_desc *off_desc;
@@ -6361,16 +6429,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			if (reg->type == PTR_TO_BTF_ID) {
 				reg_btf = reg->btf;
 				reg_ref_id = reg->btf_id;
-				/* Ensure only one argument is referenced PTR_TO_BTF_ID */
-				if (reg->ref_obj_id) {
-					if (ref_obj_id) {
-						bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
-							regno, reg->ref_obj_id, ref_obj_id);
-						return -EFAULT;
-					}
-					ref_regno = regno;
-					ref_obj_id = reg->ref_obj_id;
-				}
 			} else {
 				reg_btf = btf_vmlinux;
 				reg_ref_id = *reg2btf_ids[base_type(reg->type)];
@@ -6461,6 +6519,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
+	if (kfunc_meta && ref_obj_id)
+		kfunc_meta->ref_obj_id = ref_obj_id;
+
 	/* returns argument register number > 0 in case of reference release kfunc */
 	return rel ? ref_regno : 0;
 }
@@ -6492,7 +6553,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
 		return -EINVAL;
 
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, false);
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false);
 
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
@@ -6535,7 +6596,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 		return -EINVAL;
 
 	is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
-	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0, true);
+	err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true);
 
 	/* Compiler optimizations can remove arguments from static functions
 	 * or mismatched type can be passed into a global function.
@@ -6549,9 +6610,9 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
 int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
 			      const struct btf *btf, u32 func_id,
 			      struct bpf_reg_state *regs,
-			      u32 kfunc_flags)
+			      struct bpf_kfunc_arg_meta *meta)
 {
-	return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags, true);
+	return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true);
 }
 
 /* Convert BTF of a function into bpf_reg_state if possible
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3cfe60206de6..f3344a86d88d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2908,7 +2908,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
-static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+int mark_chain_precision(struct bpf_verifier_env *env, int regno)
 {
 	return __mark_chain_precision(env, regno, -1);
 }
@@ -7595,6 +7595,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 {
 	const struct btf_type *t, *func, *func_proto, *ptr_type;
 	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_kfunc_arg_meta meta = { 0 };
 	const char *func_name, *ptr_type_name;
 	u32 i, nargs, func_id, ptr_type_id;
 	int err, insn_idx = *insn_idx_p;
@@ -7629,8 +7630,10 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 
 	acq = *kfunc_flags & KF_ACQUIRE;
 
+	meta.flags = *kfunc_flags;
+
 	/* Check the arguments */
-	err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags);
+	err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta);
 	if (err < 0)
 		return err;
 	/* In case of release function, we get register number of refcounted
@@ -7651,7 +7654,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	/* Check return type */
 	t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
 
-	if (acq && !btf_type_is_ptr(t)) {
+	if (acq && !btf_type_is_struct_ptr(desc_btf, t)) {
 		verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
 		return -EINVAL;
 	}
@@ -7663,17 +7666,33 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
 						   &ptr_type_id);
 		if (!btf_type_is_struct(ptr_type)) {
-			ptr_type_name = btf_name_by_offset(desc_btf,
-							   ptr_type->name_off);
-			verbose(env, "kernel function %s returns pointer type %s %s is not supported\n",
-				func_name, btf_type_str(ptr_type),
-				ptr_type_name);
-			return -EINVAL;
+			if (!meta.r0_size) {
+				ptr_type_name = btf_name_by_offset(desc_btf,
+								   ptr_type->name_off);
+				verbose(env,
+					"kernel function %s returns pointer type %s %s is not supported\n",
+					func_name,
+					btf_type_str(ptr_type),
+					ptr_type_name);
+				return -EINVAL;
+			}
+
+			mark_reg_known_zero(env, regs, BPF_REG_0);
+			regs[BPF_REG_0].type = PTR_TO_MEM;
+			regs[BPF_REG_0].mem_size = meta.r0_size;
+
+			if (meta.r0_rdonly)
+				regs[BPF_REG_0].type |= MEM_RDONLY;
+
+			/* Ensures we don't access the memory after a release_reference() */
+			if (meta.ref_obj_id)
+				regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+		} else {
+			mark_reg_known_zero(env, regs, BPF_REG_0);
+			regs[BPF_REG_0].btf = desc_btf;
+			regs[BPF_REG_0].type = PTR_TO_BTF_ID;
+			regs[BPF_REG_0].btf_id = ptr_type_id;
 		}
-		mark_reg_known_zero(env, regs, BPF_REG_0);
-		regs[BPF_REG_0].btf = desc_btf;
-		regs[BPF_REG_0].type = PTR_TO_BTF_ID;
-		regs[BPF_REG_0].btf_id = ptr_type_id;
 		if (*kfunc_flags & KF_RET_NULL) {
 			regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
 			/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
-- 
cgit v1.2.3


From 22ed8d5a46520ef0f060e7c0ee91f1cc6f684400 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Date: Tue, 6 Sep 2022 17:13:03 +0200
Subject: selftests/bpf: Add tests for kfunc returning a memory pointer

We add 2 new kfuncs that are following the RET_PTR_TO_MEM
capability from the previous commit.
Then we test them in selftests:
the first tests are testing valid case, and are not failing,
and the later ones are actually preventing the program to be loaded
because they are wrong.

To work around that, we mark the failing ones as not autoloaded
(with SEC("?tc")), and we manually enable them one by one, ensuring
the verifier rejects them.

Signed-off-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Link: https://lore.kernel.org/r/20220906151303.2780789-8-benjamin.tissoires@redhat.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/bpf/test_run.c                                 |  36 ++++++
 .../testing/selftests/bpf/prog_tests/kfunc_call.c  |   7 ++
 .../testing/selftests/bpf/progs/kfunc_call_fail.c  | 121 +++++++++++++++++++++
 .../testing/selftests/bpf/progs/kfunc_call_test.c  |  33 ++++++
 4 files changed, 197 insertions(+)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f16baf977a21..13d578ce2a09 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -606,6 +606,38 @@ noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p)
 	WARN_ON_ONCE(1);
 }
 
+static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const int size)
+{
+	if (size > 2 * sizeof(int))
+		return NULL;
+
+	return (int *)p;
+}
+
+noinline int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size)
+{
+	return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size);
+}
+
+noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size)
+{
+	return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
+}
+
+/* the next 2 ones can't be really used for testing expect to ensure
+ * that the verifier rejects the call.
+ * Acquire functions must return struct pointers, so these ones are
+ * failing.
+ */
+noinline int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size)
+{
+	return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
+}
+
+noinline void bpf_kfunc_call_int_mem_release(int *p)
+{
+}
+
 noinline struct prog_test_ref_kfunc *
 bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b)
 {
@@ -712,6 +744,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdwr_mem, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdonly_mem, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_acq_rdonly_mem, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_kfunc_call_int_mem_release, KF_RELEASE)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_kptr_get, KF_ACQUIRE | KF_RET_NULL | KF_KPTR_GET)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx)
 BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1)
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
index d5881c3331a8..5af1ee8f0e6e 100644
--- a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c
@@ -50,6 +50,7 @@ struct kfunc_test_params {
 #define SYSCALL_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_test)
 #define SYSCALL_NULL_CTX_TEST(name, retval) __BPF_TEST_SUCCESS(name, retval, syscall_null_ctx_test)
 
+#define TC_FAIL(name, retval, error_msg) __BPF_TEST_FAIL(name, retval, tc_test, error_msg)
 #define SYSCALL_NULL_CTX_FAIL(name, retval, error_msg) \
 	__BPF_TEST_FAIL(name, retval, syscall_null_ctx_test, error_msg)
 
@@ -62,11 +63,17 @@ static struct kfunc_test_params kfunc_tests[] = {
 	 */
 	SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_fail, -EINVAL, "processed 4 insns"),
 	SYSCALL_NULL_CTX_FAIL(kfunc_syscall_test_null_fail, -EINVAL, "processed 4 insns"),
+	TC_FAIL(kfunc_call_test_get_mem_fail_rdonly, 0, "R0 cannot write into rdonly_mem"),
+	TC_FAIL(kfunc_call_test_get_mem_fail_use_after_free, 0, "invalid mem access 'scalar'"),
+	TC_FAIL(kfunc_call_test_get_mem_fail_oob, 0, "min value is outside of the allowed memory range"),
+	TC_FAIL(kfunc_call_test_get_mem_fail_not_const, 0, "is not a const"),
+	TC_FAIL(kfunc_call_test_mem_acquire_fail, 0, "acquire kernel function does not return PTR_TO_BTF_ID"),
 
 	/* success cases */
 	TC_TEST(kfunc_call_test1, 12),
 	TC_TEST(kfunc_call_test2, 3),
 	TC_TEST(kfunc_call_test_ref_btf_id, 0),
+	TC_TEST(kfunc_call_test_get_mem, 42),
 	SYSCALL_TEST(kfunc_syscall_test, 0),
 	SYSCALL_NULL_CTX_TEST(kfunc_syscall_test_null, 0),
 };
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
index 4168027f2ab1..b98313d391c6 100644
--- a/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_fail.c
@@ -3,7 +3,13 @@
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
 
+extern struct prog_test_ref_kfunc *bpf_kfunc_call_test_acquire(unsigned long *sp) __ksym;
+extern void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) __ksym;
 extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym;
+extern int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym;
+extern int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym;
+extern int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym;
+extern void bpf_kfunc_call_int_mem_release(int *p) __ksym;
 
 struct syscall_test_args {
 	__u8 data[16];
@@ -36,4 +42,119 @@ int kfunc_syscall_test_null_fail(struct syscall_test_args *args)
 	return 0;
 }
 
+SEC("?tc")
+int kfunc_call_test_get_mem_fail_rdonly(struct __sk_buff *skb)
+{
+	struct prog_test_ref_kfunc *pt;
+	unsigned long s = 0;
+	int *p = NULL;
+	int ret = 0;
+
+	pt = bpf_kfunc_call_test_acquire(&s);
+	if (pt) {
+		p = bpf_kfunc_call_test_get_rdonly_mem(pt, 2 * sizeof(int));
+		if (p)
+			p[0] = 42; /* this is a read-only buffer, so -EACCES */
+		else
+			ret = -1;
+
+		bpf_kfunc_call_test_release(pt);
+	}
+	return ret;
+}
+
+SEC("?tc")
+int kfunc_call_test_get_mem_fail_use_after_free(struct __sk_buff *skb)
+{
+	struct prog_test_ref_kfunc *pt;
+	unsigned long s = 0;
+	int *p = NULL;
+	int ret = 0;
+
+	pt = bpf_kfunc_call_test_acquire(&s);
+	if (pt) {
+		p = bpf_kfunc_call_test_get_rdwr_mem(pt, 2 * sizeof(int));
+		if (p) {
+			p[0] = 42;
+			ret = p[1]; /* 108 */
+		} else {
+			ret = -1;
+		}
+
+		bpf_kfunc_call_test_release(pt);
+	}
+	if (p)
+		ret = p[0]; /* p is not valid anymore */
+
+	return ret;
+}
+
+SEC("?tc")
+int kfunc_call_test_get_mem_fail_oob(struct __sk_buff *skb)
+{
+	struct prog_test_ref_kfunc *pt;
+	unsigned long s = 0;
+	int *p = NULL;
+	int ret = 0;
+
+	pt = bpf_kfunc_call_test_acquire(&s);
+	if (pt) {
+		p = bpf_kfunc_call_test_get_rdonly_mem(pt, 2 * sizeof(int));
+		if (p)
+			ret = p[2 * sizeof(int)]; /* oob access, so -EACCES */
+		else
+			ret = -1;
+
+		bpf_kfunc_call_test_release(pt);
+	}
+	return ret;
+}
+
+int not_const_size = 2 * sizeof(int);
+
+SEC("?tc")
+int kfunc_call_test_get_mem_fail_not_const(struct __sk_buff *skb)
+{
+	struct prog_test_ref_kfunc *pt;
+	unsigned long s = 0;
+	int *p = NULL;
+	int ret = 0;
+
+	pt = bpf_kfunc_call_test_acquire(&s);
+	if (pt) {
+		p = bpf_kfunc_call_test_get_rdonly_mem(pt, not_const_size); /* non const size, -EINVAL */
+		if (p)
+			ret = p[0];
+		else
+			ret = -1;
+
+		bpf_kfunc_call_test_release(pt);
+	}
+	return ret;
+}
+
+SEC("?tc")
+int kfunc_call_test_mem_acquire_fail(struct __sk_buff *skb)
+{
+	struct prog_test_ref_kfunc *pt;
+	unsigned long s = 0;
+	int *p = NULL;
+	int ret = 0;
+
+	pt = bpf_kfunc_call_test_acquire(&s);
+	if (pt) {
+		/* we are failing on this one, because we are not acquiring a PTR_TO_BTF_ID (a struct ptr) */
+		p = bpf_kfunc_call_test_acq_rdonly_mem(pt, 2 * sizeof(int));
+		if (p)
+			ret = p[0];
+		else
+			ret = -1;
+
+		bpf_kfunc_call_int_mem_release(p);
+
+		bpf_kfunc_call_test_release(pt);
+	}
+	return ret;
+}
+
 char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
index 9e1914916f1d..f636e50be259 100644
--- a/tools/testing/selftests/bpf/progs/kfunc_call_test.c
+++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c
@@ -14,6 +14,8 @@ extern void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p) __ksym;
 extern void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p) __ksym;
 extern void bpf_kfunc_call_test_mem_len_pass1(void *mem, int len) __ksym;
 extern void bpf_kfunc_call_test_mem_len_fail2(__u64 *mem, int len) __ksym;
+extern int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) __ksym;
+extern int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) __ksym;
 
 SEC("tc")
 int kfunc_call_test2(struct __sk_buff *skb)
@@ -130,4 +132,35 @@ int kfunc_syscall_test_null(struct syscall_test_args *args)
 	return 0;
 }
 
+SEC("tc")
+int kfunc_call_test_get_mem(struct __sk_buff *skb)
+{
+	struct prog_test_ref_kfunc *pt;
+	unsigned long s = 0;
+	int *p = NULL;
+	int ret = 0;
+
+	pt = bpf_kfunc_call_test_acquire(&s);
+	if (pt) {
+		p = bpf_kfunc_call_test_get_rdwr_mem(pt, 2 * sizeof(int));
+		if (p) {
+			p[0] = 42;
+			ret = p[1]; /* 108 */
+		} else {
+			ret = -1;
+		}
+
+		if (ret >= 0) {
+			p = bpf_kfunc_call_test_get_rdonly_mem(pt, 2 * sizeof(int));
+			if (p)
+				ret = p[0]; /* 42 */
+			else
+				ret = -1;
+		}
+
+		bpf_kfunc_call_test_release(pt);
+	}
+	return ret;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From 9fad7fe5b29803584c7f17a2abe6c2936fec6828 Mon Sep 17 00:00:00 2001
From: Jules Irenge <jbi.octave@gmail.com>
Date: Wed, 7 Sep 2022 16:24:20 +0100
Subject: bpf: Fix resetting logic for unreferenced kptrs

Sparse reported a warning at bpf_map_free_kptrs()
"warning: Using plain integer as NULL pointer"
During the process of fixing this warning, it was discovered that the current
code erroneously writes to the pointer variable instead of deferencing and
writing to the actual kptr. Hence, Sparse tool accidentally helped to uncover
this problem. Fix this by doing WRITE_ONCE(*p, 0) instead of WRITE_ONCE(p, 0).

Note that the effect of this bug is that unreferenced kptrs will not be cleared
during check_and_free_fields. It is not a problem if the clearing is not done
during map_free stage, as there is nothing to free for them.

Fixes: 14a324f6a67e ("bpf: Wire up freeing of referenced kptr")
Signed-off-by: Jules Irenge <jbi.octave@gmail.com>
Link: https://lore.kernel.org/r/Yxi3pJaK6UDjVJSy@playground
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4fb08c43420d..d35a6aa3aa96 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -598,7 +598,7 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
 		if (off_desc->type == BPF_KPTR_UNREF) {
 			u64 *p = (u64 *)btf_id_ptr;
 
-			WRITE_ONCE(p, 0);
+			WRITE_ONCE(*p, 0);
 			continue;
 		}
 		old_ptr = xchg(btf_id_ptr, 0);
-- 
cgit v1.2.3


From 448325199f574d33824dbf9121efb03558412966 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 4 Sep 2022 22:41:14 +0200
Subject: bpf: Add copy_map_value_long to copy to remote percpu memory

bpf_long_memcpy is used while copying to remote percpu regions from BPF
syscall and helpers, so that the copy is atomic at word size
granularity.

This might not be possible when you copy from map value hosting kptrs
from or to percpu maps, as the alignment or size in disjoint regions may
not be multiple of word size.

Hence, to avoid complicating the copy loop, we only use bpf_long_memcpy
when special fields are not present, otherwise use normal memcpy to copy
the disjoint regions.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220904204145.3089-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 52 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 19 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 79883f883ff3..6a73e94821c4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -280,14 +280,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
 	}
 }
 
-/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
-static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
+ * forced to use 'long' read/writes to try to atomically copy long counters.
+ * Best-effort only.  No barriers here, since it _will_ race with concurrent
+ * updates from BPF programs. Called from bpf syscall and mostly used with
+ * size 8 or 16 bytes, so ask compiler to inline it.
+ */
+static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
+{
+	const long *lsrc = src;
+	long *ldst = dst;
+
+	size /= sizeof(long);
+	while (size--)
+		*ldst++ = *lsrc++;
+}
+
+/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */
+static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy)
 {
 	u32 curr_off = 0;
 	int i;
 
 	if (likely(!map->off_arr)) {
-		memcpy(dst, src, map->value_size);
+		if (long_memcpy)
+			bpf_long_memcpy(dst, src, round_up(map->value_size, 8));
+		else
+			memcpy(dst, src, map->value_size);
 		return;
 	}
 
@@ -299,6 +318,17 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 	}
 	memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off);
 }
+
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+	__copy_map_value(map, dst, src, false);
+}
+
+static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src)
+{
+	__copy_map_value(map, dst, src, true);
+}
+
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 			   bool lock_src);
 void bpf_timer_cancel_and_free(void *timer);
@@ -1827,22 +1857,6 @@ int bpf_get_file_flag(int flags);
 int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size,
 			     size_t actual_size);
 
-/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
- * forced to use 'long' read/writes to try to atomically copy long counters.
- * Best-effort only.  No barriers here, since it _will_ race with concurrent
- * updates from BPF programs. Called from bpf syscall and mostly used with
- * size 8 or 16 bytes, so ask compiler to inline it.
- */
-static inline void bpf_long_memcpy(void *dst, const void *src, u32 size)
-{
-	const long *lsrc = src;
-	long *ldst = dst;
-
-	size /= sizeof(long);
-	while (size--)
-		*ldst++ = *lsrc++;
-}
-
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr);
 
-- 
cgit v1.2.3


From 6df4ea1ff0ff70798ff1e7eed79f98ccb7b5b0a2 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 4 Sep 2022 22:41:15 +0200
Subject: bpf: Support kptrs in percpu arraymap

Enable support for kptrs in percpu BPF arraymap by wiring up the freeing
of these kptrs from percpu map elements.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220904204145.3089-3-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/arraymap.c | 33 ++++++++++++++++++++++++---------
 kernel/bpf/syscall.c  |  3 ++-
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 624527401d4d..832b2659e96e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -279,7 +279,8 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
 	rcu_read_lock();
 	pptr = array->pptrs[index & array->index_mask];
 	for_each_possible_cpu(cpu) {
-		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+		check_and_init_map_value(map, value + off);
 		off += size;
 	}
 	rcu_read_unlock();
@@ -338,8 +339,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		return -EINVAL;
 
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
-		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
-		       value, map->value_size);
+		val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
+		copy_map_value(map, val, value);
+		check_and_free_fields(array, val);
 	} else {
 		val = array->value +
 			(u64)array->elem_size * (index & array->index_mask);
@@ -383,7 +385,8 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
 	rcu_read_lock();
 	pptr = array->pptrs[index & array->index_mask];
 	for_each_possible_cpu(cpu) {
-		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+		copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
+		check_and_free_fields(array, per_cpu_ptr(pptr, cpu));
 		off += size;
 	}
 	rcu_read_unlock();
@@ -421,8 +424,20 @@ static void array_map_free(struct bpf_map *map)
 	int i;
 
 	if (map_value_has_kptrs(map)) {
-		for (i = 0; i < array->map.max_entries; i++)
-			bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+		if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+			for (i = 0; i < array->map.max_entries; i++) {
+				void __percpu *pptr = array->pptrs[i & array->index_mask];
+				int cpu;
+
+				for_each_possible_cpu(cpu) {
+					bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu));
+					cond_resched();
+				}
+			}
+		} else {
+			for (i = 0; i < array->map.max_entries; i++)
+				bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+		}
 		bpf_map_free_kptr_off_tab(map);
 	}
 
@@ -608,9 +623,9 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
 			pptr = v;
 			size = array->elem_size;
 			for_each_possible_cpu(cpu) {
-				bpf_long_memcpy(info->percpu_value_buf + off,
-						per_cpu_ptr(pptr, cpu),
-						size);
+				copy_map_value_long(map, info->percpu_value_buf + off,
+						    per_cpu_ptr(pptr, cpu));
+				check_and_init_map_value(map, info->percpu_value_buf + off);
 				off += size;
 			}
 			ctx.value = info->percpu_value_buf;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d35a6aa3aa96..69be1c612daa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1049,7 +1049,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 		}
 		if (map->map_type != BPF_MAP_TYPE_HASH &&
 		    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
-		    map->map_type != BPF_MAP_TYPE_ARRAY) {
+		    map->map_type != BPF_MAP_TYPE_ARRAY &&
+		    map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
 			ret = -EOPNOTSUPP;
 			goto free_map_tab;
 		}
-- 
cgit v1.2.3


From cc48755808c646666436745b35629c3f0d05e165 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 4 Sep 2022 22:41:16 +0200
Subject: bpf: Add zero_map_value to zero map value with special fields

We need this helper to skip over special fields (bpf_spin_lock,
bpf_timer, kptrs) while zeroing a map value. Use the same logic as
copy_map_value but memset instead of memcpy.

Currently, the code zeroing map value memory does not have to deal with
special fields, hence this is a prerequisite for introducing such
support.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220904204145.3089-4-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6a73e94821c4..48ae05099f36 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -329,6 +329,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src
 	__copy_map_value(map, dst, src, true);
 }
 
+static inline void zero_map_value(struct bpf_map *map, void *dst)
+{
+	u32 curr_off = 0;
+	int i;
+
+	if (likely(!map->off_arr)) {
+		memset(dst, 0, map->value_size);
+		return;
+	}
+
+	for (i = 0; i < map->off_arr->cnt; i++) {
+		u32 next_off = map->off_arr->field_off[i];
+
+		memset(dst + curr_off, 0, next_off - curr_off);
+		curr_off += map->off_arr->field_sz[i];
+	}
+	memset(dst + curr_off, 0, map->value_size - curr_off);
+}
+
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 			   bool lock_src);
 void bpf_timer_cancel_and_free(void *timer);
-- 
cgit v1.2.3


From d219d2a9a92e39aa92799efe8f2aa21259b6dd82 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 29 Aug 2022 13:37:17 -0700
Subject: overflow: Allow mixed type arguments

When the check_[op]_overflow() helpers were introduced, all arguments
were required to be the same type to make the fallback macros simpler.
However, now that the fallback macros have been removed[1], it is fine
to allow mixed types, which makes using the helpers much more useful,
as they can be used to test for type-based overflows (e.g. adding two
large ints but storing into a u8), as would be handy in the drm core[2].

Remove the restriction, and add additional self-tests that exercise
some of the mixed-type overflow cases, and double-check for accidental
macro side-effects.

[1] https://git.kernel.org/linus/4eb6bd55cfb22ffc20652732340c4962f3ac9a91
[2] https://lore.kernel.org/lkml/20220824084514.2261614-2-gwan-gyeong.mun@intel.com

Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: linux-hardening@vger.kernel.org
Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>
Reviewed-by: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Tested-by: Gwan-gyeong Mun <gwan-gyeong.mun@intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/overflow.h |  72 ++++++++++++++++++---------------
 lib/overflow_kunit.c     | 101 +++++++++++++++++++++++++++++++++--------------
 2 files changed, 113 insertions(+), 60 deletions(-)

diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 0eb3b192f07a..19dfdd74835e 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -51,40 +51,50 @@ static inline bool __must_check __must_check_overflow(bool overflow)
 	return unlikely(overflow);
 }
 
-/*
- * For simplicity and code hygiene, the fallback code below insists on
- * a, b and *d having the same type (similar to the min() and max()
- * macros), whereas gcc's type-generic overflow checkers accept
- * different types. Hence we don't just make check_add_overflow an
- * alias for __builtin_add_overflow, but add type checks similar to
- * below.
+/** check_add_overflow() - Calculate addition with overflow checking
+ *
+ * @a: first addend
+ * @b: second addend
+ * @d: pointer to store sum
+ *
+ * Returns 0 on success.
+ *
+ * *@d holds the results of the attempted addition, but is not considered
+ * "safe for use" on a non-zero return value, which indicates that the
+ * sum has overflowed or been truncated.
  */
-#define check_add_overflow(a, b, d) __must_check_overflow(({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	__builtin_add_overflow(__a, __b, __d);	\
-}))
+#define check_add_overflow(a, b, d)	\
+	__must_check_overflow(__builtin_add_overflow(a, b, d))
 
-#define check_sub_overflow(a, b, d) __must_check_overflow(({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	__builtin_sub_overflow(__a, __b, __d);	\
-}))
+/** check_sub_overflow() - Calculate subtraction with overflow checking
+ *
+ * @a: minuend; value to subtract from
+ * @b: subtrahend; value to subtract from @a
+ * @d: pointer to store difference
+ *
+ * Returns 0 on success.
+ *
+ * *@d holds the results of the attempted subtraction, but is not considered
+ * "safe for use" on a non-zero return value, which indicates that the
+ * difference has underflowed or been truncated.
+ */
+#define check_sub_overflow(a, b, d)	\
+	__must_check_overflow(__builtin_sub_overflow(a, b, d))
 
-#define check_mul_overflow(a, b, d) __must_check_overflow(({	\
-	typeof(a) __a = (a);			\
-	typeof(b) __b = (b);			\
-	typeof(d) __d = (d);			\
-	(void) (&__a == &__b);			\
-	(void) (&__a == __d);			\
-	__builtin_mul_overflow(__a, __b, __d);	\
-}))
+/** check_mul_overflow() - Calculate multiplication with overflow checking
+ *
+ * @a: first factor
+ * @b: second factor
+ * @d: pointer to store product
+ *
+ * Returns 0 on success.
+ *
+ * *@d holds the results of the attempted multiplication, but is not
+ * considered "safe for use" on a non-zero return value, which indicates
+ * that the product has overflowed or been truncated.
+ */
+#define check_mul_overflow(a, b, d)	\
+	__must_check_overflow(__builtin_mul_overflow(a, b, d))
 
 /** check_shl_overflow() - Calculate a left-shifted value and check overflow
  *
diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c
index 7e3e43679b73..0d98c9bc75da 100644
--- a/lib/overflow_kunit.c
+++ b/lib/overflow_kunit.c
@@ -16,12 +16,15 @@
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
-#define DEFINE_TEST_ARRAY(t)			\
-	static const struct test_ ## t {	\
-		t a, b;				\
-		t sum, diff, prod;		\
-		bool s_of, d_of, p_of;		\
-	} t ## _tests[]
+#define DEFINE_TEST_ARRAY_TYPED(t1, t2, t)			\
+	static const struct test_ ## t1 ## _ ## t2 ## __ ## t {	\
+		t1 a;						\
+		t2 b;						\
+		t sum, diff, prod;				\
+		bool s_of, d_of, p_of;				\
+	} t1 ## _ ## t2 ## __ ## t ## _tests[]
+
+#define DEFINE_TEST_ARRAY(t)	DEFINE_TEST_ARRAY_TYPED(t, t, t)
 
 DEFINE_TEST_ARRAY(u8) = {
 	{0, 0, 0, 0, 0, false, false, false},
@@ -222,21 +225,27 @@ DEFINE_TEST_ARRAY(s64) = {
 };
 #endif
 
-#define check_one_op(t, fmt, op, sym, a, b, r, of) do {		\
-	t _r;							\
-	bool _of;						\
-								\
-	_of = check_ ## op ## _overflow(a, b, &_r);		\
-	KUNIT_EXPECT_EQ_MSG(test, _of, of,			\
+#define check_one_op(t, fmt, op, sym, a, b, r, of) do {			\
+	int _a_orig = a, _a_bump = a + 1;				\
+	int _b_orig = b, _b_bump = b + 1;				\
+	bool _of;							\
+	t _r;								\
+									\
+	_of = check_ ## op ## _overflow(a, b, &_r);			\
+	KUNIT_EXPECT_EQ_MSG(test, _of, of,				\
 		"expected "fmt" "sym" "fmt" to%s overflow (type %s)\n",	\
-		a, b, of ? "" : " not", #t);			\
-	KUNIT_EXPECT_EQ_MSG(test, _r, r,			\
+		a, b, of ? "" : " not", #t);				\
+	KUNIT_EXPECT_EQ_MSG(test, _r, r,				\
 		"expected "fmt" "sym" "fmt" == "fmt", got "fmt" (type %s)\n", \
-		a, b, r, _r, #t);				\
+		a, b, r, _r, #t);					\
+	/* Check for internal macro side-effects. */			\
+	_of = check_ ## op ## _overflow(_a_orig++, _b_orig++, &_r);	\
+	KUNIT_EXPECT_EQ_MSG(test, _a_orig, _a_bump, "Unexpected " #op " macro side-effect!\n"); \
+	KUNIT_EXPECT_EQ_MSG(test, _b_orig, _b_bump, "Unexpected " #op " macro side-effect!\n"); \
 } while (0)
 
-#define DEFINE_TEST_FUNC(t, fmt)					\
-static void do_test_ ## t(struct kunit *test, const struct test_ ## t *p) \
+#define DEFINE_TEST_FUNC_TYPED(n, t, fmt)				\
+static void do_test_ ## n(struct kunit *test, const struct test_ ## n *p) \
 {							   		\
 	check_one_op(t, fmt, add, "+", p->a, p->b, p->sum, p->s_of);	\
 	check_one_op(t, fmt, add, "+", p->b, p->a, p->sum, p->s_of);	\
@@ -245,15 +254,18 @@ static void do_test_ ## t(struct kunit *test, const struct test_ ## t *p) \
 	check_one_op(t, fmt, mul, "*", p->b, p->a, p->prod, p->p_of);	\
 }									\
 									\
-static void t ## _overflow_test(struct kunit *test) {			\
+static void n ## _overflow_test(struct kunit *test) {			\
 	unsigned i;							\
 									\
-	for (i = 0; i < ARRAY_SIZE(t ## _tests); ++i)			\
-		do_test_ ## t(test, &t ## _tests[i]);			\
+	for (i = 0; i < ARRAY_SIZE(n ## _tests); ++i)			\
+		do_test_ ## n(test, &n ## _tests[i]);			\
 	kunit_info(test, "%zu %s arithmetic tests finished\n",		\
-		ARRAY_SIZE(t ## _tests), #t);				\
+		ARRAY_SIZE(n ## _tests), #n);				\
 }
 
+#define DEFINE_TEST_FUNC(t, fmt)					\
+	DEFINE_TEST_FUNC_TYPED(t ## _ ## t ## __ ## t, t, fmt)
+
 DEFINE_TEST_FUNC(u8, "%d");
 DEFINE_TEST_FUNC(s8, "%d");
 DEFINE_TEST_FUNC(u16, "%d");
@@ -265,6 +277,33 @@ DEFINE_TEST_FUNC(u64, "%llu");
 DEFINE_TEST_FUNC(s64, "%lld");
 #endif
 
+DEFINE_TEST_ARRAY_TYPED(u32, u32, u8) = {
+	{0, 0, 0, 0, 0, false, false, false},
+	{U8_MAX, 2, 1, U8_MAX - 2, U8_MAX - 1, true, false, true},
+	{U8_MAX + 1, 0, 0, 0, 0, true, true, false},
+};
+DEFINE_TEST_FUNC_TYPED(u32_u32__u8, u8, "%d");
+
+DEFINE_TEST_ARRAY_TYPED(u32, u32, int) = {
+	{0, 0, 0, 0, 0, false, false, false},
+	{U32_MAX, 0, -1, -1, 0, true, true, false},
+};
+DEFINE_TEST_FUNC_TYPED(u32_u32__int, int, "%d");
+
+DEFINE_TEST_ARRAY_TYPED(u8, u8, int) = {
+	{0, 0, 0, 0, 0, false, false, false},
+	{U8_MAX, U8_MAX, 2 * U8_MAX, 0, U8_MAX * U8_MAX, false, false, false},
+	{1, 2, 3, -1, 2, false, false, false},
+};
+DEFINE_TEST_FUNC_TYPED(u8_u8__int, int, "%d");
+
+DEFINE_TEST_ARRAY_TYPED(int, int, u8) = {
+	{0, 0, 0, 0, 0, false, false, false},
+	{1, 2, 3, U8_MAX, 2, false, true, false},
+	{-1, 0, U8_MAX, U8_MAX, 0, true, true, false},
+};
+DEFINE_TEST_FUNC_TYPED(int_int__u8, u8, "%d");
+
 static void overflow_shift_test(struct kunit *test)
 {
 	int count = 0;
@@ -649,17 +688,21 @@ static void overflow_size_helpers_test(struct kunit *test)
 }
 
 static struct kunit_case overflow_test_cases[] = {
-	KUNIT_CASE(u8_overflow_test),
-	KUNIT_CASE(s8_overflow_test),
-	KUNIT_CASE(u16_overflow_test),
-	KUNIT_CASE(s16_overflow_test),
-	KUNIT_CASE(u32_overflow_test),
-	KUNIT_CASE(s32_overflow_test),
+	KUNIT_CASE(u8_u8__u8_overflow_test),
+	KUNIT_CASE(s8_s8__s8_overflow_test),
+	KUNIT_CASE(u16_u16__u16_overflow_test),
+	KUNIT_CASE(s16_s16__s16_overflow_test),
+	KUNIT_CASE(u32_u32__u32_overflow_test),
+	KUNIT_CASE(s32_s32__s32_overflow_test),
 /* Clang 13 and earlier generate unwanted libcalls on 32-bit. */
 #if BITS_PER_LONG == 64
-	KUNIT_CASE(u64_overflow_test),
-	KUNIT_CASE(s64_overflow_test),
+	KUNIT_CASE(u64_u64__u64_overflow_test),
+	KUNIT_CASE(s64_s64__s64_overflow_test),
 #endif
+	KUNIT_CASE(u32_u32__u8_overflow_test),
+	KUNIT_CASE(u32_u32__int_overflow_test),
+	KUNIT_CASE(u8_u8__int_overflow_test),
+	KUNIT_CASE(int_int__u8_overflow_test),
 	KUNIT_CASE(overflow_shift_test),
 	KUNIT_CASE(overflow_allocation_test),
 	KUNIT_CASE(overflow_size_helpers_test),
-- 
cgit v1.2.3


From 779742255cb464e9e833fed2a8d352eb12936dae Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 31 Aug 2022 11:09:13 -0700
Subject: overflow: Split up kunit tests for smaller stack frames

Under some pathological 32-bit configs, the shift overflow KUnit tests
create huge stack frames. Split up the function to avoid this,
separating by rough shift overflow cases.

Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Daniel Latypov <dlatypov@google.com>
Cc: Vitor Massaru Iha <vitor@massaru.org>
Cc: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/lkml/202208301850.iuv9VwA8-lkp@intel.com
Acked-by: Daniel Latypov <dlatypov@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 lib/overflow_kunit.c | 78 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 51 insertions(+), 27 deletions(-)

diff --git a/lib/overflow_kunit.c b/lib/overflow_kunit.c
index 0d98c9bc75da..f385ca652b74 100644
--- a/lib/overflow_kunit.c
+++ b/lib/overflow_kunit.c
@@ -304,10 +304,6 @@ DEFINE_TEST_ARRAY_TYPED(int, int, u8) = {
 };
 DEFINE_TEST_FUNC_TYPED(int_int__u8, u8, "%d");
 
-static void overflow_shift_test(struct kunit *test)
-{
-	int count = 0;
-
 /* Args are: value, shift, type, expected result, overflow expected */
 #define TEST_ONE_SHIFT(a, s, t, expect, of)	do {			\
 	typeof(a) __a = (a);						\
@@ -331,6 +327,10 @@ static void overflow_shift_test(struct kunit *test)
 	count++;							\
 } while (0)
 
+static void shift_sane_test(struct kunit *test)
+{
+	int count = 0;
+
 	/* Sane shifts. */
 	TEST_ONE_SHIFT(1, 0, u8, 1 << 0, false);
 	TEST_ONE_SHIFT(1, 4, u8, 1 << 4, false);
@@ -373,6 +373,13 @@ static void overflow_shift_test(struct kunit *test)
 	TEST_ONE_SHIFT(0, 30, s32, 0, false);
 	TEST_ONE_SHIFT(0, 62, s64, 0, false);
 
+	kunit_info(test, "%d sane shift tests finished\n", count);
+}
+
+static void shift_overflow_test(struct kunit *test)
+{
+	int count = 0;
+
 	/* Overflow: shifted the bit off the end. */
 	TEST_ONE_SHIFT(1, 8, u8, 0, true);
 	TEST_ONE_SHIFT(1, 16, u16, 0, true);
@@ -420,6 +427,13 @@ static void overflow_shift_test(struct kunit *test)
 	/* 0100000100001000001000000010000001000010000001000100010001001011 */
 	TEST_ONE_SHIFT(4686030735197619275LL, 2, s64, 0, true);
 
+	kunit_info(test, "%d overflow shift tests finished\n", count);
+}
+
+static void shift_truncate_test(struct kunit *test)
+{
+	int count = 0;
+
 	/* Overflow: values larger than destination type. */
 	TEST_ONE_SHIFT(0x100, 0, u8, 0, true);
 	TEST_ONE_SHIFT(0xFF, 0, s8, 0, true);
@@ -431,6 +445,33 @@ static void overflow_shift_test(struct kunit *test)
 	TEST_ONE_SHIFT(0xFFFFFFFFUL, 0, int, 0, true);
 	TEST_ONE_SHIFT(0xFFFFFFFFFFFFFFFFULL, 0, s64, 0, true);
 
+	/* Overflow: shifted at or beyond entire type's bit width. */
+	TEST_ONE_SHIFT(0, 8, u8, 0, true);
+	TEST_ONE_SHIFT(0, 9, u8, 0, true);
+	TEST_ONE_SHIFT(0, 8, s8, 0, true);
+	TEST_ONE_SHIFT(0, 9, s8, 0, true);
+	TEST_ONE_SHIFT(0, 16, u16, 0, true);
+	TEST_ONE_SHIFT(0, 17, u16, 0, true);
+	TEST_ONE_SHIFT(0, 16, s16, 0, true);
+	TEST_ONE_SHIFT(0, 17, s16, 0, true);
+	TEST_ONE_SHIFT(0, 32, u32, 0, true);
+	TEST_ONE_SHIFT(0, 33, u32, 0, true);
+	TEST_ONE_SHIFT(0, 32, int, 0, true);
+	TEST_ONE_SHIFT(0, 33, int, 0, true);
+	TEST_ONE_SHIFT(0, 32, s32, 0, true);
+	TEST_ONE_SHIFT(0, 33, s32, 0, true);
+	TEST_ONE_SHIFT(0, 64, u64, 0, true);
+	TEST_ONE_SHIFT(0, 65, u64, 0, true);
+	TEST_ONE_SHIFT(0, 64, s64, 0, true);
+	TEST_ONE_SHIFT(0, 65, s64, 0, true);
+
+	kunit_info(test, "%d truncate shift tests finished\n", count);
+}
+
+static void shift_nonsense_test(struct kunit *test)
+{
+	int count = 0;
+
 	/* Nonsense: negative initial value. */
 	TEST_ONE_SHIFT(-1, 0, s8, 0, true);
 	TEST_ONE_SHIFT(-1, 0, u8, 0, true);
@@ -455,26 +496,6 @@ static void overflow_shift_test(struct kunit *test)
 	TEST_ONE_SHIFT(0, -30, s64, 0, true);
 	TEST_ONE_SHIFT(0, -30, u64, 0, true);
 
-	/* Overflow: shifted at or beyond entire type's bit width. */
-	TEST_ONE_SHIFT(0, 8, u8, 0, true);
-	TEST_ONE_SHIFT(0, 9, u8, 0, true);
-	TEST_ONE_SHIFT(0, 8, s8, 0, true);
-	TEST_ONE_SHIFT(0, 9, s8, 0, true);
-	TEST_ONE_SHIFT(0, 16, u16, 0, true);
-	TEST_ONE_SHIFT(0, 17, u16, 0, true);
-	TEST_ONE_SHIFT(0, 16, s16, 0, true);
-	TEST_ONE_SHIFT(0, 17, s16, 0, true);
-	TEST_ONE_SHIFT(0, 32, u32, 0, true);
-	TEST_ONE_SHIFT(0, 33, u32, 0, true);
-	TEST_ONE_SHIFT(0, 32, int, 0, true);
-	TEST_ONE_SHIFT(0, 33, int, 0, true);
-	TEST_ONE_SHIFT(0, 32, s32, 0, true);
-	TEST_ONE_SHIFT(0, 33, s32, 0, true);
-	TEST_ONE_SHIFT(0, 64, u64, 0, true);
-	TEST_ONE_SHIFT(0, 65, u64, 0, true);
-	TEST_ONE_SHIFT(0, 64, s64, 0, true);
-	TEST_ONE_SHIFT(0, 65, s64, 0, true);
-
 	/*
 	 * Corner case: for unsigned types, we fail when we've shifted
 	 * through the entire width of bits. For signed types, we might
@@ -490,9 +511,9 @@ static void overflow_shift_test(struct kunit *test)
 	TEST_ONE_SHIFT(0, 31, s32, 0, false);
 	TEST_ONE_SHIFT(0, 63, s64, 0, false);
 
-	kunit_info(test, "%d shift tests finished\n", count);
-#undef TEST_ONE_SHIFT
+	kunit_info(test, "%d nonsense shift tests finished\n", count);
 }
+#undef TEST_ONE_SHIFT
 
 /*
  * Deal with the various forms of allocator arguments. See comments above
@@ -703,7 +724,10 @@ static struct kunit_case overflow_test_cases[] = {
 	KUNIT_CASE(u32_u32__int_overflow_test),
 	KUNIT_CASE(u8_u8__int_overflow_test),
 	KUNIT_CASE(int_int__u8_overflow_test),
-	KUNIT_CASE(overflow_shift_test),
+	KUNIT_CASE(shift_sane_test),
+	KUNIT_CASE(shift_overflow_test),
+	KUNIT_CASE(shift_truncate_test),
+	KUNIT_CASE(shift_nonsense_test),
 	KUNIT_CASE(overflow_allocation_test),
 	KUNIT_CASE(overflow_size_helpers_test),
 	{}
-- 
cgit v1.2.3


From dfbafa70bde26c40615f8c538ce68dac82a64fb4 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 26 Aug 2022 11:04:43 -0700
Subject: string: Introduce strtomem() and strtomem_pad()

One of the "legitimate" uses of strncpy() is copying a NUL-terminated
string into a fixed-size non-NUL-terminated character array. To avoid
the weaknesses and ambiguity of intent when using strncpy(), provide
replacement functions that explicitly distinguish between trailing
padding and not, and require the destination buffer size be discoverable
by the compiler.

For example:

struct obj {
	int foo;
	char small[4] __nonstring;
	char big[8] __nonstring;
	int bar;
};

struct obj p;

/* This will truncate to 4 chars with no trailing NUL */
strncpy(p.small, "hello", sizeof(p.small));
/* p.small contains 'h', 'e', 'l', 'l' */

/* This will NUL pad to 8 chars. */
strncpy(p.big, "hello", sizeof(p.big));
/* p.big contains 'h', 'e', 'l', 'l', 'o', '\0', '\0', '\0' */

When the "__nonstring" attributes are missing, the intent of the
programmer becomes ambiguous for whether the lack of a trailing NUL
in the p.small copy is a bug. Additionally, it's not clear whether
the trailing padding in the p.big copy is _needed_. Both cases
become unambiguous with:

strtomem(p.small, "hello");
strtomem_pad(p.big, "hello", 0);

See also https://github.com/KSPP/linux/issues/90

Expand the memcpy KUnit tests to include these functions.

Cc: Wolfram Sang <wsa+renesas@sang-engineering.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Documentation/process/deprecated.rst | 11 ++++---
 include/linux/fortify-string.h       | 32 +++++++++++++++++++
 include/linux/string.h               | 43 ++++++++++++++++++++++++++
 lib/memcpy_kunit.c                   | 59 +++++++++++++++++++++++++++++++++---
 4 files changed, 137 insertions(+), 8 deletions(-)

diff --git a/Documentation/process/deprecated.rst b/Documentation/process/deprecated.rst
index a6e36d9c3d14..c8fd53a11a20 100644
--- a/Documentation/process/deprecated.rst
+++ b/Documentation/process/deprecated.rst
@@ -138,17 +138,20 @@ be NUL terminated. This can lead to various linear read overflows and
 other misbehavior due to the missing termination. It also NUL-pads
 the destination buffer if the source contents are shorter than the
 destination buffer size, which may be a needless performance penalty
-for callers using only NUL-terminated strings. The safe replacement is
+for callers using only NUL-terminated strings.
+
+When the destination is required to be NUL-terminated, the replacement is
 strscpy(), though care must be given to any cases where the return value
 of strncpy() was used, since strscpy() does not return a pointer to the
 destination, but rather a count of non-NUL bytes copied (or negative
 errno when it truncates). Any cases still needing NUL-padding should
 instead use strscpy_pad().
 
-If a caller is using non-NUL-terminated strings, strncpy() can
-still be used, but destinations should be marked with the `__nonstring
+If a caller is using non-NUL-terminated strings, strtomem() should be
+used, and the destinations should be marked with the `__nonstring
 <https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html>`_
-attribute to avoid future compiler warnings.
+attribute to avoid future compiler warnings. For cases still needing
+NUL-padding, strtomem_pad() can be used.
 
 strlcpy()
 ---------
diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 3b401fa0f374..8e8c2c87b1d5 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -77,6 +77,38 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 #define POS	__pass_object_size(1)
 #define POS0	__pass_object_size(0)
 
+/**
+ * strncpy - Copy a string to memory with non-guaranteed NUL padding
+ *
+ * @p: pointer to destination of copy
+ * @q: pointer to NUL-terminated source string to copy
+ * @size: bytes to write at @p
+ *
+ * If strlen(@q) >= @size, the copy of @q will stop after @size bytes,
+ * and @p will NOT be NUL-terminated
+ *
+ * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes
+ * will be written to @p until @size total bytes have been written.
+ *
+ * Do not use this function. While FORTIFY_SOURCE tries to avoid
+ * over-reads of @q, it cannot defend against writing unterminated
+ * results to @p. Using strncpy() remains ambiguous and fragile.
+ * Instead, please choose an alternative, so that the expectation
+ * of @p's contents is unambiguous:
+ *
+ * +--------------------+-----------------+------------+
+ * | @p needs to be:    | padded to @size | not padded |
+ * +====================+=================+============+
+ * |     NUL-terminated | strscpy_pad()   | strscpy()  |
+ * +--------------------+-----------------+------------+
+ * | not NUL-terminated | strtomem_pad()  | strtomem() |
+ * +--------------------+-----------------+------------+
+ *
+ * Note strscpy*()'s differing return values for detecting truncation,
+ * and strtomem*()'s expectation that the destination is marked with
+ * __nonstring when it is a character array.
+ *
+ */
 __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3)
 char *strncpy(char * const POS p, const char *q, __kernel_size_t size)
 {
diff --git a/include/linux/string.h b/include/linux/string.h
index 61ec7e4f6311..cf7607b32102 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -260,6 +260,49 @@ static inline const char *kbasename(const char *path)
 void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
 		    int pad);
 
+/**
+ * strtomem_pad - Copy NUL-terminated string to non-NUL-terminated buffer
+ *
+ * @dest: Pointer of destination character array (marked as __nonstring)
+ * @src: Pointer to NUL-terminated string
+ * @pad: Padding character to fill any remaining bytes of @dest after copy
+ *
+ * This is a replacement for strncpy() uses where the destination is not
+ * a NUL-terminated string, but with bounds checking on the source size, and
+ * an explicit padding character. If padding is not required, use strtomem().
+ *
+ * Note that the size of @dest is not an argument, as the length of @dest
+ * must be discoverable by the compiler.
+ */
+#define strtomem_pad(dest, src, pad)	do {				\
+	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+									\
+	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
+		     _dest_len == (size_t)-1);				\
+	memcpy_and_pad(dest, _dest_len, src, strnlen(src, _dest_len), pad); \
+} while (0)
+
+/**
+ * strtomem - Copy NUL-terminated string to non-NUL-terminated buffer
+ *
+ * @dest: Pointer of destination character array (marked as __nonstring)
+ * @src: Pointer to NUL-terminated string
+ *
+ * This is a replacement for strncpy() uses where the destination is not
+ * a NUL-terminated string, but with bounds checking on the source size, and
+ * without trailing padding. If padding is required, use strtomem_pad().
+ *
+ * Note that the size of @dest is not an argument, as the length of @dest
+ * must be discoverable by the compiler.
+ */
+#define strtomem(dest, src)	do {					\
+	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+									\
+	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
+		     _dest_len == (size_t)-1);				\
+	memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len)));	\
+} while (0)
+
 /**
  * memset_after - Set a value after a struct member to the end of a struct
  *
diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c
index 62f8ffcbbaa3..d22fa3838ee9 100644
--- a/lib/memcpy_kunit.c
+++ b/lib/memcpy_kunit.c
@@ -29,9 +29,8 @@ struct some_bytes {
 };
 
 #define check(instance, v) do {	\
-	int i;	\
 	BUILD_BUG_ON(sizeof(instance.data) != 32);	\
-	for (i = 0; i < sizeof(instance.data); i++) {	\
+	for (size_t i = 0; i < sizeof(instance.data); i++) {	\
 		KUNIT_ASSERT_EQ_MSG(test, instance.data[i], v, \
 			"line %d: '%s' not initialized to 0x%02x @ %d (saw 0x%02x)\n", \
 			__LINE__, #instance, v, i, instance.data[i]);	\
@@ -39,9 +38,8 @@ struct some_bytes {
 } while (0)
 
 #define compare(name, one, two) do { \
-	int i; \
 	BUILD_BUG_ON(sizeof(one) != sizeof(two)); \
-	for (i = 0; i < sizeof(one); i++) {	\
+	for (size_t i = 0; i < sizeof(one); i++) {	\
 		KUNIT_EXPECT_EQ_MSG(test, one.data[i], two.data[i], \
 			"line %d: %s.data[%d] (0x%02x) != %s.data[%d] (0x%02x)\n", \
 			__LINE__, #one, i, one.data[i], #two, i, two.data[i]); \
@@ -272,10 +270,63 @@ static void memset_test(struct kunit *test)
 #undef TEST_OP
 }
 
+static void strtomem_test(struct kunit *test)
+{
+	static const char input[] = "hi";
+	static const char truncate[] = "this is too long";
+	struct {
+		unsigned long canary1;
+		unsigned char output[sizeof(unsigned long)] __nonstring;
+		unsigned long canary2;
+	} wrap;
+
+	memset(&wrap, 0xFF, sizeof(wrap));
+	KUNIT_EXPECT_EQ_MSG(test, wrap.canary1, ULONG_MAX,
+			    "bad initial canary value");
+	KUNIT_EXPECT_EQ_MSG(test, wrap.canary2, ULONG_MAX,
+			    "bad initial canary value");
+
+	/* Check unpadded copy leaves surroundings untouched. */
+	strtomem(wrap.output, input);
+	KUNIT_EXPECT_EQ(test, wrap.canary1, ULONG_MAX);
+	KUNIT_EXPECT_EQ(test, wrap.output[0], input[0]);
+	KUNIT_EXPECT_EQ(test, wrap.output[1], input[1]);
+	for (size_t i = 2; i < sizeof(wrap.output); i++)
+		KUNIT_EXPECT_EQ(test, wrap.output[i], 0xFF);
+	KUNIT_EXPECT_EQ(test, wrap.canary2, ULONG_MAX);
+
+	/* Check truncated copy leaves surroundings untouched. */
+	memset(&wrap, 0xFF, sizeof(wrap));
+	strtomem(wrap.output, truncate);
+	KUNIT_EXPECT_EQ(test, wrap.canary1, ULONG_MAX);
+	for (size_t i = 0; i < sizeof(wrap.output); i++)
+		KUNIT_EXPECT_EQ(test, wrap.output[i], truncate[i]);
+	KUNIT_EXPECT_EQ(test, wrap.canary2, ULONG_MAX);
+
+	/* Check padded copy leaves only string padded. */
+	memset(&wrap, 0xFF, sizeof(wrap));
+	strtomem_pad(wrap.output, input, 0xAA);
+	KUNIT_EXPECT_EQ(test, wrap.canary1, ULONG_MAX);
+	KUNIT_EXPECT_EQ(test, wrap.output[0], input[0]);
+	KUNIT_EXPECT_EQ(test, wrap.output[1], input[1]);
+	for (size_t i = 2; i < sizeof(wrap.output); i++)
+		KUNIT_EXPECT_EQ(test, wrap.output[i], 0xAA);
+	KUNIT_EXPECT_EQ(test, wrap.canary2, ULONG_MAX);
+
+	/* Check truncated padded copy has no padding. */
+	memset(&wrap, 0xFF, sizeof(wrap));
+	strtomem(wrap.output, truncate);
+	KUNIT_EXPECT_EQ(test, wrap.canary1, ULONG_MAX);
+	for (size_t i = 0; i < sizeof(wrap.output); i++)
+		KUNIT_EXPECT_EQ(test, wrap.output[i], truncate[i]);
+	KUNIT_EXPECT_EQ(test, wrap.canary2, ULONG_MAX);
+}
+
 static struct kunit_case memcpy_test_cases[] = {
 	KUNIT_CASE(memset_test),
 	KUNIT_CASE(memcpy_test),
 	KUNIT_CASE(memmove_test),
+	KUNIT_CASE(strtomem_test),
 	{}
 };
 
-- 
cgit v1.2.3


From d07c0acb4f41cc42a0d97530946965b3e4fa68c1 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 2 Sep 2022 13:02:26 -0700
Subject: fortify: Fix __compiletime_strlen() under UBSAN_BOUNDS_LOCAL

With CONFIG_FORTIFY=y and CONFIG_UBSAN_LOCAL_BOUNDS=y enabled, we observe
a runtime panic while running Android's Compatibility Test Suite's (CTS)
android.hardware.input.cts.tests. This is stemming from a strlen()
call in hidinput_allocate().

__compiletime_strlen() is implemented in terms of __builtin_object_size(),
then does an array access to check for NUL-termination. A quirk of
__builtin_object_size() is that for strings whose values are runtime
dependent, __builtin_object_size(str, 1 or 0) returns the maximum size
of possible values when those sizes are determinable at compile time.
Example:

  static const char *v = "FOO BAR";
  static const char *y = "FOO BA";
  unsigned long x (int z) {
      // Returns 8, which is:
      // max(__builtin_object_size(v, 1), __builtin_object_size(y, 1))
      return __builtin_object_size(z ? v : y, 1);
  }

So when FORTIFY_SOURCE is enabled, the current implementation of
__compiletime_strlen() will try to access beyond the end of y at runtime
using the size of v. Mixed with UBSAN_LOCAL_BOUNDS we get a fault.

hidinput_allocate() has a local C string whose value is control flow
dependent on a switch statement, so __builtin_object_size(str, 1)
evaluates to the maximum string length, making all other cases fault on
the last character check. hidinput_allocate() could be cleaned up to
avoid runtime calls to strlen() since the local variable can only have
literal values, so there's no benefit to trying to fortify the strlen
call site there.

Perform a __builtin_constant_p() check against index 0 earlier in the
macro to filter out the control-flow-dependant case. Add a KUnit test
for checking the expected behavioral characteristics of FORTIFY_SOURCE
internals.

Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Tom Rix <trix@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Cc: David Gow <davidgow@google.com>
Cc: Yury Norov <yury.norov@gmail.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Sander Vanheule <sander@svanheule.net>
Cc: linux-hardening@vger.kernel.org
Cc: llvm@lists.linux.dev
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Android Treehugger Robot
Link: https://android-review.googlesource.com/c/kernel/common/+/2206839
Co-developed-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 8e8c2c87b1d5..be264091f7a7 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -19,7 +19,8 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("
 	unsigned char *__p = (unsigned char *)(p);		\
 	size_t __ret = (size_t)-1;				\
 	size_t __p_size = __builtin_object_size(p, 1);		\
-	if (__p_size != (size_t)-1) {				\
+	if (__p_size != (size_t)-1 &&				\
+	    __builtin_constant_p(*__p)) {			\
 		size_t __p_len = __p_size - 1;			\
 		if (__builtin_constant_p(__p[__p_len]) &&	\
 		    __p[__p_len] == '\0')			\
-- 
cgit v1.2.3


From 875bfd5276f31d09e811d31fca638b9f4d1205e8 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 2 Sep 2022 13:02:26 -0700
Subject: fortify: Add KUnit test for FORTIFY_SOURCE internals

Add lib/fortify_kunit.c KUnit test for checking the expected behavioral
characteristics of FORTIFY_SOURCE internals.

Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Tom Rix <trix@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Cc: Yury Norov <yury.norov@gmail.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Sander Vanheule <sander@svanheule.net>
Cc: linux-hardening@vger.kernel.org
Cc: llvm@lists.linux.dev
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 MAINTAINERS         |  1 +
 lib/Kconfig.debug   |  9 +++++++
 lib/Makefile        |  1 +
 lib/fortify_kunit.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+)
 create mode 100644 lib/fortify_kunit.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d7f64dc0efe..640115472199 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8002,6 +8002,7 @@ L:	linux-hardening@vger.kernel.org
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening
 F:	include/linux/fortify-string.h
+F:	lib/fortify_kunit.c
 F:	lib/test_fortify/*
 F:	scripts/test_fortify.sh
 K:	\b__NO_FORTIFY\b
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 36455953d306..1f267c0ddffd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2542,6 +2542,15 @@ config STACKINIT_KUNIT_TEST
 	  CONFIG_GCC_PLUGIN_STRUCTLEAK, CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF,
 	  or CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL.
 
+config FORTIFY_KUNIT_TEST
+	tristate "Test fortified str*() and mem*() function internals at runtime" if !KUNIT_ALL_TESTS
+	depends on KUNIT && FORTIFY_SOURCE
+	default KUNIT_ALL_TESTS
+	help
+	  Builds unit tests for checking internals of FORTIFY_SOURCE as used
+	  by the str*() and mem*() family of functions. For testing runtime
+	  traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests.
+
 config TEST_UDELAY
 	tristate "udelay test driver"
 	help
diff --git a/lib/Makefile b/lib/Makefile
index f545140ed9e7..4ee1ceae945a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -381,6 +381,7 @@ obj-$(CONFIG_IS_SIGNED_TYPE_KUNIT_TEST) += is_signed_type_kunit.o
 obj-$(CONFIG_OVERFLOW_KUNIT_TEST) += overflow_kunit.o
 CFLAGS_stackinit_kunit.o += $(call cc-disable-warning, switch-unreachable)
 obj-$(CONFIG_STACKINIT_KUNIT_TEST) += stackinit_kunit.o
+obj-$(CONFIG_FORTIFY_KUNIT_TEST) += fortify_kunit.o
 
 obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o
 
diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c
new file mode 100644
index 000000000000..99bc0ea60d27
--- /dev/null
+++ b/lib/fortify_kunit.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Runtime test cases for CONFIG_FORTIFY_SOURCE that aren't expected to
+ * Oops the kernel on success. (For those, see drivers/misc/lkdtm/fortify.c)
+ *
+ * For corner cases with UBSAN, try testing with:
+ *
+ * ./tools/testing/kunit/kunit.py run --arch=x86_64 \
+ *	--kconfig_add CONFIG_FORTIFY_SOURCE=y \
+ *	--kconfig_add CONFIG_UBSAN=y \
+ *	--kconfig_add CONFIG_UBSAN_TRAP=y \
+ *	--kconfig_add CONFIG_UBSAN_BOUNDS=y \
+ *	--kconfig_add CONFIG_UBSAN_LOCAL_BOUNDS=y \
+ *	--make_options LLVM=1 fortify
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <kunit/test.h>
+#include <linux/string.h>
+#include <linux/init.h>
+
+static const char array_of_10[] = "this is 10";
+static const char *ptr_of_11 = "this is 11!";
+static char array_unknown[] = "compiler thinks I might change";
+
+static void known_sizes_test(struct kunit *test)
+{
+	KUNIT_EXPECT_EQ(test, __compiletime_strlen("88888888"), 8);
+	KUNIT_EXPECT_EQ(test, __compiletime_strlen(array_of_10), 10);
+	KUNIT_EXPECT_EQ(test, __compiletime_strlen(ptr_of_11), 11);
+
+	KUNIT_EXPECT_EQ(test, __compiletime_strlen(array_unknown), SIZE_MAX);
+	/* Externally defined and dynamically sized string pointer: */
+	KUNIT_EXPECT_EQ(test, __compiletime_strlen(saved_command_line), SIZE_MAX);
+}
+
+/* This is volatile so the optimizer can't perform DCE below. */
+static volatile int pick;
+
+/* Not inline to keep optimizer from figuring out which string we want. */
+static noinline size_t want_minus_one(int pick)
+{
+	const char *str;
+
+	switch (pick) {
+	case 1:
+		str = "4444";
+		break;
+	case 2:
+		str = "333";
+		break;
+	default:
+		str = "1";
+		break;
+	}
+	return __compiletime_strlen(str);
+}
+
+static void control_flow_split_test(struct kunit *test)
+{
+	KUNIT_EXPECT_EQ(test, want_minus_one(pick), SIZE_MAX);
+}
+
+static struct kunit_case fortify_test_cases[] = {
+	KUNIT_CASE(known_sizes_test),
+	KUNIT_CASE(control_flow_split_test),
+	{}
+};
+
+static struct kunit_suite fortify_test_suite = {
+	.name = "fortify",
+	.test_cases = fortify_test_cases,
+};
+
+kunit_test_suite(fortify_test_suite);
+
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 311fb40aa0569abacc430b0d66ee41470803111f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 2 Sep 2022 13:23:06 -0700
Subject: fortify: Use SIZE_MAX instead of (size_t)-1

Clean up uses of "(size_t)-1" in favor of SIZE_MAX.

Cc: linux-hardening@vger.kernel.org
Suggested-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index be264091f7a7..e46af17d23d0 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -3,6 +3,7 @@
 #define _LINUX_FORTIFY_STRING_H_
 
 #include <linux/const.h>
+#include <linux/limits.h>
 
 #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable
 #define __RENAME(x) __asm__(#x)
@@ -17,9 +18,9 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("
 #define __compiletime_strlen(p)					\
 ({								\
 	unsigned char *__p = (unsigned char *)(p);		\
-	size_t __ret = (size_t)-1;				\
+	size_t __ret = SIZE_MAX;				\
 	size_t __p_size = __builtin_object_size(p, 1);		\
-	if (__p_size != (size_t)-1 &&				\
+	if (__p_size != SIZE_MAX &&				\
 	    __builtin_constant_p(*__p)) {			\
 		size_t __p_len = __p_size - 1;			\
 		if (__builtin_constant_p(__p[__p_len]) &&	\
@@ -127,7 +128,7 @@ char *strcat(char * const POS p, const char *q)
 {
 	size_t p_size = __builtin_object_size(p, 1);
 
-	if (p_size == (size_t)-1)
+	if (p_size == SIZE_MAX)
 		return __underlying_strcat(p, q);
 	if (strlcat(p, q, p_size) >= p_size)
 		fortify_panic(__func__);
@@ -142,7 +143,7 @@ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size
 	size_t ret;
 
 	/* We can take compile-time actions when maxlen is const. */
-	if (__builtin_constant_p(maxlen) && p_len != (size_t)-1) {
+	if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) {
 		/* If p is const, we can use its compile-time-known len. */
 		if (maxlen >= p_size)
 			return p_len;
@@ -170,7 +171,7 @@ __kernel_size_t __fortify_strlen(const char * const POS p)
 	size_t p_size = __builtin_object_size(p, 1);
 
 	/* Give up if we don't know how large p is. */
-	if (p_size == (size_t)-1)
+	if (p_size == SIZE_MAX)
 		return __underlying_strlen(p);
 	ret = strnlen(p, p_size);
 	if (p_size <= ret)
@@ -187,7 +188,7 @@ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, si
 	size_t q_len;	/* Full count of source string length. */
 	size_t len;	/* Count of characters going into destination. */
 
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
 		return __real_strlcpy(p, q, size);
 	q_len = strlen(q);
 	len = (q_len >= size) ? size - 1 : q_len;
@@ -215,7 +216,7 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s
 	size_t q_size = __builtin_object_size(q, 1);
 
 	/* If we cannot get size of p and q default to call strscpy. */
-	if (p_size == (size_t) -1 && q_size == (size_t) -1)
+	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
 		return __real_strscpy(p, q, size);
 
 	/*
@@ -260,7 +261,7 @@ char *strncat(char * const POS p, const char * const POS q, __kernel_size_t coun
 	size_t p_size = __builtin_object_size(p, 1);
 	size_t q_size = __builtin_object_size(q, 1);
 
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
 		return __underlying_strncat(p, q, count);
 	p_len = strlen(p);
 	copy_len = strnlen(q, count);
@@ -301,10 +302,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size,
 	/*
 	 * Always stop accesses beyond the struct that contains the
 	 * field, when the buffer's remaining size is known.
-	 * (The -1 test is to optimize away checks where the buffer
+	 * (The SIZE_MAX test is to optimize away checks where the buffer
 	 * lengths are unknown.)
 	 */
-	if (p_size != (size_t)(-1) && p_size < size)
+	if (p_size != SIZE_MAX && p_size < size)
 		fortify_panic("memset");
 }
 
@@ -395,11 +396,11 @@ __FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size,
 	/*
 	 * Always stop accesses beyond the struct that contains the
 	 * field, when the buffer's remaining size is known.
-	 * (The -1 test is to optimize away checks where the buffer
+	 * (The SIZE_MAX test is to optimize away checks where the buffer
 	 * lengths are unknown.)
 	 */
-	if ((p_size != (size_t)(-1) && p_size < size) ||
-	    (q_size != (size_t)(-1) && q_size < size))
+	if ((p_size != SIZE_MAX && p_size < size) ||
+	    (q_size != SIZE_MAX && q_size < size))
 		fortify_panic(func);
 }
 
@@ -498,7 +499,7 @@ char *strcpy(char * const POS p, const char * const POS q)
 	size_t size;
 
 	/* If neither buffer size is known, immediately give up. */
-	if (p_size == (size_t)-1 && q_size == (size_t)-1)
+	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
 		return __underlying_strcpy(p, q);
 	size = strlen(q) + 1;
 	/* Compile-time check for const size overflow. */
-- 
cgit v1.2.3


From 54d9469bc515dc5fcbc20eecbe19cea868b70d68 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 24 Jun 2021 15:39:26 -0700
Subject: fortify: Add run-time WARN for cross-field memcpy()

Enable run-time checking of dynamic memcpy() and memmove() lengths,
issuing a WARN when a write would exceed the size of the target struct
member, when built with CONFIG_FORTIFY_SOURCE=y. This would have
caught all of the memcpy()-based buffer overflows in the last 3 years,
specifically covering all the cases where the destination buffer size
is known at compile time.

This change ONLY adds a run-time warning. As false positives are currently
still expected, this will not block the overflow. The new warnings will
look like this:

  memcpy: detected field-spanning write (size N) of single field "var->dest" (size M)
  WARNING: CPU: n PID: pppp at source/file/path.c:nr function+0xXX/0xXX [module]

There may be false positives in the kernel where intentional
field-spanning writes are happening. These need to be addressed
similarly to how the compile-time cases were addressed: add a
struct_group(), split the memcpy(), or some other refactoring.

In order to make counting/investigating instances of added runtime checks
easier, each instance includes the destination variable name as a WARN
argument, prefixed with 'field "'. Therefore, on an x86_64 defconfig
build, it is trivial to inspect the build artifacts to find instances.
For example on an x86_64 defconfig build, there are 78 new run-time
memcpy() bounds checks added:

  $ for i in vmlinux $(find . -name '*.ko'); do \
      strings "$i" | grep '^field "'; done | wc -l
  78

Simple cases where a destination buffer is known to be a dynamic size
do not generate a WARN. For example:

struct normal_flex_array {
	void *a;
	int b;
	u32 c;
	size_t array_size;
	u8 flex_array[];
};

struct normal_flex_array *instance;
...
/* These will be ignored for run-time bounds checking. */
memcpy(instance, src, len);
memcpy(instance->flex_array, src, len);

However, one of the dynamic-sized destination cases is irritatingly
unable to be detected by the compiler: when using memcpy() to target
a composite struct member which contains a trailing flexible array
struct. For example:

struct wrapper {
	int foo;
	char bar;
	struct normal_flex_array embedded;
};

struct wrapper *instance;
...
/* This will incorrectly WARN when len > sizeof(instance->embedded) */
memcpy(&instance->embedded, src, len);

These cases end up appearing to the compiler to be sized as if the
flexible array had 0 elements. :( For more details see:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832
https://godbolt.org/z/vW6x8vh4P

These "composite flexible array structure destination" cases will be
need to be flushed out and addressed on a case-by-case basis.

Regardless, for the general case of using memcpy() on flexible array
destinations, future APIs will be created to handle common cases. Those
can be used to migrate away from open-coded memcpy() so that proper
error handling (instead of trapping) can be used.

As mentioned, none of these bounds checks block any overflows
currently. For users that have tested their workloads, do not encounter
any warnings, and wish to make these checks stop any overflows, they
can use a big hammer and set the sysctl panic_on_warn=1.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 70 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 3 deletions(-)

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index e46af17d23d0..ff879efe94ed 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -2,6 +2,7 @@
 #ifndef _LINUX_FORTIFY_STRING_H_
 #define _LINUX_FORTIFY_STRING_H_
 
+#include <linux/bug.h>
 #include <linux/const.h>
 #include <linux/limits.h>
 
@@ -353,7 +354,7 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size,
  * V = vulnerable to run-time overflow (will need refactoring to solve)
  *
  */
-__FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size,
+__FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 					 const size_t p_size,
 					 const size_t q_size,
 					 const size_t p_size_field,
@@ -402,16 +403,79 @@ __FORTIFY_INLINE void fortify_memcpy_chk(__kernel_size_t size,
 	if ((p_size != SIZE_MAX && p_size < size) ||
 	    (q_size != SIZE_MAX && q_size < size))
 		fortify_panic(func);
+
+	/*
+	 * Warn when writing beyond destination field size.
+	 *
+	 * We must ignore p_size_field == 0 for existing 0-element
+	 * fake flexible arrays, until they are all converted to
+	 * proper flexible arrays.
+	 *
+	 * The implementation of __builtin_object_size() behaves
+	 * like sizeof() when not directly referencing a flexible
+	 * array member, which means there will be many bounds checks
+	 * that will appear at run-time, without a way for them to be
+	 * detected at compile-time (as can be done when the destination
+	 * is specifically the flexible array member).
+	 * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832
+	 */
+	if (p_size_field != 0 && p_size_field != SIZE_MAX &&
+	    p_size != p_size_field && p_size_field < size)
+		return true;
+
+	return false;
 }
 
 #define __fortify_memcpy_chk(p, q, size, p_size, q_size,		\
 			     p_size_field, q_size_field, op) ({		\
 	size_t __fortify_size = (size_t)(size);				\
-	fortify_memcpy_chk(__fortify_size, p_size, q_size,		\
-			   p_size_field, q_size_field, #op);		\
+	WARN_ONCE(fortify_memcpy_chk(__fortify_size, p_size, q_size,	\
+				     p_size_field, q_size_field, #op),	\
+		  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
+		  __fortify_size,					\
+		  "field \"" #p "\" at " __FILE__ ":" __stringify(__LINE__), \
+		  p_size_field);					\
 	__underlying_##op(p, q, __fortify_size);			\
 })
 
+/*
+ * Notes about compile-time buffer size detection:
+ *
+ * With these types...
+ *
+ *	struct middle {
+ *		u16 a;
+ *		u8 middle_buf[16];
+ *		int b;
+ *	};
+ *	struct end {
+ *		u16 a;
+ *		u8 end_buf[16];
+ *	};
+ *	struct flex {
+ *		int a;
+ *		u8 flex_buf[];
+ *	};
+ *
+ *	void func(TYPE *ptr) { ... }
+ *
+ * Cases where destination size cannot be currently detected:
+ * - the size of ptr's object (seemingly by design, gcc & clang fail):
+ *	__builtin_object_size(ptr, 1) == SIZE_MAX
+ * - the size of flexible arrays in ptr's obj (by design, dynamic size):
+ *	__builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX
+ * - the size of ANY array at the end of ptr's obj (gcc and clang bug):
+ *	__builtin_object_size(ptr->end_buf, 1) == SIZE_MAX
+ *	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836
+ *
+ * Cases where destination size is currently detected:
+ * - the size of non-array members within ptr's object:
+ *	__builtin_object_size(ptr->a, 1) == 2
+ * - the size of non-flexible-array in the middle of ptr's obj:
+ *	__builtin_object_size(ptr->middle_buf, 1) == 16
+ *
+ */
+
 /*
  * __builtin_object_size() must be captured here to avoid evaluating argument
  * side-effects further into the macro layers.
-- 
cgit v1.2.3


From 325bf6d84bad3fc641b94fad6e69c70e960fdf2e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 31 Aug 2022 23:05:01 -0700
Subject: lkdtm: Update tests for memcpy() run-time warnings

Clarify the LKDTM FORTIFY tests, and add tests for the mem*() family of
functions, now that run-time checking is distinct.

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Shuah Khan <shuah@kernel.org>
Cc: linux-kselftest@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/misc/lkdtm/fortify.c            | 96 ++++++++++++++++++++++++++++-----
 tools/testing/selftests/lkdtm/tests.txt |  8 +--
 2 files changed, 88 insertions(+), 16 deletions(-)

diff --git a/drivers/misc/lkdtm/fortify.c b/drivers/misc/lkdtm/fortify.c
index 080293fa3c52..015927665678 100644
--- a/drivers/misc/lkdtm/fortify.c
+++ b/drivers/misc/lkdtm/fortify.c
@@ -10,28 +10,31 @@
 
 static volatile int fortify_scratch_space;
 
-static void lkdtm_FORTIFIED_OBJECT(void)
+static void lkdtm_FORTIFY_STR_OBJECT(void)
 {
 	struct target {
 		char a[10];
-	} target[2] = {};
+		int foo;
+	} target[3] = {};
 	/*
 	 * Using volatile prevents the compiler from determining the value of
 	 * 'size' at compile time. Without that, we would get a compile error
 	 * rather than a runtime error.
 	 */
-	volatile int size = 11;
+	volatile int size = 20;
+
+	pr_info("trying to strcmp() past the end of a struct\n");
 
-	pr_info("trying to read past the end of a struct\n");
+	strncpy(target[0].a, target[1].a, size);
 
 	/* Store result to global to prevent the code from being eliminated */
-	fortify_scratch_space = memcmp(&target[0], &target[1], size);
+	fortify_scratch_space = target[0].a[3];
 
-	pr_err("FAIL: fortify did not block an object overread!\n");
+	pr_err("FAIL: fortify did not block a strncpy() object write overflow!\n");
 	pr_expected_config(CONFIG_FORTIFY_SOURCE);
 }
 
-static void lkdtm_FORTIFIED_SUBOBJECT(void)
+static void lkdtm_FORTIFY_STR_MEMBER(void)
 {
 	struct target {
 		char a[10];
@@ -44,7 +47,7 @@ static void lkdtm_FORTIFIED_SUBOBJECT(void)
 	strscpy(src, "over ten bytes", size);
 	size = strlen(src) + 1;
 
-	pr_info("trying to strncpy past the end of a member of a struct\n");
+	pr_info("trying to strncpy() past the end of a struct member...\n");
 
 	/*
 	 * strncpy(target.a, src, 20); will hit a compile error because the
@@ -56,7 +59,72 @@ static void lkdtm_FORTIFIED_SUBOBJECT(void)
 	/* Store result to global to prevent the code from being eliminated */
 	fortify_scratch_space = target.a[3];
 
-	pr_err("FAIL: fortify did not block an sub-object overrun!\n");
+	pr_err("FAIL: fortify did not block a strncpy() struct member write overflow!\n");
+	pr_expected_config(CONFIG_FORTIFY_SOURCE);
+
+	kfree(src);
+}
+
+static void lkdtm_FORTIFY_MEM_OBJECT(void)
+{
+	int before[10];
+	struct target {
+		char a[10];
+		int foo;
+	} target = {};
+	int after[10];
+	/*
+	 * Using volatile prevents the compiler from determining the value of
+	 * 'size' at compile time. Without that, we would get a compile error
+	 * rather than a runtime error.
+	 */
+	volatile int size = 20;
+
+	memset(before, 0, sizeof(before));
+	memset(after, 0, sizeof(after));
+	fortify_scratch_space = before[5];
+	fortify_scratch_space = after[5];
+
+	pr_info("trying to memcpy() past the end of a struct\n");
+
+	pr_info("0: %zu\n", __builtin_object_size(&target, 0));
+	pr_info("1: %zu\n", __builtin_object_size(&target, 1));
+	pr_info("s: %d\n", size);
+	memcpy(&target, &before, size);
+
+	/* Store result to global to prevent the code from being eliminated */
+	fortify_scratch_space = target.a[3];
+
+	pr_err("FAIL: fortify did not block a memcpy() object write overflow!\n");
+	pr_expected_config(CONFIG_FORTIFY_SOURCE);
+}
+
+static void lkdtm_FORTIFY_MEM_MEMBER(void)
+{
+	struct target {
+		char a[10];
+		char b[10];
+	} target;
+	volatile int size = 20;
+	char *src;
+
+	src = kmalloc(size, GFP_KERNEL);
+	strscpy(src, "over ten bytes", size);
+	size = strlen(src) + 1;
+
+	pr_info("trying to memcpy() past the end of a struct member...\n");
+
+	/*
+	 * strncpy(target.a, src, 20); will hit a compile error because the
+	 * compiler knows at build time that target.a < 20 bytes. Use a
+	 * volatile to force a runtime error.
+	 */
+	memcpy(target.a, src, size);
+
+	/* Store result to global to prevent the code from being eliminated */
+	fortify_scratch_space = target.a[3];
+
+	pr_err("FAIL: fortify did not block a memcpy() struct member write overflow!\n");
 	pr_expected_config(CONFIG_FORTIFY_SOURCE);
 
 	kfree(src);
@@ -67,7 +135,7 @@ static void lkdtm_FORTIFIED_SUBOBJECT(void)
  * strscpy and generate a panic because there is a write overflow (i.e. src
  * length is greater than dst length).
  */
-static void lkdtm_FORTIFIED_STRSCPY(void)
+static void lkdtm_FORTIFY_STRSCPY(void)
 {
 	char *src;
 	char dst[5];
@@ -136,9 +204,11 @@ static void lkdtm_FORTIFIED_STRSCPY(void)
 }
 
 static struct crashtype crashtypes[] = {
-	CRASHTYPE(FORTIFIED_OBJECT),
-	CRASHTYPE(FORTIFIED_SUBOBJECT),
-	CRASHTYPE(FORTIFIED_STRSCPY),
+	CRASHTYPE(FORTIFY_STR_OBJECT),
+	CRASHTYPE(FORTIFY_STR_MEMBER),
+	CRASHTYPE(FORTIFY_MEM_OBJECT),
+	CRASHTYPE(FORTIFY_MEM_MEMBER),
+	CRASHTYPE(FORTIFY_STRSCPY),
 };
 
 struct crashtype_category fortify_crashtypes = {
diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt
index 65e53eb0840b..607b8d7e3ea3 100644
--- a/tools/testing/selftests/lkdtm/tests.txt
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -75,7 +75,9 @@ USERCOPY_KERNEL
 STACKLEAK_ERASING OK: the rest of the thread stack is properly erased
 CFI_FORWARD_PROTO
 CFI_BACKWARD call trace:|ok: control flow unchanged
-FORTIFIED_STRSCPY
-FORTIFIED_OBJECT
-FORTIFIED_SUBOBJECT
+FORTIFY_STRSCPY detected buffer overflow
+FORTIFY_STR_OBJECT detected buffer overflow
+FORTIFY_STR_MEMBER detected buffer overflow
+FORTIFY_MEM_OBJECT detected buffer overflow
+FORTIFY_MEM_MEMBER detected field-spanning write
 PPC_SLB_MULTIHIT Recovered
-- 
cgit v1.2.3


From ba38961a069b0d8d03b53218a6c29d737577d448 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 9 Feb 2022 16:32:24 -0800
Subject: um: Enable FORTIFY_SOURCE

Enable FORTIFY_SOURCE so running Kunit tests can test fortified
functions.

Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: David Gow <davidgow@google.com>
Link: https://lore.kernel.org/r/20220210003224.773957-1-keescook@chromium.org
---
 arch/um/Kconfig              | 1 +
 arch/um/os-Linux/user_syms.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 78de31ac1da7..ad4ff3b0e91e 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -6,6 +6,7 @@ config UML
 	bool
 	default y
 	select ARCH_EPHEMERAL_INODES
+	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_STRNCPY_FROM_USER
diff --git a/arch/um/os-Linux/user_syms.c b/arch/um/os-Linux/user_syms.c
index cb667c9225ab..fd575ecbcaec 100644
--- a/arch/um/os-Linux/user_syms.c
+++ b/arch/um/os-Linux/user_syms.c
@@ -1,4 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
+#define __NO_FORTIFY
 #include <linux/types.h>
 #include <linux/module.h>
 
-- 
cgit v1.2.3


From aafc203bbad4bf6cf394a34ea698c2b0b8affae0 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Mon, 29 Aug 2022 17:46:10 -0700
Subject: LoadPin: Fix Kconfig doc about format of file with verity digests

The doc for CONFIG_SECURITY_LOADPIN_VERITY says that the file with verity
digests must contain a comma separated list of digests. That was the case
at some stage of the development, but was changed during the review
process to one digest per line. Update the Kconfig doc accordingly.

Reported-by: Jae Hoon Kim <kimjae@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Fixes: 3f805f8cc23b ("LoadPin: Enable loading from trusted dm-verity devices")
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220829174557.1.I5d202d1344212a3800d9828f936df6511eb2d0d1@changeid
---
 security/loadpin/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig
index 70e7985b2561..994c1d9376e6 100644
--- a/security/loadpin/Kconfig
+++ b/security/loadpin/Kconfig
@@ -33,4 +33,4 @@ config SECURITY_LOADPIN_VERITY
 	  on the LoadPin securityfs entry 'dm-verity'. The ioctl
 	  expects a file descriptor of a file with verity digests as
 	  parameter. The file must be located on the pinned root and
-	  contain a comma separated list of digests.
+	  contain one digest per line.
-- 
cgit v1.2.3


From 916ef6232cc4b84db7082b4c3d3cf1753d9462ba Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 7 Sep 2022 13:30:58 -0700
Subject: dm: verity-loadpin: Only trust verity targets with enforcement

Verity targets can be configured to ignore corrupted data blocks.
LoadPin must only trust verity targets that are configured to
perform some kind of enforcement when data corruption is detected,
like returning an error, restarting the system or triggering a
panic.

Fixes: b6c1c5745ccc ("dm: Add verity helpers for LoadPin")
Reported-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Reviewed-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
Cc: stable@vger.kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220907133055.1.Ic8a1dafe960dc0f8302e189642bc88ebb785d274@changeid
---
 drivers/md/dm-verity-loadpin.c |  8 ++++++++
 drivers/md/dm-verity-target.c  | 16 ++++++++++++++++
 drivers/md/dm-verity.h         |  1 +
 3 files changed, 25 insertions(+)

diff --git a/drivers/md/dm-verity-loadpin.c b/drivers/md/dm-verity-loadpin.c
index 387ec43aef72..4f78cc55c251 100644
--- a/drivers/md/dm-verity-loadpin.c
+++ b/drivers/md/dm-verity-loadpin.c
@@ -14,6 +14,7 @@ LIST_HEAD(dm_verity_loadpin_trusted_root_digests);
 
 static bool is_trusted_verity_target(struct dm_target *ti)
 {
+	int verity_mode;
 	u8 *root_digest;
 	unsigned int digest_size;
 	struct dm_verity_loadpin_trusted_root_digest *trd;
@@ -22,6 +23,13 @@ static bool is_trusted_verity_target(struct dm_target *ti)
 	if (!dm_is_verity_target(ti))
 		return false;
 
+	verity_mode = dm_verity_get_mode(ti);
+
+	if ((verity_mode != DM_VERITY_MODE_EIO) &&
+	    (verity_mode != DM_VERITY_MODE_RESTART) &&
+	    (verity_mode != DM_VERITY_MODE_PANIC))
+		return false;
+
 	if (dm_verity_get_root_digest(ti, &root_digest, &digest_size))
 		return false;
 
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 94b6cb599db4..8a00cc42e498 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -1446,6 +1446,22 @@ bool dm_is_verity_target(struct dm_target *ti)
 	return ti->type->module == THIS_MODULE;
 }
 
+/*
+ * Get the verity mode (error behavior) of a verity target.
+ *
+ * Returns the verity mode of the target, or -EINVAL if 'ti' is not a verity
+ * target.
+ */
+int dm_verity_get_mode(struct dm_target *ti)
+{
+	struct dm_verity *v = ti->private;
+
+	if (!dm_is_verity_target(ti))
+		return -EINVAL;
+
+	return v->mode;
+}
+
 /*
  * Get the root digest of a verity target.
  *
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 45455de1b4bc..98f306ec6a33 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -134,6 +134,7 @@ extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
 				 sector_t block, u8 *digest, bool *is_zero);
 
 extern bool dm_is_verity_target(struct dm_target *ti);
+extern int dm_verity_get_mode(struct dm_target *ti);
 extern int dm_verity_get_root_digest(struct dm_target *ti, u8 **root_digest,
 				     unsigned int *digest_size);
 
-- 
cgit v1.2.3


From 6e42aec7c75947e0d6b38400628f171364eb8231 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Tue, 6 Sep 2022 18:18:12 -0700
Subject: LoadPin: Require file with verity root digests to have a header

LoadPin expects the file with trusted verity root digests to be
an ASCII file with one digest (hex value) per line. A pinned
root could contain files that meet these format requirements,
even though the hex values don't represent trusted root
digests.

Add a new requirement to the file format which consists in
the first line containing a fixed string. This prevents
attackers from feeding files with an otherwise valid format
to LoadPin.

Suggested-by: Sarthak Kukreti <sarthakkukreti@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220906181725.1.I3f51d1bb0014e5a5951be4ad3c5ad7c7ca1dfc32@changeid
---
 security/loadpin/Kconfig   |  7 ++++++-
 security/loadpin/loadpin.c | 16 +++++++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/security/loadpin/Kconfig b/security/loadpin/Kconfig
index 994c1d9376e6..6724eaba3d36 100644
--- a/security/loadpin/Kconfig
+++ b/security/loadpin/Kconfig
@@ -33,4 +33,9 @@ config SECURITY_LOADPIN_VERITY
 	  on the LoadPin securityfs entry 'dm-verity'. The ioctl
 	  expects a file descriptor of a file with verity digests as
 	  parameter. The file must be located on the pinned root and
-	  contain one digest per line.
+	  start with the line:
+
+	  # LOADPIN_TRUSTED_VERITY_ROOT_DIGESTS
+
+	  This is followed by the verity digests, with one digest per
+	  line.
diff --git a/security/loadpin/loadpin.c b/security/loadpin/loadpin.c
index 44521582dcba..de41621f4998 100644
--- a/security/loadpin/loadpin.c
+++ b/security/loadpin/loadpin.c
@@ -21,6 +21,8 @@
 #include <linux/dm-verity-loadpin.h>
 #include <uapi/linux/loadpin.h>
 
+#define VERITY_DIGEST_FILE_HEADER "# LOADPIN_TRUSTED_VERITY_ROOT_DIGESTS"
+
 static void report_load(const char *origin, struct file *file, char *operation)
 {
 	char *cmdline, *pathname;
@@ -292,9 +294,21 @@ static int read_trusted_verity_root_digests(unsigned int fd)
 
 	p = strim(data);
 	while ((d = strsep(&p, "\n")) != NULL) {
-		int len = strlen(d);
+		int len;
 		struct dm_verity_loadpin_trusted_root_digest *trd;
 
+		if (d == data) {
+			/* first line, validate header */
+			if (strcmp(d, VERITY_DIGEST_FILE_HEADER)) {
+				rc = -EPROTO;
+				goto err;
+			}
+
+			continue;
+		}
+
+		len = strlen(d);
+
 		if (len % 2) {
 			rc = -EPROTO;
 			goto err;
-- 
cgit v1.2.3


From 98388bda6a99d76309f81584f2bc0d773bdf8b35 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Wed, 7 Sep 2022 11:03:29 -0700
Subject: lib: Improve the is_signed_type() kunit test

Since the definition of is_signed_type() has been moved from
<linux/overflow.h> to <linux/compiler.h>, include the latter header file
instead of the former. Additionally, add a test for the type 'char'.

Cc: Isabella Basso <isabbasso@riseup.net>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220907180329.3825417-1-bvanassche@acm.org
---
 lib/is_signed_type_kunit.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/is_signed_type_kunit.c b/lib/is_signed_type_kunit.c
index f2eedb1f0935..207207522925 100644
--- a/lib/is_signed_type_kunit.c
+++ b/lib/is_signed_type_kunit.c
@@ -5,7 +5,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <kunit/test.h>
-#include <linux/overflow.h>
+#include <linux/compiler.h>
 
 enum unsigned_enum {
 	constant_a = 3,
@@ -21,6 +21,11 @@ static void is_signed_type_test(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, is_signed_type(bool), false);
 	KUNIT_EXPECT_EQ(test, is_signed_type(signed char), true);
 	KUNIT_EXPECT_EQ(test, is_signed_type(unsigned char), false);
+#ifdef __CHAR_UNSIGNED__
+	KUNIT_EXPECT_EQ(test, is_signed_type(char), false);
+#else
+	KUNIT_EXPECT_EQ(test, is_signed_type(char), true);
+#endif
 	KUNIT_EXPECT_EQ(test, is_signed_type(int), true);
 	KUNIT_EXPECT_EQ(test, is_signed_type(unsigned int), false);
 	KUNIT_EXPECT_EQ(test, is_signed_type(long), true);
-- 
cgit v1.2.3


From 66cb2a36a96f6facbcb4ef1db967b8e9ea6910fe Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 7 Sep 2022 16:27:06 -0700
Subject: kunit/memcpy: Avoid pathological compile-time string size

The memcpy() KUnit tests are trying to sanity-check run-time behaviors,
but tripped compile-time warnings about a pathological condition of a
too-small buffer being used for input. Avoid this by explicitly resizing
the buffer, but leaving the string short. Avoid the following warning:

lib/memcpy_kunit.c: In function 'strtomem_test':
include/linux/string.h:303:42: warning: 'strnlen' specified bound 4 exceeds source size 3 [-Wstringop-overread]
  303 |         memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len)));     \
include/linux/minmax.h:32:39: note: in definition of macro '__cmp_once'
   32 |                 typeof(y) unique_y = (y);               \
      |                                       ^
include/linux/minmax.h:45:25: note: in expansion of macro '__careful_cmp'
   45 | #define min(x, y)       __careful_cmp(x, y, <)
      |                         ^~~~~~~~~~~~~
include/linux/string.h:303:27: note: in expansion of macro 'min'
  303 |         memcpy(dest, src, min(_dest_len, strnlen(src, _dest_len)));     \
      |                           ^~~
lib/memcpy_kunit.c:290:9: note: in expansion of macro 'strtomem'
  290 |         strtomem(wrap.output, input);
      |         ^~~~~~~~
lib/memcpy_kunit.c:275:27: note: source object allocated here
  275 |         static const char input[] = "hi";
      |                           ^~~~~

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/linux-mm/202209070728.o3stvgVt-lkp@intel.com
Fixes: dfbafa70bde2 ("string: Introduce strtomem() and strtomem_pad()")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 lib/memcpy_kunit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c
index d22fa3838ee9..2b5cc70ac53f 100644
--- a/lib/memcpy_kunit.c
+++ b/lib/memcpy_kunit.c
@@ -272,7 +272,7 @@ static void memset_test(struct kunit *test)
 
 static void strtomem_test(struct kunit *test)
 {
-	static const char input[] = "hi";
+	static const char input[sizeof(unsigned long)] = "hi";
 	static const char truncate[] = "this is too long";
 	struct {
 		unsigned long canary1;
-- 
cgit v1.2.3


From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Sun, 4 Sep 2022 22:41:28 +0200
Subject: bpf: Add helper macro bpf_for_each_reg_in_vstate

For a lot of use cases in future patches, we will want to modify the
state of registers part of some same 'group' (e.g. same ref_obj_id). It
won't just be limited to releasing reference state, but setting a type
flag dynamically based on certain actions, etc.

Hence, we need a way to easily pass a callback to the function that
iterates over all registers in current bpf_verifier_state in all frames
upto (and including) the curframe.

While in C++ we would be able to easily use a lambda to pass state and
the callback together, sadly we aren't using C++ in the kernel. The next
best thing to avoid defining a function for each case seems like
statement expressions in GNU C. The kernel already uses them heavily,
hence they can passed to the macro in the style of a lambda. The
statement expression will then be substituted in the for loop bodies.

Variables __state and __reg are set to current bpf_func_state and reg
for each invocation of the expression inside the passed in verifier
state.

Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers,
release_reference, find_good_pkt_pointers, find_equal_scalars to
use bpf_for_each_reg_in_vstate.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  21 +++++++
 kernel/bpf/verifier.c        | 135 +++++++++----------------------------------
 2 files changed, 49 insertions(+), 107 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 8fbc1d05281e..b49a349cc6ae 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -348,6 +348,27 @@ struct bpf_verifier_state {
 	     iter < frame->allocated_stack / BPF_REG_SIZE;		\
 	     iter++, reg = bpf_get_spilled_reg(iter, frame))
 
+/* Invoke __expr over regsiters in __vst, setting __state and __reg */
+#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr)   \
+	({                                                               \
+		struct bpf_verifier_state *___vstate = __vst;            \
+		int ___i, ___j;                                          \
+		for (___i = 0; ___i <= ___vstate->curframe; ___i++) {    \
+			struct bpf_reg_state *___regs;                   \
+			__state = ___vstate->frame[___i];                \
+			___regs = __state->regs;                         \
+			for (___j = 0; ___j < MAX_BPF_REG; ___j++) {     \
+				__reg = &___regs[___j];                  \
+				(void)(__expr);                          \
+			}                                                \
+			bpf_for_each_spilled_reg(___j, __state, __reg) { \
+				if (!__reg)                              \
+					continue;                        \
+				(void)(__expr);                          \
+			}                                                \
+		}                                                        \
+	})
+
 /* linked list of verifier states used to prune search */
 struct bpf_verifier_state_list {
 	struct bpf_verifier_state state;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f3344a86d88d..c0f175ac187a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6513,31 +6513,15 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
 /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
  * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
-static void __clear_all_pkt_pointers(struct bpf_verifier_env *env,
-				     struct bpf_func_state *state)
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
-	struct bpf_reg_state *regs = state->regs, *reg;
-	int i;
-
-	for (i = 0; i < MAX_BPF_REG; i++)
-		if (reg_is_pkt_pointer_any(&regs[i]))
-			mark_reg_unknown(env, regs, i);
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
 
-	bpf_for_each_spilled_reg(i, state, reg) {
-		if (!reg)
-			continue;
+	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
 		if (reg_is_pkt_pointer_any(reg))
 			__mark_reg_unknown(env, reg);
-	}
-}
-
-static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
-{
-	struct bpf_verifier_state *vstate = env->cur_state;
-	int i;
-
-	for (i = 0; i <= vstate->curframe; i++)
-		__clear_all_pkt_pointers(env, vstate->frame[i]);
+	}));
 }
 
 enum {
@@ -6566,41 +6550,24 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
 		reg->range = AT_PKT_END;
 }
 
-static void release_reg_references(struct bpf_verifier_env *env,
-				   struct bpf_func_state *state,
-				   int ref_obj_id)
-{
-	struct bpf_reg_state *regs = state->regs, *reg;
-	int i;
-
-	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].ref_obj_id == ref_obj_id)
-			mark_reg_unknown(env, regs, i);
-
-	bpf_for_each_spilled_reg(i, state, reg) {
-		if (!reg)
-			continue;
-		if (reg->ref_obj_id == ref_obj_id)
-			__mark_reg_unknown(env, reg);
-	}
-}
-
 /* The pointer with the specified id has released its reference to kernel
  * resources. Identify all copies of the same pointer and clear the reference.
  */
 static int release_reference(struct bpf_verifier_env *env,
 			     int ref_obj_id)
 {
-	struct bpf_verifier_state *vstate = env->cur_state;
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
 	int err;
-	int i;
 
 	err = release_reference_state(cur_func(env), ref_obj_id);
 	if (err)
 		return err;
 
-	for (i = 0; i <= vstate->curframe; i++)
-		release_reg_references(env, vstate->frame[i], ref_obj_id);
+	bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+		if (reg->ref_obj_id == ref_obj_id)
+			__mark_reg_unknown(env, reg);
+	}));
 
 	return 0;
 }
@@ -9335,34 +9302,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
-static void __find_good_pkt_pointers(struct bpf_func_state *state,
-				     struct bpf_reg_state *dst_reg,
-				     enum bpf_reg_type type, int new_range)
-{
-	struct bpf_reg_state *reg;
-	int i;
-
-	for (i = 0; i < MAX_BPF_REG; i++) {
-		reg = &state->regs[i];
-		if (reg->type == type && reg->id == dst_reg->id)
-			/* keep the maximum range already checked */
-			reg->range = max(reg->range, new_range);
-	}
-
-	bpf_for_each_spilled_reg(i, state, reg) {
-		if (!reg)
-			continue;
-		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max(reg->range, new_range);
-	}
-}
-
 static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 				   struct bpf_reg_state *dst_reg,
 				   enum bpf_reg_type type,
 				   bool range_right_open)
 {
-	int new_range, i;
+	struct bpf_func_state *state;
+	struct bpf_reg_state *reg;
+	int new_range;
 
 	if (dst_reg->off < 0 ||
 	    (dst_reg->off == 0 && range_right_open))
@@ -9427,9 +9374,11 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
 	 * the range won't allow anything.
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
-	for (i = 0; i <= vstate->curframe; i++)
-		__find_good_pkt_pointers(vstate->frame[i], dst_reg, type,
-					 new_range);
+	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+		if (reg->type == type && reg->id == dst_reg->id)
+			/* keep the maximum range already checked */
+			reg->range = max(reg->range, new_range);
+	}));
 }
 
 static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
@@ -9918,7 +9867,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 
 		if (!reg_may_point_to_spin_lock(reg)) {
 			/* For not-NULL ptr, reg->ref_obj_id will be reset
-			 * in release_reg_references().
+			 * in release_reference().
 			 *
 			 * reg->id is still used by spin_lock ptr. Other
 			 * than spin_lock ptr type, reg->id can be reset.
@@ -9928,22 +9877,6 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 	}
 }
 
-static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id,
-				    bool is_null)
-{
-	struct bpf_reg_state *reg;
-	int i;
-
-	for (i = 0; i < MAX_BPF_REG; i++)
-		mark_ptr_or_null_reg(state, &state->regs[i], id, is_null);
-
-	bpf_for_each_spilled_reg(i, state, reg) {
-		if (!reg)
-			continue;
-		mark_ptr_or_null_reg(state, reg, id, is_null);
-	}
-}
-
 /* The logic is similar to find_good_pkt_pointers(), both could eventually
  * be folded together at some point.
  */
@@ -9951,10 +9884,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 				  bool is_null)
 {
 	struct bpf_func_state *state = vstate->frame[vstate->curframe];
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs = state->regs, *reg;
 	u32 ref_obj_id = regs[regno].ref_obj_id;
 	u32 id = regs[regno].id;
-	int i;
 
 	if (ref_obj_id && ref_obj_id == id && is_null)
 		/* regs[regno] is in the " == NULL" branch.
@@ -9963,8 +9895,9 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
 		 */
 		WARN_ON_ONCE(release_reference_state(state, id));
 
-	for (i = 0; i <= vstate->curframe; i++)
-		__mark_ptr_or_null_regs(vstate->frame[i], id, is_null);
+	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+		mark_ptr_or_null_reg(state, reg, id, is_null);
+	}));
 }
 
 static bool try_match_pkt_pointers(const struct bpf_insn *insn,
@@ -10077,23 +10010,11 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
 {
 	struct bpf_func_state *state;
 	struct bpf_reg_state *reg;
-	int i, j;
 
-	for (i = 0; i <= vstate->curframe; i++) {
-		state = vstate->frame[i];
-		for (j = 0; j < MAX_BPF_REG; j++) {
-			reg = &state->regs[j];
-			if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
-				*reg = *known_reg;
-		}
-
-		bpf_for_each_spilled_reg(j, state, reg) {
-			if (!reg)
-				continue;
-			if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
-				*reg = *known_reg;
-		}
-	}
+	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
+		if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+			*reg = *known_reg;
+	}));
 }
 
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
-- 
cgit v1.2.3


From fe32f366931a950889e8d72be86fafc867dab777 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Thu, 1 Sep 2022 12:43:08 -0700
Subject: EDAC/skx_common: Use driver decoder first

The performance of driver decoder[1] is better than the performance
of firmware decoder[2], especially on frequent correctable errors.

So use the driver decoder first, fall back to firmware decoder if
the driver decoder is unavailable. Also rename the function pointer
skx_decode to driver_decode (better name to contrast with adxl_decode).

[1] Decode errors by extracting error information from registers of
    memory controllers and/or MCA bank registers.

[2] Decode errors by calling ACPI DSM methods.

Co-developed-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220901194310.115427-1-tony.luck@intel.com/
---
 drivers/edac/skx_base.c   |  9 +++++++--
 drivers/edac/skx_common.c | 16 +++++++++-------
 drivers/edac/skx_common.h |  1 +
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 1abc020d49ab..7e2762f62eec 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -714,8 +714,13 @@ static int __init skx_init(void)
 
 	skx_set_decode(skx_decode, skx_show_retry_rd_err_log);
 
-	if (nvdimm_count && skx_adxl_get() == -ENODEV)
-		skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n");
+	if (nvdimm_count && skx_adxl_get() != -ENODEV) {
+		skx_set_decode(NULL, skx_show_retry_rd_err_log);
+	} else {
+		if (nvdimm_count)
+			skx_printk(KERN_NOTICE, "Only decoding DDR4 address!\n");
+		skx_set_decode(skx_decode, skx_show_retry_rd_err_log);
+	}
 
 	/* Ensure that the OPSTATE is set correctly for POLL or NMI */
 	opstate_init();
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 19c17c5198c5..9b10c359849b 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -40,7 +40,7 @@ static char *adxl_msg;
 static unsigned long adxl_nm_bitmap;
 
 static char skx_msg[MSG_SIZE];
-static skx_decode_f skx_decode;
+static skx_decode_f driver_decode;
 static skx_show_retry_log_f skx_show_retry_rd_err_log;
 static u64 skx_tolm, skx_tohm;
 static LIST_HEAD(dev_edac_list);
@@ -173,6 +173,8 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
 			break;
 	}
 
+	res->decoded_by_adxl = true;
+
 	return true;
 }
 
@@ -183,7 +185,7 @@ void skx_set_mem_cfg(bool mem_cfg_2lm)
 
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
 {
-	skx_decode = decode;
+	driver_decode = decode;
 	skx_show_retry_rd_err_log = show_retry_log;
 }
 
@@ -591,7 +593,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			break;
 		}
 	}
-	if (adxl_component_count) {
+	if (res->decoded_by_adxl) {
 		len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s",
 			 overflow ? " OVERFLOW" : "",
 			 (uncorrected_error && recoverable) ? " recoverable" : "",
@@ -651,11 +653,11 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	memset(&res, 0, sizeof(res));
 	res.addr = mce->addr;
 
-	if (adxl_component_count) {
-		if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))
+	/* Try driver decoder first */
+	if (!(driver_decode && driver_decode(&res))) {
+		/* Then try firmware decoder (ACPI DSM methods) */
+		if (!(adxl_component_count && skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce))))
 			return NOTIFY_DONE;
-	} else if (!skx_decode || !skx_decode(&res)) {
-		return NOTIFY_DONE;
 	}
 
 	mci = res.dev->imc[res.imc].mci;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 03ac067a80b9..880ecd15ca42 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -136,6 +136,7 @@ struct decoded_addr {
 	int	column;
 	int	bank_address;
 	int	bank_group;
+	bool	decoded_by_adxl;
 };
 
 struct res_config {
-- 
cgit v1.2.3


From 627d551a9e75ef81525822ba2a0d9d5a64791d89 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Thu, 1 Sep 2022 12:43:09 -0700
Subject: EDAC/skx_common: Make output format similar

The decoded output format of driver decoder is different from the
output format of firmware decoder. Make output format similar regardless
of decode function (Align driver decoder's to firmware decoder's).

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220901194310.115427-1-tony.luck@intel.com/
---
 drivers/edac/skx_common.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 9b10c359849b..16ca3de57c24 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -600,12 +600,12 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			 mscod, errcode, adxl_msg);
 	} else {
 		len = snprintf(skx_msg, MSG_SIZE,
-			 "%s%s err_code:0x%04x:0x%04x socket:%d imc:%d rank:%d bg:%d ba:%d row:0x%x col:0x%x",
+			 "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x",
 			 overflow ? " OVERFLOW" : "",
 			 (uncorrected_error && recoverable) ? " recoverable" : "",
 			 mscod, errcode,
 			 res->socket, res->imc, res->rank,
-			 res->bank_group, res->bank_address, res->row, res->column);
+			 res->row, res->column, res->bank_address, res->bank_group);
 	}
 
 	if (skx_show_retry_rd_err_log)
-- 
cgit v1.2.3


From 2738c69a8813453b35549465867ae591f8598eb0 Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Thu, 1 Sep 2022 12:43:10 -0700
Subject: EDAC/i10nm: Add driver decoder for Ice Lake and Tremont CPUs

Current i10nm_edac only supports firmware decoder (ACPI DSM methods).
MCA bank registers of Ice Lake or Tremont CPUs contain the information
to decode DDR memory errors. To get better decoding performance, add
the driver decoder (decoding DDR memory errors via extracting error
information from MCA bank registers) for Ice Lake and Tremont CPUs.

Co-developed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220901194310.115427-1-tony.luck@intel.com/
---
 arch/x86/include/asm/mce.h |   1 +
 drivers/edac/i10nm_base.c  | 134 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/edac/skx_common.c  |   1 +
 drivers/edac/skx_common.h  |   5 ++
 4 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cc73061e7255..6e986088817d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,7 @@
 #define MCI_STATUS_CEC_SHIFT	38           /* Corrected Error Count */
 #define MCI_STATUS_CEC_MASK	GENMASK_ULL(52,38)
 #define MCI_STATUS_CEC(c)	(((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT)
+#define MCI_STATUS_MSCOD(m)	(((m) >> 16) & 0xffff)
 
 /* AMD-specific bits */
 #define MCI_STATUS_TCC		BIT_ULL(55)  /* Task context corrupt */
diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 6cf50ee0b77c..817f618fcff0 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -74,6 +74,8 @@ static struct list_head *i10nm_edac_list;
 
 static struct res_config *res_cfg;
 static int retry_rd_err_log;
+static int decoding_via_mca;
+static bool mem_cfg_2lm;
 
 static u32 offsets_scrub_icx[]  = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
 static u32 offsets_scrub_spr[]  = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8};
@@ -231,6 +233,103 @@ static bool i10nm_check_2lm(struct res_config *cfg)
 	return false;
 }
 
+/*
+ * Check whether the error comes from DDRT by ICX/Tremont model specific error code.
+ * Refer to SDM vol3B 16.11.3 Intel IMC MC error codes for IA32_MCi_STATUS.
+ */
+static bool i10nm_mscod_is_ddrt(u32 mscod)
+{
+	switch (mscod) {
+	case 0x0106: case 0x0107:
+	case 0x0800: case 0x0804:
+	case 0x0806 ... 0x0808:
+	case 0x080a ... 0x080e:
+	case 0x0810: case 0x0811:
+	case 0x0816: case 0x081e:
+	case 0x081f:
+		return true;
+	}
+
+	return false;
+}
+
+static bool i10nm_mc_decode_available(struct mce *mce)
+{
+	u8 bank;
+
+	if (!decoding_via_mca || mem_cfg_2lm)
+		return false;
+
+	if ((mce->status & (MCI_STATUS_MISCV | MCI_STATUS_ADDRV))
+			!= (MCI_STATUS_MISCV | MCI_STATUS_ADDRV))
+		return false;
+
+	bank = mce->bank;
+
+	switch (res_cfg->type) {
+	case I10NM:
+		if (bank < 13 || bank > 26)
+			return false;
+
+		/* DDRT errors can't be decoded from MCA bank registers */
+		if (MCI_MISC_ECC_MODE(mce->misc) == MCI_MISC_ECC_DDRT)
+			return false;
+
+		if (i10nm_mscod_is_ddrt(MCI_STATUS_MSCOD(mce->status)))
+			return false;
+
+		/* Check whether one of {13,14,17,18,21,22,25,26} */
+		return ((bank - 13) & BIT(1)) == 0;
+	default:
+		return false;
+	}
+}
+
+static bool i10nm_mc_decode(struct decoded_addr *res)
+{
+	struct mce *m = res->mce;
+	struct skx_dev *d;
+	u8 bank;
+
+	if (!i10nm_mc_decode_available(m))
+		return false;
+
+	list_for_each_entry(d, i10nm_edac_list, list) {
+		if (d->imc[0].src_id == m->socketid) {
+			res->socket = m->socketid;
+			res->dev = d;
+			break;
+		}
+	}
+
+	switch (res_cfg->type) {
+	case I10NM:
+		bank = m->bank - 13;
+		res->imc = bank / 4;
+		res->channel = bank % 2;
+		break;
+	default:
+		return false;
+	}
+
+	if (!res->dev) {
+		skx_printk(KERN_ERR, "No device for src_id %d imc %d\n",
+			   m->socketid, res->imc);
+		return false;
+	}
+
+	res->column       = GET_BITFIELD(m->misc, 9, 18) << 2;
+	res->row          = GET_BITFIELD(m->misc, 19, 39);
+	res->bank_group   = GET_BITFIELD(m->misc, 40, 41);
+	res->bank_address = GET_BITFIELD(m->misc, 42, 43);
+	res->bank_group  |= GET_BITFIELD(m->misc, 44, 44) << 2;
+	res->rank         = GET_BITFIELD(m->misc, 56, 58);
+	res->dimm         = res->rank >> 2;
+	res->rank         = res->rank % 4;
+
+	return true;
+}
+
 static int i10nm_get_ddr_munits(void)
 {
 	struct pci_dev *mdev;
@@ -574,7 +673,8 @@ static int __init i10nm_init(void)
 		return -ENODEV;
 	}
 
-	skx_set_mem_cfg(i10nm_check_2lm(cfg));
+	mem_cfg_2lm = i10nm_check_2lm(cfg);
+	skx_set_mem_cfg(mem_cfg_2lm);
 
 	rc = i10nm_get_ddr_munits();
 
@@ -626,9 +726,11 @@ static int __init i10nm_init(void)
 	setup_i10nm_debug();
 
 	if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
-		skx_set_decode(NULL, show_retry_rd_err_log);
+		skx_set_decode(i10nm_mc_decode, show_retry_rd_err_log);
 		if (retry_rd_err_log == 2)
 			enable_retry_rd_err_log(true);
+	} else {
+		skx_set_decode(i10nm_mc_decode, NULL);
 	}
 
 	i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION);
@@ -658,6 +760,34 @@ static void __exit i10nm_exit(void)
 module_init(i10nm_init);
 module_exit(i10nm_exit);
 
+static int set_decoding_via_mca(const char *buf, const struct kernel_param *kp)
+{
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 0, &val);
+
+	if (ret || val > 1)
+		return -EINVAL;
+
+	if (val && mem_cfg_2lm) {
+		i10nm_printk(KERN_NOTICE, "Decoding errors via MCA banks for 2LM isn't supported yet\n");
+		return -EIO;
+	}
+
+	ret = param_set_int(buf, kp);
+
+	return ret;
+}
+
+static const struct kernel_param_ops decoding_via_mca_param_ops = {
+	.set = set_decoding_via_mca,
+	.get = param_get_int,
+};
+
+module_param_cb(decoding_via_mca, &decoding_via_mca_param_ops, &decoding_via_mca, 0644);
+MODULE_PARM_DESC(decoding_via_mca, "decoding_via_mca: 0=off(default), 1=enable");
+
 module_param(retry_rd_err_log, int, 0444);
 MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)");
 
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 16ca3de57c24..7276ce3a33e1 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -651,6 +651,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 		return NOTIFY_DONE;
 
 	memset(&res, 0, sizeof(res));
+	res.mce  = mce;
 	res.addr = mce->addr;
 
 	/* Try driver decoder first */
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 880ecd15ca42..c542f1562825 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -10,6 +10,7 @@
 #define _SKX_COMM_EDAC_H
 
 #include <linux/bits.h>
+#include <asm/mce.h>
 
 #define MSG_SIZE		1024
 
@@ -52,6 +53,9 @@
 #define IS_DIMM_PRESENT(r)		GET_BITFIELD(r, 15, 15)
 #define IS_NVDIMM_PRESENT(r, i)		GET_BITFIELD(r, i, i)
 
+#define MCI_MISC_ECC_MODE(m)	(((m) >> 59) & 15)
+#define MCI_MISC_ECC_DDRT	8	/* read from DDRT */
+
 /*
  * Each cpu socket contains some pci devices that provide global
  * information, and also some that are local to each of the two
@@ -120,6 +124,7 @@ enum {
 #define BIT_NM_DIMM	BIT_ULL(INDEX_NM_DIMM)
 
 struct decoded_addr {
+	struct mce *mce;
 	struct skx_dev *dev;
 	u64	addr;
 	int	socket;
-- 
cgit v1.2.3


From 2a40f883781d6cbbf547ed13b0cec2f9808d839d Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 8 Sep 2022 13:57:50 -0700
Subject: Bluetooth: Fix HCIGETDEVINFO regression

Recent changes breaks HCIGETDEVINFO since it changes the size of
hci_dev_info.

Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections")
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_sock.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h
index 0520e21ab698..9949870f7d78 100644
--- a/include/net/bluetooth/hci_sock.h
+++ b/include/net/bluetooth/hci_sock.h
@@ -124,8 +124,6 @@ struct hci_dev_info {
 	__u16 acl_pkts;
 	__u16 sco_mtu;
 	__u16 sco_pkts;
-	__u16 iso_mtu;
-	__u16 iso_pkts;
 
 	struct hci_dev_stats stat;
 };
-- 
cgit v1.2.3


From 665f5d3577ef43e929d59cf39683037887c351bf Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 9 Sep 2022 16:15:11 +0200
Subject: libbpf: Remove gcc support for bpf_tail_call_static for now

This reverts commit 14e5ce79943a ("libbpf: Add GCC support for
bpf_tail_call_static"). Reason is that gcc invented their own BPF asm
which is not conform with LLVM one, and going forward this would be
more painful to maintain here and in other areas of the library. Thus
remove it; ask to gcc folks is to align with LLVM one to use exact
same syntax.

Fixes: 14e5ce79943a ("libbpf: Add GCC support for bpf_tail_call_static")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: James Hilliard <james.hilliard1@gmail.com>
Cc: Jose E. Marchesi <jose.marchesi@oracle.com>
---
 tools/lib/bpf/bpf_helpers.h | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 867b734839dd..7349b16b8e2f 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -131,7 +131,7 @@
 /*
  * Helper function to perform a tail call with a constant/immediate map slot.
  */
-#if (!defined(__clang__) || __clang_major__ >= 8) && defined(__bpf__)
+#if __clang_major__ >= 8 && defined(__bpf__)
 static __always_inline void
 bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
 {
@@ -139,8 +139,8 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
 		__bpf_unreachable();
 
 	/*
-	 * Provide a hard guarantee that the compiler won't optimize setting r2
-	 * (map pointer) and r3 (constant map index) from _different paths_ ending
+	 * Provide a hard guarantee that LLVM won't optimize setting r2 (map
+	 * pointer) and r3 (constant map index) from _different paths_ ending
 	 * up at the _same_ call insn as otherwise we won't be able to use the
 	 * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel
 	 * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key
@@ -148,19 +148,12 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
 	 *
 	 * Note on clobber list: we need to stay in-line with BPF calling
 	 * convention, so even if we don't end up using r0, r4, r5, we need
-	 * to mark them as clobber so that the compiler doesn't end up using
-	 * them before / after the call.
+	 * to mark them as clobber so that LLVM doesn't end up using them
+	 * before / after the call.
 	 */
-	asm volatile(
-#ifdef __clang__
-		     "r1 = %[ctx]\n\t"
+	asm volatile("r1 = %[ctx]\n\t"
 		     "r2 = %[map]\n\t"
 		     "r3 = %[slot]\n\t"
-#else
-		     "mov %%r1,%[ctx]\n\t"
-		     "mov %%r2,%[map]\n\t"
-		     "mov %%r3,%[slot]\n\t"
-#endif
 		     "call 12"
 		     :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot)
 		     : "r0", "r1", "r2", "r3", "r4", "r5");
-- 
cgit v1.2.3


From 0ffe2412531e95a309d7f0bfe985fc4ca4d39de8 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Fri, 9 Sep 2022 00:49:39 +0000
Subject: bpf: Invoke cgroup/connect{4,6} programs for unprivileged ICMP ping

Usually when a TCP/UDP connection is initiated, we can bind the socket
to a specific IP attached to an interface in a cgroup/connect hook.
But for pings, this is impossible, as the hook is not being called.

This adds the hook invocation to unprivileged ICMP ping (i.e. ping
sockets created with SOCK_DGRAM IPPROTO_ICMP(V6) as opposed to
SOCK_RAW. Logic is mirrored from UDP sockets where the hook is invoked
during pre_connect, after a check for suficiently sized addr_len.

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Link: https://lore.kernel.org/r/5764914c252fad4cd134fb6664c6ede95f409412.1662682323.git.zhuyifei@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 net/ipv4/ping.c | 15 +++++++++++++++
 net/ipv6/ping.c | 16 ++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index b83c2bd9d722..517042caf6dc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -33,6 +33,7 @@
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
 #include <linux/export.h>
+#include <linux/bpf-cgroup.h>
 #include <net/sock.h>
 #include <net/ping.h>
 #include <net/udp.h>
@@ -295,6 +296,19 @@ void ping_close(struct sock *sk, long timeout)
 }
 EXPORT_SYMBOL_GPL(ping_close);
 
+static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			    int addr_len)
+{
+	/* This check is replicated from __ip4_datagram_connect() and
+	 * intended to prevent BPF program called below from accessing bytes
+	 * that are out of the bound specified by user in addr_len.
+	 */
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+}
+
 /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
 static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
 				struct sockaddr *uaddr, int addr_len)
@@ -1009,6 +1023,7 @@ struct proto ping_prot = {
 	.owner =	THIS_MODULE,
 	.init =		ping_init_sock,
 	.close =	ping_close,
+	.pre_connect =	ping_pre_connect,
 	.connect =	ip4_datagram_connect,
 	.disconnect =	__udp_disconnect,
 	.setsockopt =	ip_setsockopt,
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 91b840514656..5f2ef8493714 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -20,6 +20,7 @@
 #include <net/udp.h>
 #include <net/transp_v6.h>
 #include <linux/proc_fs.h>
+#include <linux/bpf-cgroup.h>
 #include <net/ping.h>
 
 static void ping_v6_destroy(struct sock *sk)
@@ -49,6 +50,20 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 	return 0;
 }
 
+static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			       int addr_len)
+{
+	/* This check is replicated from __ip6_datagram_connect() and
+	 * intended to prevent BPF program called below from accessing
+	 * bytes that are out of the bound specified by user in addr_len.
+	 */
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+}
+
 static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -191,6 +206,7 @@ struct proto pingv6_prot = {
 	.init =		ping_init_sock,
 	.close =	ping_close,
 	.destroy =	ping_v6_destroy,
+	.pre_connect =	ping_v6_pre_connect,
 	.connect =	ip6_datagram_connect_v6_only,
 	.disconnect =	__udp_disconnect,
 	.setsockopt =	ipv6_setsockopt,
-- 
cgit v1.2.3


From e42921c3c346b1b49068af3f3881322081e1dddd Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Fri, 9 Sep 2022 00:49:40 +0000
Subject: selftests/bpf: Deduplicate write_sysctl() to test_progs.c

This helper is needed in multiple tests. Instead of copying it over
and over, better to deduplicate this helper to test_progs.c.

test_progs.c is chosen over testing_helpers.c because of this helper's
use of CHECK / ASSERT_*, and the CHECK was modified to use ASSERT_*
so it does not rely on a duration variable.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Link: https://lore.kernel.org/r/9b4fc9a27bd52f771b657b4c4090fc8d61f3a6b5.1662682323.git.zhuyifei@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/btf_skc_cls_ingress.c   | 20 --------------------
 .../selftests/bpf/prog_tests/tcp_hdr_options.c       | 20 --------------------
 tools/testing/selftests/bpf/test_progs.c             | 17 +++++++++++++++++
 tools/testing/selftests/bpf/test_progs.h             |  1 +
 4 files changed, 18 insertions(+), 40 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
index 664ffc0364f4..7a277035c275 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -22,26 +22,6 @@ static __u32 duration;
 
 #define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress"
 
-static int write_sysctl(const char *sysctl, const char *value)
-{
-	int fd, err, len;
-
-	fd = open(sysctl, O_WRONLY);
-	if (CHECK(fd == -1, "open sysctl", "open(%s): %s (%d)\n",
-		  sysctl, strerror(errno), errno))
-		return -1;
-
-	len = strlen(value);
-	err = write(fd, value, len);
-	close(fd);
-	if (CHECK(err != len, "write sysctl",
-		  "write(%s, %s, %d): err:%d %s (%d)\n",
-		  sysctl, value, len, err, strerror(errno), errno))
-		return -1;
-
-	return 0;
-}
-
 static int prepare_netns(void)
 {
 	if (CHECK(unshare(CLONE_NEWNET), "create netns",
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
index 1fa772079967..f24436d33cd6 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
@@ -54,26 +54,6 @@ static int create_netns(void)
 	return 0;
 }
 
-static int write_sysctl(const char *sysctl, const char *value)
-{
-	int fd, err, len;
-
-	fd = open(sysctl, O_WRONLY);
-	if (CHECK(fd == -1, "open sysctl", "open(%s): %s (%d)\n",
-		  sysctl, strerror(errno), errno))
-		return -1;
-
-	len = strlen(value);
-	err = write(fd, value, len);
-	close(fd);
-	if (CHECK(err != len, "write sysctl",
-		  "write(%s, %s): err:%d %s (%d)\n",
-		  sysctl, value, err, strerror(errno), errno))
-		return -1;
-
-	return 0;
-}
-
 static void print_hdr_stg(const struct hdr_stg *hdr_stg, const char *prefix)
 {
 	fprintf(stderr, "%s{active:%u, resend_syn:%u, syncookie:%u, fastopen:%u}\n",
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 3561c97701f2..0e9a47f97890 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -943,6 +943,23 @@ int trigger_module_test_write(int write_sz)
 	return 0;
 }
 
+int write_sysctl(const char *sysctl, const char *value)
+{
+	int fd, err, len;
+
+	fd = open(sysctl, O_WRONLY);
+	if (!ASSERT_NEQ(fd, -1, "open sysctl"))
+		return -1;
+
+	len = strlen(value);
+	err = write(fd, value, len);
+	close(fd);
+	if (!ASSERT_EQ(err, len, "write sysctl"))
+		return -1;
+
+	return 0;
+}
+
 #define MAX_BACKTRACE_SZ 128
 void crash_handler(int signum)
 {
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
index 5fe1365c2bb1..b090996daee5 100644
--- a/tools/testing/selftests/bpf/test_progs.h
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -384,6 +384,7 @@ int extract_build_id(char *build_id, size_t size);
 int kern_sync_rcu(void);
 int trigger_module_test_read(int read_sz);
 int trigger_module_test_write(int write_sz);
+int write_sysctl(const char *sysctl, const char *value);
 
 #ifdef __x86_64__
 #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep"
-- 
cgit v1.2.3


From 58c449a96946929467b537589c8a23f11e04af39 Mon Sep 17 00:00:00 2001
From: YiFei Zhu <zhuyifei@google.com>
Date: Fri, 9 Sep 2022 00:49:41 +0000
Subject: selftests/bpf: Ensure cgroup/connect{4,6} programs can bind unpriv
 ICMP ping

This tests that when an unprivileged ICMP ping socket connects,
the hooks are actually invoked. We also ensure that if the hook does
not call bpf_bind(), the bound address is unmodified, and if the
hook calls bpf_bind(), the bound address is exactly what we provided
to the helper.

A new netns is used to enable ping_group_range in the test without
affecting ouside of the test, because by default, not even root is
permitted to use unprivileged ICMP ping...

Signed-off-by: YiFei Zhu <zhuyifei@google.com>
Link: https://lore.kernel.org/r/086b227c1b97f4e94193e58aae7576d0261b68a4.1662682323.git.zhuyifei@google.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/prog_tests/connect_ping.c        | 178 +++++++++++++++++++++
 tools/testing/selftests/bpf/progs/connect_ping.c   |  53 ++++++
 2 files changed, 231 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_ping.c
 create mode 100644 tools/testing/selftests/bpf/progs/connect_ping.c

diff --git a/tools/testing/selftests/bpf/prog_tests/connect_ping.c b/tools/testing/selftests/bpf/prog_tests/connect_ping.c
new file mode 100644
index 000000000000..289218c2216c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/connect_ping.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2022 Google LLC.
+ */
+
+#define _GNU_SOURCE
+#include <sys/mount.h>
+
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+
+#include "connect_ping.skel.h"
+
+/* 2001:db8::1 */
+#define BINDADDR_V6 { { { 0x20,0x01,0x0d,0xb8,0,0,0,0,0,0,0,0,0,0,0,1 } } }
+static const struct in6_addr bindaddr_v6 = BINDADDR_V6;
+
+static void subtest(int cgroup_fd, struct connect_ping *skel,
+		    int family, int do_bind)
+{
+	struct sockaddr_in sa4 = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
+	struct sockaddr_in6 sa6 = {
+		.sin6_family = AF_INET6,
+		.sin6_addr = IN6ADDR_LOOPBACK_INIT,
+	};
+	struct sockaddr *sa;
+	socklen_t sa_len;
+	int protocol;
+	int sock_fd;
+
+	switch (family) {
+	case AF_INET:
+		sa = (struct sockaddr *)&sa4;
+		sa_len = sizeof(sa4);
+		protocol = IPPROTO_ICMP;
+		break;
+	case AF_INET6:
+		sa = (struct sockaddr *)&sa6;
+		sa_len = sizeof(sa6);
+		protocol = IPPROTO_ICMPV6;
+		break;
+	}
+
+	memset(skel->bss, 0, sizeof(*skel->bss));
+	skel->bss->do_bind = do_bind;
+
+	sock_fd = socket(family, SOCK_DGRAM, protocol);
+	if (!ASSERT_GE(sock_fd, 0, "sock-create"))
+		return;
+
+	if (!ASSERT_OK(connect(sock_fd, sa, sa_len), "connect"))
+		goto close_sock;
+
+	if (!ASSERT_EQ(skel->bss->invocations_v4, family == AF_INET ? 1 : 0,
+		       "invocations_v4"))
+		goto close_sock;
+	if (!ASSERT_EQ(skel->bss->invocations_v6, family == AF_INET6 ? 1 : 0,
+		       "invocations_v6"))
+		goto close_sock;
+	if (!ASSERT_EQ(skel->bss->has_error, 0, "has_error"))
+		goto close_sock;
+
+	if (!ASSERT_OK(getsockname(sock_fd, sa, &sa_len),
+		       "getsockname"))
+		goto close_sock;
+
+	switch (family) {
+	case AF_INET:
+		if (!ASSERT_EQ(sa4.sin_family, family, "sin_family"))
+			goto close_sock;
+		if (!ASSERT_EQ(sa4.sin_addr.s_addr,
+			       htonl(do_bind ? 0x01010101 : INADDR_LOOPBACK),
+			       "sin_addr"))
+			goto close_sock;
+		break;
+	case AF_INET6:
+		if (!ASSERT_EQ(sa6.sin6_family, AF_INET6, "sin6_family"))
+			goto close_sock;
+		if (!ASSERT_EQ(memcmp(&sa6.sin6_addr,
+				      do_bind ? &bindaddr_v6 : &in6addr_loopback,
+				      sizeof(sa6.sin6_addr)),
+			       0, "sin6_addr"))
+			goto close_sock;
+		break;
+	}
+
+close_sock:
+	close(sock_fd);
+}
+
+void test_connect_ping(void)
+{
+	struct connect_ping *skel;
+	int cgroup_fd;
+
+	if (!ASSERT_OK(unshare(CLONE_NEWNET | CLONE_NEWNS), "unshare"))
+		return;
+
+	/* overmount sysfs, and making original sysfs private so overmount
+	 * does not propagate to other mntns.
+	 */
+	if (!ASSERT_OK(mount("none", "/sys", NULL, MS_PRIVATE, NULL),
+		       "remount-private-sys"))
+		return;
+	if (!ASSERT_OK(mount("sysfs", "/sys", "sysfs", 0, NULL),
+		       "mount-sys"))
+		return;
+	if (!ASSERT_OK(mount("bpffs", "/sys/fs/bpf", "bpf", 0, NULL),
+		       "mount-bpf"))
+		goto clean_mount;
+
+	if (!ASSERT_OK(system("ip link set dev lo up"), "lo-up"))
+		goto clean_mount;
+	if (!ASSERT_OK(system("ip addr add 1.1.1.1 dev lo"), "lo-addr-v4"))
+		goto clean_mount;
+	if (!ASSERT_OK(system("ip -6 addr add 2001:db8::1 dev lo"), "lo-addr-v6"))
+		goto clean_mount;
+	if (write_sysctl("/proc/sys/net/ipv4/ping_group_range", "0 0"))
+		goto clean_mount;
+
+	cgroup_fd = test__join_cgroup("/connect_ping");
+	if (!ASSERT_GE(cgroup_fd, 0, "cg-create"))
+		goto clean_mount;
+
+	skel = connect_ping__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel-load"))
+		goto close_cgroup;
+	skel->links.connect_v4_prog =
+		bpf_program__attach_cgroup(skel->progs.connect_v4_prog, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.connect_v4_prog, "cg-attach-v4"))
+		goto skel_destroy;
+	skel->links.connect_v6_prog =
+		bpf_program__attach_cgroup(skel->progs.connect_v6_prog, cgroup_fd);
+	if (!ASSERT_OK_PTR(skel->links.connect_v6_prog, "cg-attach-v6"))
+		goto skel_destroy;
+
+	/* Connect a v4 ping socket to localhost, assert that only v4 is called,
+	 * and called exactly once, and that the socket's bound address is
+	 * original loopback address.
+	 */
+	if (test__start_subtest("ipv4"))
+		subtest(cgroup_fd, skel, AF_INET, 0);
+
+	/* Connect a v4 ping socket to localhost, assert that only v4 is called,
+	 * and called exactly once, and that the socket's bound address is
+	 * address we explicitly bound.
+	 */
+	if (test__start_subtest("ipv4-bind"))
+		subtest(cgroup_fd, skel, AF_INET, 1);
+
+	/* Connect a v6 ping socket to localhost, assert that only v6 is called,
+	 * and called exactly once, and that the socket's bound address is
+	 * original loopback address.
+	 */
+	if (test__start_subtest("ipv6"))
+		subtest(cgroup_fd, skel, AF_INET6, 0);
+
+	/* Connect a v6 ping socket to localhost, assert that only v6 is called,
+	 * and called exactly once, and that the socket's bound address is
+	 * address we explicitly bound.
+	 */
+	if (test__start_subtest("ipv6-bind"))
+		subtest(cgroup_fd, skel, AF_INET6, 1);
+
+skel_destroy:
+	connect_ping__destroy(skel);
+
+close_cgroup:
+	close(cgroup_fd);
+
+clean_mount:
+	umount2("/sys", MNT_DETACH);
+}
diff --git a/tools/testing/selftests/bpf/progs/connect_ping.c b/tools/testing/selftests/bpf/progs/connect_ping.c
new file mode 100644
index 000000000000..60178192b672
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/connect_ping.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2022 Google LLC.
+ */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+
+/* 2001:db8::1 */
+#define BINDADDR_V6 { { { 0x20,0x01,0x0d,0xb8,0,0,0,0,0,0,0,0,0,0,0,1 } } }
+
+__u32 do_bind = 0;
+__u32 has_error = 0;
+__u32 invocations_v4 = 0;
+__u32 invocations_v6 = 0;
+
+SEC("cgroup/connect4")
+int connect_v4_prog(struct bpf_sock_addr *ctx)
+{
+	struct sockaddr_in sa = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = bpf_htonl(0x01010101),
+	};
+
+	__sync_fetch_and_add(&invocations_v4, 1);
+
+	if (do_bind && bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
+		has_error = 1;
+
+	return 1;
+}
+
+SEC("cgroup/connect6")
+int connect_v6_prog(struct bpf_sock_addr *ctx)
+{
+	struct sockaddr_in6 sa = {
+		.sin6_family = AF_INET6,
+		.sin6_addr = BINDADDR_V6,
+	};
+
+	__sync_fetch_and_add(&invocations_v6, 1);
+
+	if (do_bind && bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
+		has_error = 1;
+
+	return 1;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3


From a614ee94a8497c43015b3de662020def93295ddd Mon Sep 17 00:00:00 2001
From: Chris Lu <chris.lu@mediatek.com>
Date: Tue, 30 Aug 2022 06:55:58 +0800
Subject: Bluetooth: btusb: Add a new PID/VID 13d3/3578 for MT7921

Add VID 13D3 & PID 3578 for MediaTek MT7921 USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed as the below.

T:  Bus=03 Lev=01 Prnt=01 Port=07 Cnt=03 Dev#=  5 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3578 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us

Co-developed-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Chris Lu <chris.lu@mediatek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index ef7726a3e701..c3daba17de7f 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -485,6 +485,9 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x13d3, 0x3567), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
+	{ USB_DEVICE(0x13d3, 0x3578), .driver_info = BTUSB_MEDIATEK |
+						     BTUSB_WIDEBAND_SPEECH |
+						     BTUSB_VALID_LE_STATES },
 	{ USB_DEVICE(0x0489, 0xe0cd), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
-- 
cgit v1.2.3


From b43331b42e4453fe8b210d372d602e2025276419 Mon Sep 17 00:00:00 2001
From: Kiran K <kiran.k@intel.com>
Date: Fri, 19 Aug 2022 13:18:15 +0530
Subject: Bluetooth: btintel: Add support for Magnetor

Hardware variant for Magnetor core (CNVi) is added.

Signed-off-by: Kiran K <kiran.k@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btintel.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
index 818681c89db8..37ec17508ff6 100644
--- a/drivers/bluetooth/btintel.c
+++ b/drivers/bluetooth/btintel.c
@@ -449,6 +449,7 @@ static int btintel_version_info_tlv(struct hci_dev *hdev,
 	case 0x17:	/* TyP */
 	case 0x18:	/* Slr */
 	case 0x19:	/* Slr-F */
+	case 0x1b:      /* Mgr */
 		break;
 	default:
 		bt_dev_err(hdev, "Unsupported Intel hardware variant (0x%x)",
@@ -2330,6 +2331,7 @@ static void btintel_set_msft_opcode(struct hci_dev *hdev, u8 hw_variant)
 	case 0x17:
 	case 0x18:
 	case 0x19:
+	case 0x1b:
 		hci_set_msft_opcode(hdev, 0xFC1E);
 		break;
 	default:
@@ -2542,6 +2544,7 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 	case 0x17:
 	case 0x18:
 	case 0x19:
+	case 0x1b:
 		/* Display version information of TLV type */
 		btintel_version_info_tlv(hdev, &ver_tlv);
 
-- 
cgit v1.2.3


From dd0a1794f4334ddbf9b7c5e7d642aaffff38c69b Mon Sep 17 00:00:00 2001
From: Kiran K <kiran.k@intel.com>
Date: Wed, 7 Sep 2022 12:49:45 +0530
Subject: Bluetooth: btintel: Mark Intel controller to support LE_STATES quirk

HarrrisonPeak, CyclonePeak, SnowFieldPeak and SandyPeak controllers
are marked to support HCI_QUIRK_LE_STATES.

Signed-off-by: Kiran K <kiran.k@intel.com>
Signed-off-by: Chethan T N <chethan.tumkur.narayan@intel.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btintel.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c
index 37ec17508ff6..a657e9a3e96a 100644
--- a/drivers/bluetooth/btintel.c
+++ b/drivers/bluetooth/btintel.c
@@ -2441,15 +2441,20 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 					       INTEL_ROM_LEGACY_NO_WBS_SUPPORT))
 				set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
 					&hdev->quirks);
+			if (ver.hw_variant == 0x08 && ver.fw_variant == 0x22)
+				set_bit(HCI_QUIRK_VALID_LE_STATES,
+					&hdev->quirks);
 
 			err = btintel_legacy_rom_setup(hdev, &ver);
 			break;
 		case 0x0b:      /* SfP */
-		case 0x0c:      /* WsP */
 		case 0x11:      /* JfP */
 		case 0x12:      /* ThP */
 		case 0x13:      /* HrP */
 		case 0x14:      /* CcP */
+			set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks);
+			fallthrough;
+		case 0x0c:	/* WsP */
 			/* Apply the device specific HCI quirks
 			 *
 			 * All Legacy bootloader devices support WBS
@@ -2457,11 +2462,6 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 			set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
 				&hdev->quirks);
 
-			/* Valid LE States quirk for JfP/ThP familiy */
-			if (ver.hw_variant == 0x11 || ver.hw_variant == 0x12)
-				set_bit(HCI_QUIRK_VALID_LE_STATES,
-					&hdev->quirks);
-
 			/* Setup MSFT Extension support */
 			btintel_set_msft_opcode(hdev, ver.hw_variant);
 
@@ -2532,9 +2532,8 @@ static int btintel_setup_combined(struct hci_dev *hdev)
 		 */
 		set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks);
 
-		/* Valid LE States quirk for JfP/ThP familiy */
-		if (ver.hw_variant == 0x11 || ver.hw_variant == 0x12)
-			set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks);
+		/* Set Valid LE States quirk */
+		set_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks);
 
 		/* Setup MSFT Extension support */
 		btintel_set_msft_opcode(hdev, ver.hw_variant);
-- 
cgit v1.2.3


From 211391bf04b3c74e250c566eeff9cf808156c693 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Mon, 5 Sep 2022 14:34:12 +0200
Subject: ACPI: tables: FPDT: Don't call acpi_os_map_memory() on invalid phys
 address

On a Packard Bell Dot SC (Intel Atom N2600 model) there is a FPDT table
which contains invalid physical addresses, with high bits set which fall
outside the range of the CPU-s supported physical address range.

Calling acpi_os_map_memory() on such an invalid phys address leads to
the below WARN_ON in ioremap triggering resulting in an oops/stacktrace.

Add code to verify the physical address before calling acpi_os_map_memory()
to fix / avoid the oops.

[    1.226900] ioremap: invalid physical address 3001000000000000
[    1.226949] ------------[ cut here ]------------
[    1.226962] WARNING: CPU: 1 PID: 1 at arch/x86/mm/ioremap.c:200 __ioremap_caller.cold+0x43/0x5f
[    1.226996] Modules linked in:
[    1.227016] CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc3+ #490
[    1.227029] Hardware name: Packard Bell dot s/SJE01_CT, BIOS V1.10 07/23/2013
[    1.227038] RIP: 0010:__ioremap_caller.cold+0x43/0x5f
[    1.227054] Code: 96 00 00 e9 f8 af 24 ff 89 c6 48 c7 c7 d8 0c 84 99 e8 6a 96 00 00 e9 76 af 24 ff 48 89 fe 48 c7 c7 a8 0c 84 99 e8 56 96 00 00 <0f> 0b e9 60 af 24 ff 48 8b 34 24 48 c7 c7 40 0d 84 99 e8 3f 96 00
[    1.227067] RSP: 0000:ffffb18c40033d60 EFLAGS: 00010286
[    1.227084] RAX: 0000000000000032 RBX: 3001000000000000 RCX: 0000000000000000
[    1.227095] RDX: 0000000000000001 RSI: 00000000ffffdfff RDI: 00000000ffffffff
[    1.227105] RBP: 3001000000000000 R08: 0000000000000000 R09: ffffb18c40033c18
[    1.227115] R10: 0000000000000003 R11: ffffffff99d62fe8 R12: 0000000000000008
[    1.227124] R13: 0003001000000000 R14: 0000000000001000 R15: 3001000000000000
[    1.227135] FS:  0000000000000000(0000) GS:ffff913a3c080000(0000) knlGS:0000000000000000
[    1.227146] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    1.227156] CR2: 0000000000000000 CR3: 0000000018c26000 CR4: 00000000000006e0
[    1.227167] Call Trace:
[    1.227176]  <TASK>
[    1.227185]  ? acpi_os_map_iomem+0x1c9/0x1e0
[    1.227215]  ? kmem_cache_alloc_trace+0x187/0x370
[    1.227254]  acpi_os_map_iomem+0x1c9/0x1e0
[    1.227288]  acpi_init_fpdt+0xa8/0x253
[    1.227308]  ? acpi_debugfs_init+0x1f/0x1f
[    1.227339]  do_one_initcall+0x5a/0x300
[    1.227406]  ? rcu_read_lock_sched_held+0x3f/0x80
[    1.227442]  kernel_init_freeable+0x28b/0x2cc
[    1.227512]  ? rest_init+0x170/0x170
[    1.227538]  kernel_init+0x16/0x140
[    1.227552]  ret_from_fork+0x1f/0x30
[    1.227639]  </TASK>
[    1.227647] irq event stamp: 186819
[    1.227656] hardirqs last  enabled at (186825): [<ffffffff98184a6e>] __up_console_sem+0x5e/0x70
[    1.227672] hardirqs last disabled at (186830): [<ffffffff98184a53>] __up_console_sem+0x43/0x70
[    1.227686] softirqs last  enabled at (186576): [<ffffffff980fbc9d>] __irq_exit_rcu+0xed/0x160
[    1.227701] softirqs last disabled at (186569): [<ffffffff980fbc9d>] __irq_exit_rcu+0xed/0x160
[    1.227715] ---[ end trace 0000000000000000 ]---

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_fpdt.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/acpi/acpi_fpdt.c b/drivers/acpi/acpi_fpdt.c
index 6922a44b3ce7..a2056c4c8cb7 100644
--- a/drivers/acpi/acpi_fpdt.c
+++ b/drivers/acpi/acpi_fpdt.c
@@ -143,6 +143,23 @@ static const struct attribute_group boot_attr_group = {
 
 static struct kobject *fpdt_kobj;
 
+#if defined CONFIG_X86 && defined CONFIG_PHYS_ADDR_T_64BIT
+#include <linux/processor.h>
+static bool fpdt_address_valid(u64 address)
+{
+	/*
+	 * On some systems the table contains invalid addresses
+	 * with unsuppored high address bits set, check for this.
+	 */
+	return !(address >> boot_cpu_data.x86_phys_bits);
+}
+#else
+static bool fpdt_address_valid(u64 address)
+{
+	return true;
+}
+#endif
+
 static int fpdt_process_subtable(u64 address, u32 subtable_type)
 {
 	struct fpdt_subtable_header *subtable_header;
@@ -151,6 +168,11 @@ static int fpdt_process_subtable(u64 address, u32 subtable_type)
 	u32 length, offset;
 	int result;
 
+	if (!fpdt_address_valid(address)) {
+		pr_info(FW_BUG "invalid physical address: 0x%llx!\n", address);
+		return -EINVAL;
+	}
+
 	subtable_header = acpi_os_map_memory(address, sizeof(*subtable_header));
 	if (!subtable_header)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 6d2b5a1cf4dd943909a927133664ad1627783d95 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Tue, 16 Aug 2022 13:16:25 +0300
Subject: ACPI: resource: Filter out the non memory resources in is_memory()

This will generalise the function so it should become
useful in more places.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/resource.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 510cdec375c4..e644e90d1884 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -690,6 +690,9 @@ static int is_memory(struct acpi_resource *ares, void *not_used)
 
 	memset(&win, 0, sizeof(win));
 
+	if (acpi_dev_filter_resource_type(ares, IORESOURCE_MEM))
+		return 1;
+
 	return !(acpi_dev_resource_memory(ares, res)
 	       || acpi_dev_resource_address_space(ares, &win)
 	       || acpi_dev_resource_ext_address_space(ares, &win));
-- 
cgit v1.2.3


From 6bb057bfd9d509755349cd2a6ca5e5e6e6071304 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Tue, 16 Aug 2022 13:16:26 +0300
Subject: ACPI: resource: Add helper function acpi_dev_get_memory_resources()

Wrapper function that finds all memory type resources by
using acpi_dev_get_resources(). It removes the need for the
drivers to check the resource data type separately.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/resource.c | 17 +++++++++++++++++
 include/linux/acpi.h    |  1 +
 2 files changed, 18 insertions(+)

diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index e644e90d1884..8032d50ca944 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -721,6 +721,23 @@ int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list)
 }
 EXPORT_SYMBOL_GPL(acpi_dev_get_dma_resources);
 
+/**
+ * acpi_dev_get_memory_resources - Get current memory resources of a device.
+ * @adev: ACPI device node to get the resources for.
+ * @list: Head of the resultant list of resources (must be empty).
+ *
+ * This is a helper function that locates all memory type resources of @adev
+ * with acpi_dev_get_resources().
+ *
+ * The number of resources in the output list is returned on success, an error
+ * code reflecting the error condition is returned otherwise.
+ */
+int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list)
+{
+	return acpi_dev_get_resources(adev, list, is_memory, NULL);
+}
+EXPORT_SYMBOL_GPL(acpi_dev_get_memory_resources);
+
 /**
  * acpi_dev_filter_resource_type - Filter ACPI resource according to resource
  *				   types
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6f64b2f3dc54..ed4aa395cc49 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -506,6 +506,7 @@ int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list,
 			   void *preproc_data);
 int acpi_dev_get_dma_resources(struct acpi_device *adev,
 			       struct list_head *list);
+int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list);
 int acpi_dev_filter_resource_type(struct acpi_resource *ares,
 				  unsigned long types);
 
-- 
cgit v1.2.3


From 064d42d90e412936434fae7d49204d8055e5b0e5 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Tue, 16 Aug 2022 13:16:27 +0300
Subject: ACPI: APD: Use the helper acpi_dev_get_memory_resources()

It removes the need to check the resource data type
separately.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_apd.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c
index ad245bbd965e..3bbe2276cac7 100644
--- a/drivers/acpi/acpi_apd.c
+++ b/drivers/acpi/acpi_apd.c
@@ -60,12 +60,6 @@ static int acpi_apd_setup(struct apd_private_data *pdata)
 }
 
 #ifdef CONFIG_X86_AMD_PLATFORM_DEVICE
-static int misc_check_res(struct acpi_resource *ares, void *data)
-{
-	struct resource res;
-
-	return !acpi_dev_resource_memory(ares, &res);
-}
 
 static int fch_misc_setup(struct apd_private_data *pdata)
 {
@@ -82,8 +76,7 @@ static int fch_misc_setup(struct apd_private_data *pdata)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&resource_list);
-	ret = acpi_dev_get_resources(adev, &resource_list, misc_check_res,
-				     NULL);
+	ret = acpi_dev_get_memory_resources(adev, &resource_list);
 	if (ret < 0)
 		return -ENOENT;
 
-- 
cgit v1.2.3


From 840baca4c44130e9e1954d7b1807f3280c804bc2 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Tue, 16 Aug 2022 13:16:28 +0300
Subject: ACPI: LPSS: Use the helper acpi_dev_get_memory_resources()

It removes the need to check the resource data type
separately.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_lpss.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index c4d4d21391d7..4f6cba8fe8de 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -392,13 +392,6 @@ static const struct acpi_device_id acpi_lpss_device_ids[] = {
 
 #ifdef CONFIG_X86_INTEL_LPSS
 
-static int is_memory(struct acpi_resource *res, void *not_used)
-{
-	struct resource r;
-
-	return !acpi_dev_resource_memory(res, &r);
-}
-
 /* LPSS main clock device. */
 static struct platform_device *lpss_clk_dev;
 
@@ -659,7 +652,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&resource_list);
-	ret = acpi_dev_get_resources(adev, &resource_list, is_memory, NULL);
+	ret = acpi_dev_get_memory_resources(adev, &resource_list);
 	if (ret < 0)
 		goto err_out;
 
-- 
cgit v1.2.3


From e12dee3736731e24b1e7367f87d66ac0fcd73ce7 Mon Sep 17 00:00:00 2001
From: Tamim Khan <tamim@fusetak.com>
Date: Sun, 28 Aug 2022 23:04:19 -0400
Subject: ACPI: resource: Skip IRQ override on Asus Vivobook K3402ZA/K3502ZA

In the ACPI DSDT table for Asus VivoBook K3402ZA/K3502ZA
IRQ 1 is described as ActiveLow; however, the kernel overrides
it to Edge_High. This prevents the internal keyboard from working
on these laptops. In order to fix this add these laptops to the
skip_override_table so that the kernel does not override IRQ 1 to
Edge_High.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216158
Reviewed-by: Hui Wang <hui.wang@canonical.com>
Tested-by: Tamim Khan <tamim@fusetak.com>
Tested-by: Sunand <sunandchakradhar@gmail.com>
Signed-off-by: Tamim Khan <tamim@fusetak.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/resource.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 8032d50ca944..e40b2153911d 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -399,6 +399,24 @@ static const struct dmi_system_id medion_laptop[] = {
 	{ }
 };
 
+static const struct dmi_system_id asus_laptop[] = {
+	{
+		.ident = "Asus Vivobook K3402ZA",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_BOARD_NAME, "K3402ZA"),
+		},
+	},
+	{
+		.ident = "Asus Vivobook K3502ZA",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_BOARD_NAME, "K3502ZA"),
+		},
+	},
+	{ }
+};
+
 struct irq_override_cmp {
 	const struct dmi_system_id *system;
 	unsigned char irq;
@@ -409,6 +427,7 @@ struct irq_override_cmp {
 
 static const struct irq_override_cmp skip_override_table[] = {
 	{ medion_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0 },
+	{ asus_laptop, 1, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW, 0 },
 };
 
 static bool acpi_dev_irq_override(u32 gsi, u8 triggering, u8 polarity,
-- 
cgit v1.2.3


From 1538dc8c1561f0de4ba57a69e2a421a1a3951618 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Wed, 7 Sep 2022 13:30:07 +0300
Subject: usb: typec: intel_pmc_mux: Use the helper
 acpi_dev_get_memory_resources()

It removes the need to check the resource data type
separately.

Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/usb/typec/mux/intel_pmc_mux.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/usb/typec/mux/intel_pmc_mux.c b/drivers/usb/typec/mux/intel_pmc_mux.c
index a8e273fe204a..e1f4df7238bf 100644
--- a/drivers/usb/typec/mux/intel_pmc_mux.c
+++ b/drivers/usb/typec/mux/intel_pmc_mux.c
@@ -569,15 +569,6 @@ err_unregister_switch:
 	return ret;
 }
 
-static int is_memory(struct acpi_resource *res, void *data)
-{
-	struct resource_win win = {};
-	struct resource *r = &win.res;
-
-	return !(acpi_dev_resource_memory(res, r) ||
-		 acpi_dev_resource_address_space(res, &win));
-}
-
 /* IOM ACPI IDs and IOM_PORT_STATUS_OFFSET */
 static const struct acpi_device_id iom_acpi_ids[] = {
 	/* TigerLake */
@@ -611,7 +602,7 @@ static int pmc_usb_probe_iom(struct pmc_usb *pmc)
 		return -ENODEV;
 
 	INIT_LIST_HEAD(&resource_list);
-	ret = acpi_dev_get_resources(adev, &resource_list, is_memory, NULL);
+	ret = acpi_dev_get_memory_resources(adev, &resource_list);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From 71bb5c82aaaea007167f3ba68d3a669c74d7d55d Mon Sep 17 00:00:00 2001
From: Doug Smythies <dsmythies@telus.net>
Date: Tue, 6 Sep 2022 13:28:57 -0700
Subject: cpufreq: intel_pstate: Add Tigerlake support in no-HWP mode

Users may disable HWP in firmware, in which case intel_pstate wouldn't load
unless the CPU model is explicitly supported.

Add TIGERLAKE to the list of CPUs that can register intel_pstate while not
advertising the HWP capability. Without this change, an TIGERLAKE in no-HWP
mode could only use the acpi_cpufreq frequency scaling driver.

See also commits:
d8de7a44e11f: cpufreq: intel_pstate: Add Skylake servers support
fbdc21e9b038: cpufreq: intel_pstate: Add Icelake servers support in no-HWP mode
706c5328851d: cpufreq: intel_pstate: Add Cometlake support in no-HWP mode

Reported by: M. Cargi Ari <cagriari@pm.me>
Signed-off-by: Doug Smythies <dsmythies@telus.net>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 57cdb3679885..fc3ebeb0bbe5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2416,6 +2416,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 	X86_MATCH(SKYLAKE_X,		core_funcs),
 	X86_MATCH(COMETLAKE,		core_funcs),
 	X86_MATCH(ICELAKE_X,		core_funcs),
+	X86_MATCH(TIGERLAKE,		core_funcs),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);
-- 
cgit v1.2.3


From cf7de6a53600ea554a8358e44fbcf47b449235f9 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 11 Sep 2022 00:07:11 +0900
Subject: bpf: add missing percpu_counter_destroy() in htab_map_alloc()

syzbot is reporting ODEBUG bug in htab_map_alloc() [1], for
commit 86fe28f7692d96d2 ("bpf: Optimize element count in non-preallocated
hash map.") added percpu_counter_init() to htab_map_alloc() but forgot to
add percpu_counter_destroy() to the error path.

Link: https://syzkaller.appspot.com/bug?extid=5d1da78b375c3b5e6c2b [1]
Reported-by: syzbot <syzbot+5d1da78b375c3b5e6c2b@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 86fe28f7692d96d2 ("bpf: Optimize element count in non-preallocated hash map.")
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/e2e4cc0e-9d36-4ca1-9bfa-ce23e6f8310b@I-love.SAKURA.ne.jp
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/hashtab.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 0fe3f136cbbe..86aec20c22d0 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -622,6 +622,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 free_prealloc:
 	prealloc_destroy(htab);
 free_map_locked:
+	if (htab->use_percpu_counter)
+		percpu_counter_destroy(&htab->pcount);
 	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
 		free_percpu(htab->map_locked[i]);
 	bpf_map_area_free(htab->buckets);
-- 
cgit v1.2.3


From 57c92f11a215717bf90880828b7a23c736c3c0d9 Mon Sep 17 00:00:00 2001
From: Punit Agrawal <punit.agrawal@bytedance.com>
Date: Wed, 7 Sep 2022 16:57:46 +0100
Subject: bpf: Simplify code by using for_each_cpu_wrap()

In the percpu freelist code, it is a common pattern to iterate over
the possible CPUs mask starting with the current CPU. The pattern is
implemented using a hand rolled while loop with the loop variable
increment being open-coded.

Simplify the code by using for_each_cpu_wrap() helper to iterate over
the possible cpus starting with the current CPU. As a result, some of
the special-casing in the loop also gets simplified.

No functional change intended.

Signed-off-by: Punit Agrawal <punit.agrawal@bytedance.com>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20220907155746.1750329-1-punit.agrawal@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/percpu_freelist.c | 48 +++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 32 deletions(-)

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 00b874c8e889..b6e7f5c5b9ab 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -58,23 +58,21 @@ static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
 {
 	int cpu, orig_cpu;
 
-	orig_cpu = cpu = raw_smp_processor_id();
+	orig_cpu = raw_smp_processor_id();
 	while (1) {
-		struct pcpu_freelist_head *head;
+		for_each_cpu_wrap(cpu, cpu_possible_mask, orig_cpu) {
+			struct pcpu_freelist_head *head;
 
-		head = per_cpu_ptr(s->freelist, cpu);
-		if (raw_spin_trylock(&head->lock)) {
-			pcpu_freelist_push_node(head, node);
-			raw_spin_unlock(&head->lock);
-			return;
+			head = per_cpu_ptr(s->freelist, cpu);
+			if (raw_spin_trylock(&head->lock)) {
+				pcpu_freelist_push_node(head, node);
+				raw_spin_unlock(&head->lock);
+				return;
+			}
 		}
-		cpu = cpumask_next(cpu, cpu_possible_mask);
-		if (cpu >= nr_cpu_ids)
-			cpu = 0;
 
 		/* cannot lock any per cpu lock, try extralist */
-		if (cpu == orig_cpu &&
-		    pcpu_freelist_try_push_extra(s, node))
+		if (pcpu_freelist_try_push_extra(s, node))
 			return;
 	}
 }
@@ -125,13 +123,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
-	int orig_cpu, cpu;
+	int cpu;
 
-	orig_cpu = cpu = raw_smp_processor_id();
-	while (1) {
+	for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
 		head = per_cpu_ptr(s->freelist, cpu);
 		if (!READ_ONCE(head->first))
-			goto next_cpu;
+			continue;
 		raw_spin_lock(&head->lock);
 		node = head->first;
 		if (node) {
@@ -140,12 +137,6 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
-next_cpu:
-		cpu = cpumask_next(cpu, cpu_possible_mask);
-		if (cpu >= nr_cpu_ids)
-			cpu = 0;
-		if (cpu == orig_cpu)
-			break;
 	}
 
 	/* per cpu lists are all empty, try extralist */
@@ -164,13 +155,12 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
-	int orig_cpu, cpu;
+	int cpu;
 
-	orig_cpu = cpu = raw_smp_processor_id();
-	while (1) {
+	for_each_cpu_wrap(cpu, cpu_possible_mask, raw_smp_processor_id()) {
 		head = per_cpu_ptr(s->freelist, cpu);
 		if (!READ_ONCE(head->first))
-			goto next_cpu;
+			continue;
 		if (raw_spin_trylock(&head->lock)) {
 			node = head->first;
 			if (node) {
@@ -180,12 +170,6 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
 			}
 			raw_spin_unlock(&head->lock);
 		}
-next_cpu:
-		cpu = cpumask_next(cpu, cpu_possible_mask);
-		if (cpu >= nr_cpu_ids)
-			cpu = 0;
-		if (cpu == orig_cpu)
-			break;
 	}
 
 	/* cannot pop from per cpu lists, try extralist */
-- 
cgit v1.2.3


From 65269888c695cf4643c6fdb989ea28bf1623685d Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 7 Sep 2022 10:40:36 -0600
Subject: bpf: Remove duplicate PTR_TO_BTF_ID RO check

Since commit 27ae7997a661 ("bpf: Introduce BPF_PROG_TYPE_STRUCT_OPS")
there has existed bpf_verifier_ops:btf_struct_access. When
btf_struct_access is _unset_ for a prog type, the verifier runs the
default implementation, which is to enforce read only:

        if (env->ops->btf_struct_access) {
                [...]
        } else {
                if (atype != BPF_READ) {
                        verbose(env, "only read is supported\n");
                        return -EACCES;
                }

                [...]
        }

When btf_struct_access is _set_, the expectation is that
btf_struct_access has full control over accesses, including if writes
are allowed.

Rather than carve out an exception for each prog type that may write to
BTF ptrs, delete the redundant check and give full control to
btf_struct_access.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/962da2bff1238746589e332ff1aecc49403cd7ce.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c0f175ac187a..c3efd461f36c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -13406,9 +13406,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 				insn->code = BPF_LDX | BPF_PROBE_MEM |
 					BPF_SIZE((insn)->code);
 				env->prog->aux->num_exentries++;
-			} else if (resolve_prog_type(env->prog) != BPF_PROG_TYPE_STRUCT_OPS) {
-				verbose(env, "Writes through BTF pointers are not allowed\n");
-				return -EINVAL;
 			}
 			continue;
 		default:
-- 
cgit v1.2.3


From d4f7bdb2ed7bf320a8772258fdc257655d225afb Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 7 Sep 2022 10:40:37 -0600
Subject: bpf: Add stub for btf_struct_access()

Add corresponding unimplemented stub for when CONFIG_BPF_SYSCALL=n

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/4021398e884433b1fef57a4d28361bb9fcf1bd05.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 48ae05099f36..54178b9e9c3a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2211,6 +2211,15 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id)
 	return ERR_PTR(-ENOTSUPP);
 }
 
+static inline int btf_struct_access(struct bpf_verifier_log *log,
+				    const struct btf *btf,
+				    const struct btf_type *t, int off, int size,
+				    enum bpf_access_type atype,
+				    u32 *next_btf_id, enum bpf_type_flag *flag)
+{
+	return -EACCES;
+}
+
 static inline const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
-- 
cgit v1.2.3


From 896f07c07da01aa7cee820a23c2bce1d8e9fe1e6 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 7 Sep 2022 10:40:38 -0600
Subject: bpf: Use 0 instead of NOT_INIT for btf_struct_access() writes

Returning a bpf_reg_type only makes sense in the context of a BPF_READ.
For writes, prefer to explicitly return 0 for clarity.

Note that is non-functional change as it just so happened that NOT_INIT
== 0.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/01772bc1455ae16600796ac78c6cc9fff34f95ff.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/ipv4/bpf_tcp_ca.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 85a9e500c42d..6da16ae6a962 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -124,7 +124,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
 		return -EACCES;
 	}
 
-	return NOT_INIT;
+	return 0;
 }
 
 BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
-- 
cgit v1.2.3


From 84c6ac417ceacd086efc330afece8922969610b7 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 7 Sep 2022 10:40:39 -0600
Subject: bpf: Export btf_type_by_id() and bpf_log()

These symbols will be used in nf_conntrack.ko to support direct writes
to `nf_conn`.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/3c98c19dc50d3b18ea5eca135b4fc3a5db036060.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c      | 1 +
 kernel/bpf/verifier.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 9d12212fcd61..98be25d13325 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -818,6 +818,7 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
 		return NULL;
 	return btf->types[type_id];
 }
+EXPORT_SYMBOL_GPL(btf_type_by_id);
 
 /*
  * Regular int is not a bit field and it must be either
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c3efd461f36c..9109e07b759a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -370,6 +370,7 @@ __printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
 	bpf_verifier_vlog(log, fmt, args);
 	va_end(args);
 }
+EXPORT_SYMBOL_GPL(bpf_log);
 
 static const char *ltrim(const char *s)
 {
-- 
cgit v1.2.3


From 864b656f82ccd433d3e38149c3673d295ad64bf6 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 7 Sep 2022 10:40:40 -0600
Subject: bpf: Add support for writing to nf_conn:mark

Support direct writes to nf_conn:mark from TC and XDP prog types. This
is useful when applications want to store per-connection metadata. This
is also particularly useful for applications that run both bpf and
iptables/nftables because the latter can trivially access this metadata.

One example use case would be if a bpf prog is responsible for advanced
packet classification and iptables/nftables is later used for routing
due to pre-existing/legacy code.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/ebca06dea366e3e7e861c12f375a548cc4c61108.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/netfilter/nf_conntrack_bpf.h | 23 +++++++++++
 net/core/filter.c                        | 54 ++++++++++++++++++++++++++
 net/netfilter/nf_conntrack_bpf.c         | 66 +++++++++++++++++++++++++++++++-
 net/netfilter/nf_conntrack_core.c        |  1 +
 4 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index a473b56842c5..a61a93d1c6dc 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -3,13 +3,22 @@
 #ifndef _NF_CONNTRACK_BPF_H
 #define _NF_CONNTRACK_BPF_H
 
+#include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/kconfig.h>
+#include <linux/mutex.h>
 
 #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
     (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
 
 extern int register_nf_conntrack_bpf(void);
+extern void cleanup_nf_conntrack_bpf(void);
+
+extern struct mutex nf_conn_btf_access_lock;
+extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf,
+		       const struct btf_type *t, int off, int size,
+		       enum bpf_access_type atype, u32 *next_btf_id,
+		       enum bpf_type_flag *flag);
 
 #else
 
@@ -18,6 +27,20 @@ static inline int register_nf_conntrack_bpf(void)
 	return 0;
 }
 
+static inline void cleanup_nf_conntrack_bpf(void)
+{
+}
+
+static inline int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+						 const struct btf *btf,
+						 const struct btf_type *t, int off,
+						 int size, enum bpf_access_type atype,
+						 u32 *next_btf_id,
+						 enum bpf_type_flag *flag)
+{
+	return -EACCES;
+}
+
 #endif
 
 #endif /* _NF_CONNTRACK_BPF_H */
diff --git a/net/core/filter.c b/net/core/filter.c
index e872f45399b0..4b2be211bcbe 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -18,6 +18,7 @@
  */
 
 #include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/mm.h>
@@ -8604,6 +8605,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
+DEFINE_MUTEX(nf_conn_btf_access_lock);
+EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
+
+int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf,
+		const struct btf_type *t, int off, int size,
+		enum bpf_access_type atype, u32 *next_btf_id,
+		enum bpf_type_flag *flag);
+EXPORT_SYMBOL_GPL(nfct_bsa);
+
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+					const struct btf *btf,
+					const struct btf_type *t, int off,
+					int size, enum bpf_access_type atype,
+					u32 *next_btf_id,
+					enum bpf_type_flag *flag)
+{
+	int ret = -EACCES;
+
+	if (atype == BPF_READ)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+					 flag);
+
+	mutex_lock(&nf_conn_btf_access_lock);
+	if (nfct_bsa)
+		ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag);
+	mutex_unlock(&nf_conn_btf_access_lock);
+
+	return ret;
+}
+
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -8663,6 +8694,27 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog,
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+				 const struct btf *btf,
+				 const struct btf_type *t, int off,
+				 int size, enum bpf_access_type atype,
+				 u32 *next_btf_id,
+				 enum bpf_type_flag *flag)
+{
+	int ret = -EACCES;
+
+	if (atype == BPF_READ)
+		return btf_struct_access(log, btf, t, off, size, atype, next_btf_id,
+					 flag);
+
+	mutex_lock(&nf_conn_btf_access_lock);
+	if (nfct_bsa)
+		ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag);
+	mutex_unlock(&nf_conn_btf_access_lock);
+
+	return ret;
+}
+
 static bool sock_addr_is_valid_access(int off, int size,
 				      enum bpf_access_type type,
 				      const struct bpf_prog *prog,
@@ -10557,6 +10609,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
 	.gen_ld_abs		= bpf_gen_ld_abs,
+	.btf_struct_access	= tc_cls_act_btf_struct_access,
 };
 
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -10568,6 +10621,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
 	.gen_prologue		= bpf_noop_prologue,
+	.btf_struct_access	= xdp_btf_struct_access,
 };
 
 const struct bpf_prog_ops xdp_prog_ops = {
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 1cd87b28c9b0..77eb8e959f61 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -6,8 +6,10 @@
  * are exposed through to BPF programs is explicitly unstable.
  */
 
+#include <linux/bpf_verifier.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
+#include <linux/mutex.h>
 #include <linux/types.h>
 #include <linux/btf_ids.h>
 #include <linux/net_namespace.h>
@@ -184,6 +186,54 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net,
 	return ct;
 }
 
+BTF_ID_LIST(btf_nf_conn_ids)
+BTF_ID(struct, nf_conn)
+BTF_ID(struct, nf_conn___init)
+
+/* Check writes into `struct nf_conn` */
+static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
+					   const struct btf *btf,
+					   const struct btf_type *t, int off,
+					   int size, enum bpf_access_type atype,
+					   u32 *next_btf_id,
+					   enum bpf_type_flag *flag)
+{
+	const struct btf_type *ncit;
+	const struct btf_type *nct;
+	size_t end;
+
+	ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]);
+	nct = btf_type_by_id(btf, btf_nf_conn_ids[0]);
+
+	if (t != nct && t != ncit) {
+		bpf_log(log, "only read is supported\n");
+		return -EACCES;
+	}
+
+	/* `struct nf_conn` and `struct nf_conn___init` have the same layout
+	 * so we are safe to simply merge offset checks here
+	 */
+	switch (off) {
+#if defined(CONFIG_NF_CONNTRACK_MARK)
+	case offsetof(struct nf_conn, mark):
+		end = offsetofend(struct nf_conn, mark);
+		break;
+#endif
+	default:
+		bpf_log(log, "no write support to nf_conn at off %d\n", off);
+		return -EACCES;
+	}
+
+	if (off + size > end) {
+		bpf_log(log,
+			"write access at off %d with size %d beyond the member of nf_conn ended at %zu\n",
+			off, size, end);
+		return -EACCES;
+	}
+
+	return 0;
+}
+
 __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in nf_conntrack BTF");
@@ -449,5 +499,19 @@ int register_nf_conntrack_bpf(void)
 	int ret;
 
 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set);
-	return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
+	if (!ret) {
+		mutex_lock(&nf_conn_btf_access_lock);
+		nfct_bsa = _nf_conntrack_btf_struct_access;
+		mutex_unlock(&nf_conn_btf_access_lock);
+	}
+
+	return ret;
+}
+
+void cleanup_nf_conntrack_bpf(void)
+{
+	mutex_lock(&nf_conn_btf_access_lock);
+	nfct_bsa = NULL;
+	mutex_unlock(&nf_conn_btf_access_lock);
 }
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index da65c6e8eeeb..0195f60fc43b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2512,6 +2512,7 @@ static int kill_all(struct nf_conn *i, void *data)
 
 void nf_conntrack_cleanup_start(void)
 {
+	cleanup_nf_conntrack_bpf();
 	conntrack_gc_work.exiting = true;
 }
 
-- 
cgit v1.2.3


From e2d75e954c0a277b8fa0ddf666ddd4f9b73195f7 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Wed, 7 Sep 2022 10:40:41 -0600
Subject: selftests/bpf: Add tests for writing to nf_conn:mark

Add a simple extension to the existing selftest to write to
nf_conn:mark. Also add a failure test for writing to unsupported field.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/f78966b81b9349d2b8ebb4cee2caf15cb6b38ee2.1662568410.git.dxu@dxuuu.xyz
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_nf.c      |  2 ++
 tools/testing/selftests/bpf/progs/test_bpf_nf.c      |  9 +++++++--
 tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c | 14 ++++++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index 544bf90ac2a7..ab9117ae7545 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -17,6 +17,7 @@ struct {
 	{ "set_status_after_insert", "kernel function bpf_ct_set_status args#0 expected pointer to STRUCT nf_conn___init but" },
 	{ "change_timeout_after_alloc", "kernel function bpf_ct_change_timeout args#0 expected pointer to STRUCT nf_conn but" },
 	{ "change_status_after_alloc", "kernel function bpf_ct_change_status args#0 expected pointer to STRUCT nf_conn but" },
+	{ "write_not_allowlisted_field", "no write support to nf_conn at off" },
 };
 
 enum {
@@ -113,6 +114,7 @@ static void test_bpf_nf_ct(int mode)
 	ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update");
 	/* expected status is IPS_SEEN_REPLY */
 	ASSERT_EQ(skel->bss->test_status, 2, "Test for ct status update ");
+	ASSERT_EQ(skel->bss->test_insert_lookup_mark, 77, "Test for insert and lookup mark value");
 	ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup");
 	ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark");
 end:
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index 2722441850cc..b5e7079701e8 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -23,6 +23,7 @@ int test_insert_entry = -EAFNOSUPPORT;
 int test_succ_lookup = -ENOENT;
 u32 test_delta_timeout = 0;
 u32 test_status = 0;
+u32 test_insert_lookup_mark = 0;
 __be32 saddr = 0;
 __be16 sport = 0;
 __be32 daddr = 0;
@@ -144,6 +145,7 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 
 		bpf_ct_set_timeout(ct, 10000);
 		bpf_ct_set_status(ct, IPS_CONFIRMED);
+		ct->mark = 77;
 
 		ct_ins = bpf_ct_insert_entry(ct);
 		if (ct_ins) {
@@ -157,6 +159,7 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 				test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
 				test_delta_timeout /= CONFIG_HZ;
 				test_status = IPS_SEEN_REPLY;
+				test_insert_lookup_mark = ct_lk->mark;
 				bpf_ct_change_status(ct_lk, IPS_SEEN_REPLY);
 				bpf_ct_release(ct_lk);
 				test_succ_lookup = 0;
@@ -175,8 +178,10 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 		       sizeof(opts_def));
 	if (ct) {
 		test_exist_lookup = 0;
-		if (ct->mark == 42)
-			test_exist_lookup_mark = 43;
+		if (ct->mark == 42) {
+			ct->mark++;
+			test_exist_lookup_mark = ct->mark;
+		}
 		bpf_ct_release(ct);
 	} else {
 		test_exist_lookup = opts_def.error;
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
index bf79af15c808..0e4759ab38ff 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf_fail.c
@@ -69,6 +69,20 @@ int lookup_insert(struct __sk_buff *ctx)
 	return 0;
 }
 
+SEC("?tc")
+int write_not_allowlisted_field(struct __sk_buff *ctx)
+{
+	struct bpf_ct_opts___local opts = {};
+	struct bpf_sock_tuple tup = {};
+	struct nf_conn *ct;
+
+	ct = bpf_skb_ct_lookup(ctx, &tup, sizeof(tup.ipv4), &opts, sizeof(opts));
+	if (!ct)
+		return 0;
+	ct->status = 0xF00;
+	return 0;
+}
+
 SEC("?tc")
 int set_timeout_after_insert(struct __sk_buff *ctx)
 {
-- 
cgit v1.2.3


From f7c946f288e32fd8b5fd69825683420d473672bd Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Thu, 8 Sep 2022 10:06:12 +0200
Subject: selftests/bpf: fix ct status check in bpf_nf selftests

Check properly the connection tracking entry status configured running
bpf_ct_change_status kfunc.
Remove unnecessary IPS_CONFIRMED status configuration since it is
already done during entry allocation.

Fixes: 6eb7fba007a7 ("selftests/bpf: Add tests for new nf_conntrack kfuncs")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/813a5161a71911378dfac8770ec890428e4998aa.1662623574.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 5 +++--
 tools/testing/selftests/bpf/progs/test_bpf_nf.c | 7 ++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index ab9117ae7545..0677a51694c9 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <test_progs.h>
 #include <network_helpers.h>
+#include <linux/netfilter/nf_conntrack_common.h>
 #include "test_bpf_nf.skel.h"
 #include "test_bpf_nf_fail.skel.h"
 
@@ -112,9 +113,9 @@ static void test_bpf_nf_ct(int mode)
 	/* allow some tolerance for test_delta_timeout value to avoid races. */
 	ASSERT_GT(skel->bss->test_delta_timeout, 8, "Test for min ct timeout update");
 	ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update");
-	/* expected status is IPS_SEEN_REPLY */
-	ASSERT_EQ(skel->bss->test_status, 2, "Test for ct status update ");
 	ASSERT_EQ(skel->bss->test_insert_lookup_mark, 77, "Test for insert and lookup mark value");
+	ASSERT_EQ(skel->bss->test_status, IPS_CONFIRMED | IPS_SEEN_REPLY,
+		  "Test for ct status update ");
 	ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup");
 	ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark");
 end:
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index b5e7079701e8..88842da86ddc 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -144,7 +144,6 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 		struct nf_conn *ct_ins;
 
 		bpf_ct_set_timeout(ct, 10000);
-		bpf_ct_set_status(ct, IPS_CONFIRMED);
 		ct->mark = 77;
 
 		ct_ins = bpf_ct_insert_entry(ct);
@@ -158,9 +157,11 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 				bpf_ct_change_timeout(ct_lk, 10000);
 				test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
 				test_delta_timeout /= CONFIG_HZ;
-				test_status = IPS_SEEN_REPLY;
 				test_insert_lookup_mark = ct_lk->mark;
-				bpf_ct_change_status(ct_lk, IPS_SEEN_REPLY);
+				bpf_ct_change_status(ct_lk,
+						     IPS_CONFIRMED | IPS_SEEN_REPLY);
+				test_status = ct_lk->status;
+
 				bpf_ct_release(ct_lk);
 				test_succ_lookup = 0;
 			}
-- 
cgit v1.2.3


From a37a32583e282d8d815e22add29bc1e91e19951a Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <oss@lmb.io>
Date: Sat, 10 Sep 2022 11:01:20 +0000
Subject: bpf: btf: fix truncated last_member_type_id in btf_struct_resolve

When trying to finish resolving a struct member, btf_struct_resolve
saves the member type id in a u16 temporary variable. This truncates
the 32 bit type id value if it exceeds UINT16_MAX.

As a result, structs that have members with type ids > UINT16_MAX and
which need resolution will fail with a message like this:

    [67414] STRUCT ff_device size=120 vlen=12
        effect_owners type_id=67434 bits_offset=960 Member exceeds struct_size

Fix this by changing the type of last_member_type_id to u32.

Fixes: a0791f0df7d2 ("bpf: fix BTF limits")
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Lorenz Bauer <oss@lmb.io>
Link: https://lore.kernel.org/r/20220910110120.339242-1-oss@lmb.io
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 7e64447659f3..36fd4b509294 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3128,7 +3128,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
 	if (v->next_member) {
 		const struct btf_type *last_member_type;
 		const struct btf_member *last_member;
-		u16 last_member_type_id;
+		u32 last_member_type_id;
 
 		last_member = btf_type_member(v->t) + v->next_member - 1;
 		last_member_type_id = last_member->type;
-- 
cgit v1.2.3


From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Thu, 8 Sep 2022 16:07:16 -0700
Subject: bpf: Add verifier support for custom callback return range

Verifier logic to confirm that a callback function returns 0 or 1 was
added in commit 69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper").
At the time, callback return value was only used to continue or stop
iteration.

In order to support callbacks with a broader return value range, such as
those added in rbtree series[0] and others, add a callback_ret_range to
bpf_func_state. Verifier's helpers which set in_callback_fn will also
set the new field, which the verifier will later use to check return
value bounds.

Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel
value as the latter would prevent the valid range (0, U64_MAX) being
used. Previous global default tnum_range(0, 1) is explicitly set for
extant callback helpers. The change to global default was made after
discussion around this patch in rbtree series [1], goal here is to make
it more obvious that callback_ret_range should be explicitly set.

  [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/
  [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h | 1 +
 kernel/bpf/verifier.c        | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b49a349cc6ae..e197f8fb27e2 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -248,6 +248,7 @@ struct bpf_func_state {
 	 */
 	u32 async_entry_cnt;
 	bool in_callback_fn;
+	struct tnum callback_ret_range;
 	bool in_async_callback_fn;
 
 	/* The following fields should be last. See copy_func_state() */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9109e07b759a..c259d734f863 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1750,6 +1750,7 @@ static void init_func_state(struct bpf_verifier_env *env,
 	state->callsite = callsite;
 	state->frameno = frameno;
 	state->subprogno = subprogno;
+	state->callback_ret_range = tnum_range(0, 0);
 	init_reg_state(env, state);
 	mark_verifier_state_scratched(env);
 }
@@ -6790,6 +6791,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 		return err;
 
 	callee->in_callback_fn = true;
+	callee->callback_ret_range = tnum_range(0, 1);
 	return 0;
 }
 
@@ -6811,6 +6813,7 @@ static int set_loop_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 
 	callee->in_callback_fn = true;
+	callee->callback_ret_range = tnum_range(0, 1);
 	return 0;
 }
 
@@ -6840,6 +6843,7 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_async_callback_fn = true;
+	callee->callback_ret_range = tnum_range(0, 1);
 	return 0;
 }
 
@@ -6867,6 +6871,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
 	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 	callee->in_callback_fn = true;
+	callee->callback_ret_range = tnum_range(0, 1);
 	return 0;
 }
 
@@ -6894,7 +6899,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	caller = state->frame[state->curframe];
 	if (callee->in_callback_fn) {
 		/* enforce R0 return value range [0, 1]. */
-		struct tnum range = tnum_range(0, 1);
+		struct tnum range = callee->callback_ret_range;
 
 		if (r0->type != SCALAR_VALUE) {
 			verbose(env, "R0 not a scalar value\n");
-- 
cgit v1.2.3


From d42d6f5a5fcd734669e75867c632e724f1dc3552 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Sun, 11 Sep 2022 17:40:38 +0800
Subject: EDAC: Remove obsolete declarations in edac_module.h

Commit

  4de78c6877ec ("drivers/edac: mod PCI poll names"),

renamed the respective variables and accessors but left the old accessor
declarations edac_get_log_ce(), edac_get_log_ue(), edac_get_poll_msec()
and edac_get_panic_on_ue() in place. Remove them.

  [ bp: Masssage commit message. ]

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220911094038.3224365-1-cuigaosheng1@huawei.com
---
 drivers/edac/edac_module.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index 96f6de0c8ff6..50ed9f2425bb 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -28,13 +28,9 @@ void edac_mc_sysfs_exit(void);
 extern int edac_create_sysfs_mci_device(struct mem_ctl_info *mci,
 					const struct attribute_group **groups);
 extern void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci);
-extern int edac_get_log_ue(void);
-extern int edac_get_log_ce(void);
-extern int edac_get_panic_on_ue(void);
 extern int edac_mc_get_log_ue(void);
 extern int edac_mc_get_log_ce(void);
 extern int edac_mc_get_panic_on_ue(void);
-extern int edac_get_poll_msec(void);
 extern unsigned int edac_mc_get_poll_msec(void);
 
 unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
-- 
cgit v1.2.3


From 825cf206ed510c4a1758bef8957e2b039253e2e3 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:44 -0700
Subject: statx: add direct I/O alignment information

Traditionally, the conditions for when DIO (direct I/O) is supported
were fairly simple.  For both block devices and regular files, DIO had
to be aligned to the logical block size of the block device.

However, due to filesystem features that have been added over time (e.g.
multi-device support, data journalling, inline data, encryption, verity,
compression, checkpoint disabling, log-structured mode), the conditions
for when DIO is allowed on a regular file have gotten increasingly
complex.  Whether a particular regular file supports DIO, and with what
alignment, can depend on various file attributes and filesystem mount
options, as well as which block device(s) the file's data is located on.

Moreover, the general rule of DIO needing to be aligned to the block
device's logical block size was recently relaxed to allow user buffers
(but not file offsets) aligned to the DMA alignment instead.  See
commit bf8d08532bc1 ("iomap: add support for dma aligned direct-io").

XFS has an ioctl XFS_IOC_DIOINFO that exposes DIO alignment information.
Uplifting this to the VFS is one possibility.  However, as discussed
(https://lore.kernel.org/linux-fsdevel/20220120071215.123274-1-ebiggers@kernel.org/T/#u),
this ioctl is rarely used and not known to be used outside of
XFS-specific code.  It was also never intended to indicate when a file
doesn't support DIO at all, nor was it intended for block devices.

Therefore, let's expose this information via statx().  Add the
STATX_DIOALIGN flag and two new statx fields associated with it:

* stx_dio_mem_align: the alignment (in bytes) required for user memory
  buffers for DIO, or 0 if DIO is not supported on the file.

* stx_dio_offset_align: the alignment (in bytes) required for file
  offsets and I/O segment lengths for DIO, or 0 if DIO is not supported
  on the file.  This will only be nonzero if stx_dio_mem_align is
  nonzero, and vice versa.

Note that as with other statx() extensions, if STATX_DIOALIGN isn't set
in the returned statx struct, then these new fields won't be filled in.
This will happen if the file is neither a regular file nor a block
device, or if the file is a regular file and the filesystem doesn't
support STATX_DIOALIGN.  It might also happen if the caller didn't
include STATX_DIOALIGN in the request mask, since statx() isn't required
to return unrequested information.

This commit only adds the VFS-level plumbing for STATX_DIOALIGN.  For
regular files, individual filesystems will still need to add code to
support it.  For block devices, a separate commit will wire it up too.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220827065851.135710-2-ebiggers@kernel.org
---
 fs/stat.c                 | 2 ++
 include/linux/stat.h      | 2 ++
 include/uapi/linux/stat.h | 4 +++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/stat.c b/fs/stat.c
index 9ced8860e0f3..a7930d744483 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -611,6 +611,8 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
 	tmp.stx_dev_major = MAJOR(stat->dev);
 	tmp.stx_dev_minor = MINOR(stat->dev);
 	tmp.stx_mnt_id = stat->mnt_id;
+	tmp.stx_dio_mem_align = stat->dio_mem_align;
+	tmp.stx_dio_offset_align = stat->dio_offset_align;
 
 	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 7df06931f25d..ff277ced50e9 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -50,6 +50,8 @@ struct kstat {
 	struct timespec64 btime;			/* File creation time */
 	u64		blocks;
 	u64		mnt_id;
+	u32		dio_mem_align;
+	u32		dio_offset_align;
 };
 
 #endif
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 1500a0f58041..7cab2c65d3d7 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -124,7 +124,8 @@ struct statx {
 	__u32	stx_dev_minor;
 	/* 0x90 */
 	__u64	stx_mnt_id;
-	__u64	__spare2;
+	__u32	stx_dio_mem_align;	/* Memory buffer alignment for direct I/O */
+	__u32	stx_dio_offset_align;	/* File offset alignment for direct I/O */
 	/* 0xa0 */
 	__u64	__spare3[12];	/* Spare space for future expansion */
 	/* 0x100 */
@@ -152,6 +153,7 @@ struct statx {
 #define STATX_BASIC_STATS	0x000007ffU	/* The stuff in the normal stat struct */
 #define STATX_BTIME		0x00000800U	/* Want/got stx_btime */
 #define STATX_MNT_ID		0x00001000U	/* Got stx_mnt_id */
+#define STATX_DIOALIGN		0x00002000U	/* Want/got direct I/O alignment info */
 
 #define STATX__RESERVED		0x80000000U	/* Reserved for future struct statx expansion */
 
-- 
cgit v1.2.3


From 2d985f8c6b91b5007a16e640bb9c038c5fb2839b Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:45 -0700
Subject: vfs: support STATX_DIOALIGN on block devices

Add support for STATX_DIOALIGN to block devices, so that direct I/O
alignment restrictions are exposed to userspace in a generic way.

Note that this breaks the tradition of stat operating only on the block
device node, not the block device itself.  However, it was felt that
doing this is preferable, in order to make the interface useful and
avoid needing separate interfaces for regular files and block devices.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Link: https://lore.kernel.org/r/20220827065851.135710-3-ebiggers@kernel.org
---
 block/bdev.c           | 23 +++++++++++++++++++++++
 fs/stat.c              | 12 ++++++++++++
 include/linux/blkdev.h |  4 ++++
 3 files changed, 39 insertions(+)

diff --git a/block/bdev.c b/block/bdev.c
index ce05175e71ce..d699ecdb3260 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/part_stat.h>
 #include <linux/uaccess.h>
+#include <linux/stat.h>
 #include "../fs/internal.h"
 #include "blk.h"
 
@@ -1069,3 +1070,25 @@ void sync_bdevs(bool wait)
 	spin_unlock(&blockdev_superblock->s_inode_list_lock);
 	iput(old_inode);
 }
+
+/*
+ * Handle STATX_DIOALIGN for block devices.
+ *
+ * Note that the inode passed to this is the inode of a block device node file,
+ * not the block device's internal inode.  Therefore it is *not* valid to use
+ * I_BDEV() here; the block device has to be looked up by i_rdev instead.
+ */
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+	struct block_device *bdev;
+
+	bdev = blkdev_get_no_open(inode->i_rdev);
+	if (!bdev)
+		return;
+
+	stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+	stat->dio_offset_align = bdev_logical_block_size(bdev);
+	stat->result_mask |= STATX_DIOALIGN;
+
+	blkdev_put_no_open(bdev);
+}
diff --git a/fs/stat.c b/fs/stat.c
index a7930d744483..ef50573c72a2 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -5,6 +5,7 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 
+#include <linux/blkdev.h>
 #include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/errno.h>
@@ -230,11 +231,22 @@ retry:
 		goto out;
 
 	error = vfs_getattr(&path, stat, request_mask, flags);
+
 	stat->mnt_id = real_mount(path.mnt)->mnt_id;
 	stat->result_mask |= STATX_MNT_ID;
+
 	if (path.mnt->mnt_root == path.dentry)
 		stat->attributes |= STATX_ATTR_MOUNT_ROOT;
 	stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
+
+	/* Handle STATX_DIOALIGN for block devices. */
+	if (request_mask & STATX_DIOALIGN) {
+		struct inode *inode = d_backing_inode(path.dentry);
+
+		if (S_ISBLK(inode->i_mode))
+			bdev_statx_dioalign(inode, stat);
+	}
+
 	path_put(&path);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 84b13fdd34a7..8038c5fbde40 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1498,6 +1498,7 @@ int sync_blockdev(struct block_device *bdev);
 int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend);
 int sync_blockdev_nowait(struct block_device *bdev);
 void sync_bdevs(bool wait);
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat);
 void printk_all_partitions(void);
 #else
 static inline void invalidate_bdev(struct block_device *bdev)
@@ -1514,6 +1515,9 @@ static inline int sync_blockdev_nowait(struct block_device *bdev)
 static inline void sync_bdevs(bool wait)
 {
 }
+static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+}
 static inline void printk_all_partitions(void)
 {
 }
-- 
cgit v1.2.3


From 53dd3f802a6e269868cb599609287a841e65a996 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:46 -0700
Subject: fscrypt: change fscrypt_dio_supported() to prepare for STATX_DIOALIGN

To prepare for STATX_DIOALIGN support, make two changes to
fscrypt_dio_supported().

First, remove the filesystem-block-alignment check and make the
filesystems handle it instead.  It previously made sense to have it in
fs/crypto/; however, to support STATX_DIOALIGN the alignment restriction
would have to be returned to filesystems.  It ends up being simpler if
filesystems handle this part themselves, especially for f2fs which only
allows fs-block-aligned DIO in the first place.

Second, make fscrypt_dio_supported() work on inodes whose encryption key
hasn't been set up yet, by making it set up the key if needed.  This is
required for statx(), since statx() doesn't require a file descriptor.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220827065851.135710-4-ebiggers@kernel.org
---
 fs/crypto/inline_crypt.c | 49 ++++++++++++++++++++++++------------------------
 fs/ext4/file.c           |  9 +++++++--
 fs/f2fs/f2fs.h           |  2 +-
 include/linux/fscrypt.h  |  7 ++-----
 4 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 90f3e68f166e..8d4bee5bccbf 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -401,46 +401,45 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
 EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
 
 /**
- * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
- *			     supported as far as encryption is concerned
- * @iocb: the file and position the I/O is targeting
- * @iter: the I/O data segment(s)
+ * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an
+ *			     inode, as far as encryption is concerned
+ * @inode: the inode in question
  *
  * Return: %true if there are no encryption constraints that prevent DIO from
  *	   being supported; %false if DIO is unsupported.  (Note that in the
  *	   %true case, the filesystem might have other, non-encryption-related
- *	   constraints that prevent DIO from actually being supported.)
+ *	   constraints that prevent DIO from actually being supported.  Also, on
+ *	   encrypted files the filesystem is still responsible for only allowing
+ *	   DIO when requests are filesystem-block-aligned.)
  */
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+bool fscrypt_dio_supported(struct inode *inode)
 {
-	const struct inode *inode = file_inode(iocb->ki_filp);
-	const unsigned int blocksize = i_blocksize(inode);
+	int err;
 
 	/* If the file is unencrypted, no veto from us. */
 	if (!fscrypt_needs_contents_encryption(inode))
 		return true;
 
-	/* We only support DIO with inline crypto, not fs-layer crypto. */
-	if (!fscrypt_inode_uses_inline_crypto(inode))
-		return false;
-
 	/*
-	 * Since the granularity of encryption is filesystem blocks, the file
-	 * position and total I/O length must be aligned to the filesystem block
-	 * size -- not just to the block device's logical block size as is
-	 * traditionally the case for DIO on many filesystems.
+	 * We only support DIO with inline crypto, not fs-layer crypto.
 	 *
-	 * We require that the user-provided memory buffers be filesystem block
-	 * aligned too.  It is simpler to have a single alignment value required
-	 * for all properties of the I/O, as is normally the case for DIO.
-	 * Also, allowing less aligned buffers would imply that data units could
-	 * cross bvecs, which would greatly complicate the I/O stack, which
-	 * assumes that bios can be split at any bvec boundary.
+	 * To determine whether the inode is using inline crypto, we have to set
+	 * up the key if it wasn't already done.  This is because in the current
+	 * design of fscrypt, the decision of whether to use inline crypto or
+	 * not isn't made until the inode's encryption key is being set up.  In
+	 * the DIO read/write case, the key will always be set up already, since
+	 * the file will be open.  But in the case of statx(), the key might not
+	 * be set up yet, as the file might not have been opened yet.
 	 */
-	if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+	err = fscrypt_require_key(inode);
+	if (err) {
+		/*
+		 * Key unavailable or couldn't be set up.  This edge case isn't
+		 * worth worrying about; just report that DIO is unsupported.
+		 */
 		return false;
-
-	return true;
+	}
+	return fscrypt_inode_uses_inline_crypto(inode);
 }
 EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 109d07629f81..26d742620897 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -40,8 +40,13 @@ static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 
-	if (!fscrypt_dio_supported(iocb, iter))
-		return false;
+	if (IS_ENCRYPTED(inode)) {
+		if (!fscrypt_dio_supported(inode))
+			return false;
+		if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter),
+				i_blocksize(inode)))
+			return false;
+	}
 	if (fsverity_active(inode))
 		return false;
 	if (ext4_should_journal_data(inode))
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 3c7cdb70fe2e..0759da1919f4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4498,7 +4498,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int rw = iov_iter_rw(iter);
 
-	if (!fscrypt_dio_supported(iocb, iter))
+	if (!fscrypt_dio_supported(inode))
 		return true;
 	if (fsverity_active(inode))
 		return true;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 7d2f1e0f23b1..13598859d5b3 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -768,7 +768,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
 bool fscrypt_mergeable_bio_bh(struct bio *bio,
 			      const struct buffer_head *next_bh);
 
-bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter);
+bool fscrypt_dio_supported(struct inode *inode);
 
 u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);
 
@@ -801,11 +801,8 @@ static inline bool fscrypt_mergeable_bio_bh(struct bio *bio,
 	return true;
 }
 
-static inline bool fscrypt_dio_supported(struct kiocb *iocb,
-					 struct iov_iter *iter)
+static inline bool fscrypt_dio_supported(struct inode *inode)
 {
-	const struct inode *inode = file_inode(iocb->ki_filp);
-
 	return !fscrypt_needs_contents_encryption(inode);
 }
 
-- 
cgit v1.2.3


From 8434ef1d8aafc523443525bbc6237a07d7ec5606 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:47 -0700
Subject: ext4: support STATX_DIOALIGN

Add support for STATX_DIOALIGN to ext4, so that direct I/O alignment
restrictions are exposed to userspace in a generic way.

Acked-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220827065851.135710-5-ebiggers@kernel.org
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/file.c  | 42 ++++++++++++++++++++++++++----------------
 fs/ext4/inode.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9bca5565547b..e6674504ca2a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2979,6 +2979,7 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 extern int  ext4_write_inode(struct inode *, struct writeback_control *);
 extern int  ext4_setattr(struct user_namespace *, struct dentry *,
 			 struct iattr *);
+extern u32  ext4_dio_alignment(struct inode *inode);
 extern int  ext4_getattr(struct user_namespace *, const struct path *,
 			 struct kstat *, u32, unsigned int);
 extern void ext4_evict_inode(struct inode *);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 26d742620897..8bb1c35fd6dd 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,24 +36,34 @@
 #include "acl.h"
 #include "truncate.h"
 
-static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+/*
+ * Returns %true if the given DIO request should be attempted with DIO, or
+ * %false if it should fall back to buffered I/O.
+ *
+ * DIO isn't well specified; when it's unsupported (either due to the request
+ * being misaligned, or due to the file not supporting DIO at all), filesystems
+ * either fall back to buffered I/O or return EINVAL.  For files that don't use
+ * any special features like encryption or verity, ext4 has traditionally
+ * returned EINVAL for misaligned DIO.  iomap_dio_rw() uses this convention too.
+ * In this case, we should attempt the DIO, *not* fall back to buffered I/O.
+ *
+ * In contrast, in cases where DIO is unsupported due to ext4 features, ext4
+ * traditionally falls back to buffered I/O.
+ *
+ * This function implements the traditional ext4 behavior in all these cases.
+ */
+static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
+	u32 dio_align = ext4_dio_alignment(inode);
 
-	if (IS_ENCRYPTED(inode)) {
-		if (!fscrypt_dio_supported(inode))
-			return false;
-		if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter),
-				i_blocksize(inode)))
-			return false;
-	}
-	if (fsverity_active(inode))
+	if (dio_align == 0)
 		return false;
-	if (ext4_should_journal_data(inode))
-		return false;
-	if (ext4_has_inline_data(inode))
-		return false;
-	return true;
+
+	if (dio_align == 1)
+		return true;
+
+	return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align);
 }
 
 static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -68,7 +78,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		inode_lock_shared(inode);
 	}
 
-	if (!ext4_dio_supported(iocb, to)) {
+	if (!ext4_should_use_dio(iocb, to)) {
 		inode_unlock_shared(inode);
 		/*
 		 * Fallback to buffered I/O if the operation being performed on
@@ -516,7 +526,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	/* Fallback to buffered I/O if the inode does not support direct I/O. */
-	if (!ext4_dio_supported(iocb, from)) {
+	if (!ext4_should_use_dio(iocb, from)) {
 		if (ilock_shared)
 			inode_unlock_shared(inode);
 		else
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 601214453c3a..364774230d87 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5550,6 +5550,22 @@ err_out:
 	return error;
 }
 
+u32 ext4_dio_alignment(struct inode *inode)
+{
+	if (fsverity_active(inode))
+		return 0;
+	if (ext4_should_journal_data(inode))
+		return 0;
+	if (ext4_has_inline_data(inode))
+		return 0;
+	if (IS_ENCRYPTED(inode)) {
+		if (!fscrypt_dio_supported(inode))
+			return 0;
+		return i_blocksize(inode);
+	}
+	return 1; /* use the iomap defaults */
+}
+
 int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
@@ -5565,6 +5581,27 @@ int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
 	}
 
+	/*
+	 * Return the DIO alignment restrictions if requested.  We only return
+	 * this information when requested, since on encrypted files it might
+	 * take a fair bit of work to get if the file wasn't opened recently.
+	 */
+	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+		u32 dio_align = ext4_dio_alignment(inode);
+
+		stat->result_mask |= STATX_DIOALIGN;
+		if (dio_align == 1) {
+			struct block_device *bdev = inode->i_sb->s_bdev;
+
+			/* iomap defaults */
+			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+			stat->dio_offset_align = bdev_logical_block_size(bdev);
+		} else {
+			stat->dio_mem_align = dio_align;
+			stat->dio_offset_align = dio_align;
+		}
+	}
+
 	flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
 	if (flags & EXT4_APPEND_FL)
 		stat->attributes |= STATX_ATTR_APPEND;
-- 
cgit v1.2.3


From 2db0487faa2114729d3ae7a423febbba919fee8a Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:48 -0700
Subject: f2fs: move f2fs_force_buffered_io() into file.c

f2fs_force_buffered_io() is only used in file.c, so move it into there.
No behavior change.  This makes it easier to review later patches.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Link: https://lore.kernel.org/r/20220827065851.135710-6-ebiggers@kernel.org
---
 fs/f2fs/f2fs.h | 40 ----------------------------------------
 fs/f2fs/file.c | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 0759da1919f4..aea816a133a8 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -4471,17 +4471,6 @@ static inline void f2fs_i_compr_blocks_update(struct inode *inode,
 	f2fs_mark_inode_dirty_sync(inode, true);
 }
 
-static inline int block_unaligned_IO(struct inode *inode,
-				struct kiocb *iocb, struct iov_iter *iter)
-{
-	unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
-	unsigned int blocksize_mask = (1 << i_blkbits) - 1;
-	loff_t offset = iocb->ki_pos;
-	unsigned long align = offset | iov_iter_alignment(iter);
-
-	return align & blocksize_mask;
-}
-
 static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
 								int flag)
 {
@@ -4492,35 +4481,6 @@ static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi,
 	return sbi->aligned_blksize;
 }
 
-static inline bool f2fs_force_buffered_io(struct inode *inode,
-				struct kiocb *iocb, struct iov_iter *iter)
-{
-	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	int rw = iov_iter_rw(iter);
-
-	if (!fscrypt_dio_supported(inode))
-		return true;
-	if (fsverity_active(inode))
-		return true;
-	if (f2fs_compressed_file(inode))
-		return true;
-
-	/* disallow direct IO if any of devices has unaligned blksize */
-	if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
-		return true;
-
-	if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
-		if (block_unaligned_IO(inode, iocb, iter))
-			return true;
-		if (F2FS_IO_ALIGNED(sbi))
-			return true;
-	}
-	if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
-		return true;
-
-	return false;
-}
-
 static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
 {
 	return fsverity_active(inode) &&
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ce4905a073b3..8a9455bf956f 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -808,6 +808,46 @@ int f2fs_truncate(struct inode *inode)
 	return 0;
 }
 
+static int block_unaligned_IO(struct inode *inode, struct kiocb *iocb,
+			      struct iov_iter *iter)
+{
+	unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
+	unsigned int blocksize_mask = (1 << i_blkbits) - 1;
+	loff_t offset = iocb->ki_pos;
+	unsigned long align = offset | iov_iter_alignment(iter);
+
+	return align & blocksize_mask;
+}
+
+static bool f2fs_force_buffered_io(struct inode *inode,
+				   struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	int rw = iov_iter_rw(iter);
+
+	if (!fscrypt_dio_supported(inode))
+		return true;
+	if (fsverity_active(inode))
+		return true;
+	if (f2fs_compressed_file(inode))
+		return true;
+
+	/* disallow direct IO if any of devices has unaligned blksize */
+	if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
+		return true;
+
+	if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
+		if (block_unaligned_IO(inode, iocb, iter))
+			return true;
+		if (F2FS_IO_ALIGNED(sbi))
+			return true;
+	}
+	if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
+		return true;
+
+	return false;
+}
+
 int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		 struct kstat *stat, u32 request_mask, unsigned int query_flags)
 {
-- 
cgit v1.2.3


From bd3673293175288d71bce48e46e58e3bcc19829e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:49 -0700
Subject: f2fs: simplify f2fs_force_buffered_io()

f2fs only allows direct I/O that is aligned to the filesystem block
size.  Given that fact, simplify f2fs_force_buffered_io() by removing
the redundant call to block_unaligned_IO().

This makes it easier to reuse this code for STATX_DIOALIGN.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Link: https://lore.kernel.org/r/20220827065851.135710-7-ebiggers@kernel.org
---
 fs/f2fs/file.c | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8a9455bf956f..8e11311db210 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -808,22 +808,9 @@ int f2fs_truncate(struct inode *inode)
 	return 0;
 }
 
-static int block_unaligned_IO(struct inode *inode, struct kiocb *iocb,
-			      struct iov_iter *iter)
-{
-	unsigned int i_blkbits = READ_ONCE(inode->i_blkbits);
-	unsigned int blocksize_mask = (1 << i_blkbits) - 1;
-	loff_t offset = iocb->ki_pos;
-	unsigned long align = offset | iov_iter_alignment(iter);
-
-	return align & blocksize_mask;
-}
-
-static bool f2fs_force_buffered_io(struct inode *inode,
-				   struct kiocb *iocb, struct iov_iter *iter)
+static bool f2fs_force_buffered_io(struct inode *inode, int rw)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	int rw = iov_iter_rw(iter);
 
 	if (!fscrypt_dio_supported(inode))
 		return true;
@@ -836,13 +823,9 @@ static bool f2fs_force_buffered_io(struct inode *inode,
 	if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize)
 		return true;
 
-	if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
-		if (block_unaligned_IO(inode, iocb, iter))
-			return true;
-		if (F2FS_IO_ALIGNED(sbi))
-			return true;
-	}
-	if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED))
+	if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
+		return true;
+	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
 		return true;
 
 	return false;
@@ -4222,7 +4205,7 @@ static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb,
 	if (!(iocb->ki_flags & IOCB_DIRECT))
 		return false;
 
-	if (f2fs_force_buffered_io(inode, iocb, iter))
+	if (f2fs_force_buffered_io(inode, iov_iter_rw(iter)))
 		return false;
 
 	/*
-- 
cgit v1.2.3


From c8c02272a9f74bcba4a930a496b2a0c661873c35 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:50 -0700
Subject: f2fs: support STATX_DIOALIGN

Add support for STATX_DIOALIGN to f2fs, so that direct I/O alignment
restrictions are exposed to userspace in a generic way.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Link: https://lore.kernel.org/r/20220827065851.135710-8-ebiggers@kernel.org
---
 fs/f2fs/file.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8e11311db210..791770507328 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -847,6 +847,24 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		stat->btime.tv_nsec = fi->i_crtime.tv_nsec;
 	}
 
+	/*
+	 * Return the DIO alignment restrictions if requested.  We only return
+	 * this information when requested, since on encrypted files it might
+	 * take a fair bit of work to get if the file wasn't opened recently.
+	 *
+	 * f2fs sometimes supports DIO reads but not DIO writes.  STATX_DIOALIGN
+	 * cannot represent that, so in that case we report no DIO support.
+	 */
+	if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
+		unsigned int bsize = i_blocksize(inode);
+
+		stat->result_mask |= STATX_DIOALIGN;
+		if (!f2fs_force_buffered_io(inode, WRITE)) {
+			stat->dio_mem_align = bsize;
+			stat->dio_offset_align = bsize;
+		}
+	}
+
 	flags = fi->i_flags;
 	if (flags & F2FS_COMPR_FL)
 		stat->attributes |= STATX_ATTR_COMPRESSED;
-- 
cgit v1.2.3


From 61a223df421f698c253143014cfd384255b3cf1e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 26 Aug 2022 23:58:51 -0700
Subject: xfs: support STATX_DIOALIGN

Add support for STATX_DIOALIGN to xfs, so that direct I/O alignment
restrictions are exposed to userspace in a generic way.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20220827065851.135710-9-ebiggers@kernel.org
---
 fs/xfs/xfs_iops.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 45518b8c613c..f51c60d7e205 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -604,6 +604,16 @@ xfs_vn_getattr(
 		stat->blksize = BLKDEV_IOSIZE;
 		stat->rdev = inode->i_rdev;
 		break;
+	case S_IFREG:
+		if (request_mask & STATX_DIOALIGN) {
+			struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
+			struct block_device	*bdev = target->bt_bdev;
+
+			stat->result_mask |= STATX_DIOALIGN;
+			stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+			stat->dio_offset_align = bdev_logical_block_size(bdev);
+		}
+		fallthrough;
 	default:
 		stat->blksize = xfs_stat_blksize(ip);
 		stat->rdev = 0;
-- 
cgit v1.2.3


From edeed42402cebd9147b484d6df9964869bdc0681 Mon Sep 17 00:00:00 2001
From: Jason Wang <wangborong@cdjrlc.com>
Date: Thu, 11 Aug 2022 20:14:50 +0800
Subject: MIPS: Fix comment typo

The double `it' is duplicated in the comment, remove one.

Signed-off-by: Jason Wang <wangborong@cdjrlc.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/math-emu/cp1emu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index 587cf1d115e8..265bc57819df 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -1032,7 +1032,7 @@ static int cop1Emulate(struct pt_regs *xcp, struct mips_fpu_struct *ctx,
 	 */
 	if (dec_insn.micro_mips_mode) {
 		/*
-		 * If next instruction is a 16-bit instruction, then it
+		 * If next instruction is a 16-bit instruction, then
 		 * it cannot be a FPU instruction. This could happen
 		 * since we can be called for non-FPU instructions.
 		 */
-- 
cgit v1.2.3


From 877336c1856ea4277994ba7cdd8aede416e75a4e Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 22:59:43 +0200
Subject: MIPS: move from strlcpy with unused retval to strscpy

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/prom.c          | 6 +++---
 arch/mips/kernel/relocate.c      | 2 +-
 arch/mips/pic32/pic32mzda/init.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c
index 7db6ff9aed7d..f88ce78e13e3 100644
--- a/arch/mips/kernel/prom.c
+++ b/arch/mips/kernel/prom.c
@@ -26,7 +26,7 @@ __init void mips_set_machine_name(const char *name)
 	if (name == NULL)
 		return;
 
-	strlcpy(mips_machine_name, name, sizeof(mips_machine_name));
+	strscpy(mips_machine_name, name, sizeof(mips_machine_name));
 	pr_info("MIPS: machine is %s\n", mips_get_machine_name());
 }
 
@@ -52,9 +52,9 @@ int __init __dt_register_buses(const char *bus0, const char *bus1)
 	if (!of_have_populated_dt())
 		panic("device tree not present");
 
-	strlcpy(of_ids[0].compatible, bus0, sizeof(of_ids[0].compatible));
+	strscpy(of_ids[0].compatible, bus0, sizeof(of_ids[0].compatible));
 	if (bus1) {
-		strlcpy(of_ids[1].compatible, bus1,
+		strscpy(of_ids[1].compatible, bus1,
 			sizeof(of_ids[1].compatible));
 	}
 
diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c
index 56b51de2dc51..58fc8d089402 100644
--- a/arch/mips/kernel/relocate.c
+++ b/arch/mips/kernel/relocate.c
@@ -340,7 +340,7 @@ void *__init relocate_kernel(void)
 	early_init_dt_scan(fdt);
 	if (boot_command_line[0]) {
 		/* Boot command line was passed in device tree */
-		strlcpy(arcs_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+		strscpy(arcs_cmdline, boot_command_line, COMMAND_LINE_SIZE);
 	}
 #endif /* CONFIG_USE_OF */
 
diff --git a/arch/mips/pic32/pic32mzda/init.c b/arch/mips/pic32/pic32mzda/init.c
index d9c8c4e46aff..08c46cf122d7 100644
--- a/arch/mips/pic32/pic32mzda/init.c
+++ b/arch/mips/pic32/pic32mzda/init.c
@@ -44,7 +44,7 @@ void __init plat_mem_setup(void)
 	pr_info(" builtin_cmdline  : %s\n", CONFIG_CMDLINE);
 #endif
 	if (dtb != __dtb_start)
-		strlcpy(arcs_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+		strscpy(arcs_cmdline, boot_command_line, COMMAND_LINE_SIZE);
 
 #ifdef CONFIG_EARLY_PRINTK
 	fw_init_early_console(-1);
-- 
cgit v1.2.3


From a343ca9bd728ed070b209d1ec40fe38dad7e516c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 22 Aug 2022 19:54:41 -0700
Subject: MIPS: ath25: clean up non-kernel-doc comment warning

Don't use "/**" kernel-doc notation for comments that are not in
kernel-doc format.

Eliminates one warning:

arch/mips/pci/pci-ar2315.c:6: warning: This comment starts with
  '/**', but isn't a kernel-doc comment. Refer
  Documentation/doc-guide/kernel-doc.rst
    * Both AR2315 and AR2316 chips have PCI interface unit, which supports DMA

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Aditya Srivastava <yashsri421@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: linux-mips@vger.kernel.org
Cc: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Acked-by: Sergey Ryazanov <ryazanov.s.a@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/pci/pci-ar2315.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/pci/pci-ar2315.c b/arch/mips/pci/pci-ar2315.c
index 30e0922f4cea..e17d862cfa4c 100644
--- a/arch/mips/pci/pci-ar2315.c
+++ b/arch/mips/pci/pci-ar2315.c
@@ -2,7 +2,7 @@
 /*
  */
 
-/**
+/*
  * Both AR2315 and AR2316 chips have PCI interface unit, which supports DMA
  * and interrupt. PCI interface supports MMIO access method, but does not
  * seem to support I/O ports.
-- 
cgit v1.2.3


From 050a3f1db5b968ce838724f1c92787483770335d Mon Sep 17 00:00:00 2001
From: Shaomin Deng <dengshaomin@cdjrlc.com>
Date: Sat, 27 Aug 2022 11:46:40 -0400
Subject: MIPS: Fix comments typo

Remove the repeated word "as" in comments.

Signed-off-by: Shaomin Deng <dengshaomin@cdjrlc.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/cavium-octeon/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c
index e7f994393ae8..a71727f7a608 100644
--- a/arch/mips/cavium-octeon/setup.c
+++ b/arch/mips/cavium-octeon/setup.c
@@ -530,7 +530,7 @@ void octeon_user_io_init(void)
 	/* Get the current settings for CP0_CVMMEMCTL_REG */
 	cvmmemctl.u64 = read_c0_cvmmemctl();
 	/* R/W If set, marked write-buffer entries time out the same
-	 * as as other entries; if clear, marked write-buffer entries
+	 * as other entries; if clear, marked write-buffer entries
 	 * use the maximum timeout. */
 	cvmmemctl.s.dismarkwblongto = 1;
 	/* R/W If set, a merged store does not clear the write-buffer
-- 
cgit v1.2.3


From 0153682e9e6accd1796dd6bbe9df792d03d88b1f Mon Sep 17 00:00:00 2001
From: Aleksander Jan Bajkowski <olek2@wp.pl>
Date: Mon, 29 Aug 2022 20:37:17 +0200
Subject: MIPS: dts: lantiq: rename dts files with soc name being the prefix

Rename lantiq dts files with soc name being the prefix, so that the
board dts file can be located easily by soc name, and we also gain
the consistency of naming.

This patch is a preparation for the addition of dts for newer SoCs
(Lantiq ARX100, VRX200 and xRX330).

Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/boot/dts/lantiq/Makefile             |   2 +-
 arch/mips/boot/dts/lantiq/danube_easy50712.dts | 115 +++++++++++++++++++++++++
 arch/mips/boot/dts/lantiq/easy50712.dts        | 115 -------------------------
 3 files changed, 116 insertions(+), 116 deletions(-)
 create mode 100644 arch/mips/boot/dts/lantiq/danube_easy50712.dts
 delete mode 100644 arch/mips/boot/dts/lantiq/easy50712.dts

diff --git a/arch/mips/boot/dts/lantiq/Makefile b/arch/mips/boot/dts/lantiq/Makefile
index f5dfc06242b9..ae6e3e21ebeb 100644
--- a/arch/mips/boot/dts/lantiq/Makefile
+++ b/arch/mips/boot/dts/lantiq/Makefile
@@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-dtb-$(CONFIG_DT_EASY50712)	+= easy50712.dtb
+dtb-$(CONFIG_DT_EASY50712)	+= danube_easy50712.dtb
 
 obj-$(CONFIG_BUILTIN_DTB)	+= $(addsuffix .o, $(dtb-y))
diff --git a/arch/mips/boot/dts/lantiq/danube_easy50712.dts b/arch/mips/boot/dts/lantiq/danube_easy50712.dts
new file mode 100644
index 000000000000..1ce20b7d05cb
--- /dev/null
+++ b/arch/mips/boot/dts/lantiq/danube_easy50712.dts
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/dts-v1/;
+
+/include/ "danube.dtsi"
+
+/ {
+	chosen {
+		bootargs = "console=ttyLTQ0,115200 init=/etc/preinit";
+	};
+
+	memory@0 {
+		device_type = "memory";
+		reg = <0x0 0x2000000>;
+	};
+
+	fpi@10000000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		localbus@0 {
+			#address-cells = <2>;
+			#size-cells = <1>;
+			ranges = <0 0 0x0 0x3ffffff /* addrsel0 */
+				1 0 0x4000000 0x4000010>; /* addsel1 */
+			compatible = "lantiq,localbus", "simple-bus";
+
+			nor-boot@0 {
+				compatible = "lantiq,nor";
+				bank-width = <2>;
+				reg = <0 0x0 0x2000000>;
+				#address-cells = <1>;
+				#size-cells = <1>;
+
+				partition@0 {
+					label = "uboot";
+					reg = <0x00000 0x10000>; /* 64 KB */
+				};
+
+				partition@10000 {
+					label = "uboot_env";
+					reg = <0x10000 0x10000>; /* 64 KB */
+				};
+
+				partition@20000 {
+					label = "linux";
+					reg = <0x20000 0x3d0000>;
+				};
+
+				partition@400000 {
+					label = "rootfs";
+					reg = <0x400000 0x400000>;
+				};
+			};
+		};
+
+		gpio: pinmux@e100b10 {
+			compatible = "lantiq,danube-pinctrl";
+			pinctrl-names = "default";
+			pinctrl-0 = <&state_default>;
+
+			#gpio-cells = <2>;
+			gpio-controller;
+			reg = <0xe100b10 0xa0>;
+
+			state_default: pinmux {
+				stp {
+					lantiq,groups = "stp";
+					lantiq,function = "stp";
+				};
+				exin {
+					lantiq,groups = "exin1";
+					lantiq,function = "exin";
+				};
+				pci {
+					lantiq,groups = "gnt1";
+					lantiq,function = "pci";
+				};
+				conf_out {
+					lantiq,pins = "io4", "io5", "io6"; /* stp */
+					lantiq,open-drain;
+					lantiq,pull = <0>;
+				};
+			};
+		};
+
+		etop@e180000 {
+			compatible = "lantiq,etop-xway";
+			reg = <0xe180000 0x40000>;
+			interrupt-parent = <&icu0>;
+			interrupts = <73 78>;
+			phy-mode = "rmii";
+			mac-address = [ 00 11 22 33 44 55 ];
+		};
+
+		stp0: stp@e100bb0 {
+			#gpio-cells = <2>;
+			compatible = "lantiq,gpio-stp-xway";
+			gpio-controller;
+			reg = <0xe100bb0 0x40>;
+
+			lantiq,shadow = <0xfff>;
+			lantiq,groups = <0x3>;
+		};
+
+		pci@e105400 {
+			lantiq,bus-clock = <33333333>;
+			interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
+			interrupt-map = <
+				0x7000 0 0 1 &icu0 29 1 // slot 14, irq 29
+			>;
+			gpios-reset = <&gpio 21 0>;
+			req-mask = <0x1>;		/* GNT1 */
+		};
+
+	};
+};
diff --git a/arch/mips/boot/dts/lantiq/easy50712.dts b/arch/mips/boot/dts/lantiq/easy50712.dts
deleted file mode 100644
index 1ce20b7d05cb..000000000000
--- a/arch/mips/boot/dts/lantiq/easy50712.dts
+++ /dev/null
@@ -1,115 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/dts-v1/;
-
-/include/ "danube.dtsi"
-
-/ {
-	chosen {
-		bootargs = "console=ttyLTQ0,115200 init=/etc/preinit";
-	};
-
-	memory@0 {
-		device_type = "memory";
-		reg = <0x0 0x2000000>;
-	};
-
-	fpi@10000000 {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		localbus@0 {
-			#address-cells = <2>;
-			#size-cells = <1>;
-			ranges = <0 0 0x0 0x3ffffff /* addrsel0 */
-				1 0 0x4000000 0x4000010>; /* addsel1 */
-			compatible = "lantiq,localbus", "simple-bus";
-
-			nor-boot@0 {
-				compatible = "lantiq,nor";
-				bank-width = <2>;
-				reg = <0 0x0 0x2000000>;
-				#address-cells = <1>;
-				#size-cells = <1>;
-
-				partition@0 {
-					label = "uboot";
-					reg = <0x00000 0x10000>; /* 64 KB */
-				};
-
-				partition@10000 {
-					label = "uboot_env";
-					reg = <0x10000 0x10000>; /* 64 KB */
-				};
-
-				partition@20000 {
-					label = "linux";
-					reg = <0x20000 0x3d0000>;
-				};
-
-				partition@400000 {
-					label = "rootfs";
-					reg = <0x400000 0x400000>;
-				};
-			};
-		};
-
-		gpio: pinmux@e100b10 {
-			compatible = "lantiq,danube-pinctrl";
-			pinctrl-names = "default";
-			pinctrl-0 = <&state_default>;
-
-			#gpio-cells = <2>;
-			gpio-controller;
-			reg = <0xe100b10 0xa0>;
-
-			state_default: pinmux {
-				stp {
-					lantiq,groups = "stp";
-					lantiq,function = "stp";
-				};
-				exin {
-					lantiq,groups = "exin1";
-					lantiq,function = "exin";
-				};
-				pci {
-					lantiq,groups = "gnt1";
-					lantiq,function = "pci";
-				};
-				conf_out {
-					lantiq,pins = "io4", "io5", "io6"; /* stp */
-					lantiq,open-drain;
-					lantiq,pull = <0>;
-				};
-			};
-		};
-
-		etop@e180000 {
-			compatible = "lantiq,etop-xway";
-			reg = <0xe180000 0x40000>;
-			interrupt-parent = <&icu0>;
-			interrupts = <73 78>;
-			phy-mode = "rmii";
-			mac-address = [ 00 11 22 33 44 55 ];
-		};
-
-		stp0: stp@e100bb0 {
-			#gpio-cells = <2>;
-			compatible = "lantiq,gpio-stp-xway";
-			gpio-controller;
-			reg = <0xe100bb0 0x40>;
-
-			lantiq,shadow = <0xfff>;
-			lantiq,groups = <0x3>;
-		};
-
-		pci@e105400 {
-			lantiq,bus-clock = <33333333>;
-			interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
-			interrupt-map = <
-				0x7000 0 0 1 &icu0 29 1 // slot 14, irq 29
-			>;
-			gpios-reset = <&gpio 21 0>;
-			req-mask = <0x1>;		/* GNT1 */
-		};
-
-	};
-};
-- 
cgit v1.2.3


From c5783af354688b24abd359f7086c282ec74de993 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 7 Sep 2022 16:40:44 -0700
Subject: sh: machvec: Use char[] for section boundaries

As done for other sections, define the extern as a character array,
which relaxes many of the compiler-time object size checks, which would
otherwise assume it's a single long. Solves the following build error:

arch/sh/kernel/machvec.c: error: array subscript 'struct sh_machine_vector[0]' is partly outside array bounds of 'long int[1]' [-Werror=array-bounds]:  => 105:33

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: linux-sh@vger.kernel.org
Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/lkml/alpine.DEB.2.22.394.2209050944290.964530@ramsan.of.borg/
Fixes: 9655ad03af2d ("sh: Fixup machvec support.")
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Acked-by: Rich Felker <dalias@libc.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/sh/include/asm/sections.h |  2 +-
 arch/sh/kernel/machvec.c       | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/sh/include/asm/sections.h b/arch/sh/include/asm/sections.h
index 8edb824049b9..0cb0ca149ac3 100644
--- a/arch/sh/include/asm/sections.h
+++ b/arch/sh/include/asm/sections.h
@@ -4,7 +4,7 @@
 
 #include <asm-generic/sections.h>
 
-extern long __machvec_start, __machvec_end;
+extern char __machvec_start[], __machvec_end[];
 extern char __uncached_start, __uncached_end;
 extern char __start_eh_frame[], __stop_eh_frame[];
 
diff --git a/arch/sh/kernel/machvec.c b/arch/sh/kernel/machvec.c
index d606679a211e..57efaf5b82ae 100644
--- a/arch/sh/kernel/machvec.c
+++ b/arch/sh/kernel/machvec.c
@@ -20,8 +20,8 @@
 #define MV_NAME_SIZE 32
 
 #define for_each_mv(mv) \
-	for ((mv) = (struct sh_machine_vector *)&__machvec_start; \
-	     (mv) && (unsigned long)(mv) < (unsigned long)&__machvec_end; \
+	for ((mv) = (struct sh_machine_vector *)__machvec_start; \
+	     (mv) && (unsigned long)(mv) < (unsigned long)__machvec_end; \
 	     (mv)++)
 
 static struct sh_machine_vector * __init get_mv_byname(const char *name)
@@ -87,8 +87,8 @@ void __init sh_mv_setup(void)
 	if (!machvec_selected) {
 		unsigned long machvec_size;
 
-		machvec_size = ((unsigned long)&__machvec_end -
-				(unsigned long)&__machvec_start);
+		machvec_size = ((unsigned long)__machvec_end -
+				(unsigned long)__machvec_start);
 
 		/*
 		 * Sanity check for machvec section alignment. Ensure
@@ -102,7 +102,7 @@ void __init sh_mv_setup(void)
 		 * vector (usually the only one) from .machvec.init.
 		 */
 		if (machvec_size >= sizeof(struct sh_machine_vector))
-			sh_mv = *(struct sh_machine_vector *)&__machvec_start;
+			sh_mv = *(struct sh_machine_vector *)__machvec_start;
 	}
 
 	pr_notice("Booting machvec: %s\n", get_system_type());
-- 
cgit v1.2.3


From 5da39ac5d648cdbfdfa8bea0e0cde279ded5c7c2 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Fri, 9 Sep 2022 13:31:10 +0100
Subject: clk: microchip: mpfs: fix clk_cfg array bounds violation

There is an array bounds violation present during clock registration,
triggered by current code by only specific toolchains. This seems to
fail gracefully in v6.0-rc1, using a toolchain build from the riscv-
gnu-toolchain repo and with clang-15, and life carries on. While
converting the driver to use standard clock structs/ops, kernel panics
were seen during boot when built with clang-15:

[    0.581754] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000b1
[    0.591520] Oops [#1]
[    0.594045] Modules linked in:
[    0.597435] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc1-00011-g8e1459cf4eca #1
[    0.606188] Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
[    0.613012] epc : __clk_register+0x4a6/0x85c
[    0.617759]  ra : __clk_register+0x49e/0x85c
[    0.622489] epc : ffffffff803faf7c ra : ffffffff803faf74 sp : ffffffc80400b720
[    0.630466]  gp : ffffffff810e93f8 tp : ffffffe77fe60000 t0 : ffffffe77ffb3800
[    0.638443]  t1 : 000000000000000a t2 : ffffffffffffffff s0 : ffffffc80400b7c0
[    0.646420]  s1 : 0000000000000001 a0 : 0000000000000001 a1 : 0000000000000000
[    0.654396]  a2 : 0000000000000001 a3 : 0000000000000000 a4 : 0000000000000000
[    0.662373]  a5 : ffffffff803a5810 a6 : 0000000200000022 a7 : 0000000000000006
[    0.670350]  s2 : ffffffff81099d48 s3 : ffffffff80d6e28e s4 : 0000000000000028
[    0.678327]  s5 : ffffffff810ed3c8 s6 : ffffffff810ed3d0 s7 : ffffffe77ffbc100
[    0.686304]  s8 : ffffffe77ffb1540 s9 : ffffffe77ffb1540 s10: 0000000000000008
[    0.694281]  s11: 0000000000000000 t3 : 00000000000000c6 t4 : 0000000000000007
[    0.702258]  t5 : ffffffff810c78c0 t6 : ffffffe77ff88cd0
[    0.708125] status: 0000000200000120 badaddr: 00000000000000b1 cause: 000000000000000d
[    0.716869] [<ffffffff803fb892>] devm_clk_hw_register+0x62/0xaa
[    0.723420] [<ffffffff80403412>] mpfs_clk_probe+0x1e0/0x244

In v6.0-rc1 and later, this issue is visible without the follow on
patches doing the conversion using toolchains provided by our Yocto
meta layer too.

It fails on "clk_periph_timer" - which uses a different parent, that it
tries to find using the macro:
\#define PARENT_CLK(PARENT) (&mpfs_cfg_clks[CLK_##PARENT].cfg.hw)

If parent is RTCREF, so the macro becomes: &mpfs_cfg_clks[33].cfg.hw
which is well beyond the end of the array. Amazingly, builds with GCC
11.1 see no problem here, booting correctly and hooking the parent up
etc. Builds with clang-15 do not, with the above panic.

Change the macro to use specific offsets depending on the parent rather
than the dt-binding's clock IDs.

Fixes: 1c6a7ea32b8c ("clk: microchip: mpfs: add RTCREF clock control")
CC: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Link: https://lore.kernel.org/r/20220909123123.2699583-2-conor.dooley@microchip.com
---
 drivers/clk/microchip/clk-mpfs.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/microchip/clk-mpfs.c b/drivers/clk/microchip/clk-mpfs.c
index 070c3b896559..f0f9c9a1cc48 100644
--- a/drivers/clk/microchip/clk-mpfs.c
+++ b/drivers/clk/microchip/clk-mpfs.c
@@ -239,6 +239,11 @@ static const struct clk_ops mpfs_clk_cfg_ops = {
 	.hw.init = CLK_HW_INIT(_name, _parent, &mpfs_clk_cfg_ops, 0),			\
 }
 
+#define CLK_CPU_OFFSET		0u
+#define CLK_AXI_OFFSET		1u
+#define CLK_AHB_OFFSET		2u
+#define CLK_RTCREF_OFFSET	3u
+
 static struct mpfs_cfg_hw_clock mpfs_cfg_clks[] = {
 	CLK_CFG(CLK_CPU, "clk_cpu", "clk_msspll", 0, 2, mpfs_div_cpu_axi_table, 0,
 		REG_CLOCK_CONFIG_CR),
@@ -362,7 +367,7 @@ static const struct clk_ops mpfs_periph_clk_ops = {
 				  _flags),					\
 }
 
-#define PARENT_CLK(PARENT) (&mpfs_cfg_clks[CLK_##PARENT].hw)
+#define PARENT_CLK(PARENT) (&mpfs_cfg_clks[CLK_##PARENT##_OFFSET].hw)
 
 /*
  * Critical clocks:
-- 
cgit v1.2.3


From 05d27090b6dc88bce71a608d1271536e582b73d1 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Fri, 9 Sep 2022 13:31:11 +0100
Subject: clk: microchip: mpfs: make the rtc's ahb clock critical

The onboard RTC's AHB bus clock must be kept running as the RTC will
stop & lose track of time if the AHB interface clock is disabled.

Fixes: 635e5e73370e ("clk: microchip: Add driver for Microchip PolarFire SoC")
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Signed-off-by: Claudiu Beznea <claudiu.beznea@microchip.com>
Link: https://lore.kernel.org/r/20220909123123.2699583-3-conor.dooley@microchip.com
---
 drivers/clk/microchip/clk-mpfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/clk/microchip/clk-mpfs.c b/drivers/clk/microchip/clk-mpfs.c
index f0f9c9a1cc48..b6b89413e090 100644
--- a/drivers/clk/microchip/clk-mpfs.c
+++ b/drivers/clk/microchip/clk-mpfs.c
@@ -375,6 +375,8 @@ static const struct clk_ops mpfs_periph_clk_ops = {
  *   trap handler
  * - CLK_MMUART0: reserved by the hss
  * - CLK_DDRC: provides clock to the ddr subsystem
+ * - CLK_RTC: the onboard RTC's AHB bus clock must be kept running as the rtc will stop
+ *   if the AHB interface clock is disabled
  * - CLK_FICx: these provide the processor side clocks to the "FIC" (Fabric InterConnect)
  *   clock domain crossers which provide the interface to the FPGA fabric. Disabling them
  *   causes the FPGA fabric to go into reset.
@@ -399,7 +401,7 @@ static struct mpfs_periph_hw_clock mpfs_periph_clks[] = {
 	CLK_PERIPH(CLK_CAN0, "clk_periph_can0", PARENT_CLK(AHB), 14, 0),
 	CLK_PERIPH(CLK_CAN1, "clk_periph_can1", PARENT_CLK(AHB), 15, 0),
 	CLK_PERIPH(CLK_USB, "clk_periph_usb", PARENT_CLK(AHB), 16, 0),
-	CLK_PERIPH(CLK_RTC, "clk_periph_rtc", PARENT_CLK(AHB), 18, 0),
+	CLK_PERIPH(CLK_RTC, "clk_periph_rtc", PARENT_CLK(AHB), 18, CLK_IS_CRITICAL),
 	CLK_PERIPH(CLK_QSPI, "clk_periph_qspi", PARENT_CLK(AHB), 19, 0),
 	CLK_PERIPH(CLK_GPIO0, "clk_periph_gpio0", PARENT_CLK(AHB), 20, 0),
 	CLK_PERIPH(CLK_GPIO1, "clk_periph_gpio1", PARENT_CLK(AHB), 21, 0),
-- 
cgit v1.2.3


From 1e7d8bcbe37d3c63babe628443f13f77970dd06b Mon Sep 17 00:00:00 2001
From: Nathan Lynch <nathanl@linux.ibm.com>
Date: Thu, 8 Sep 2022 17:02:22 -0500
Subject: lockdown: ratelimit denial messages

User space can flood the log with lockdown denial messages:

[  662.555584] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7
[  662.563237] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7
[  662.571134] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7
[  662.578668] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7
[  662.586021] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7
[  662.593398] Lockdown: bash: debugfs access is restricted; see man kernel_lockdown.7

Ratelimiting these shouldn't meaningfully degrade the quality of the
information logged.

Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/lockdown/lockdown.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/lockdown/lockdown.c b/security/lockdown/lockdown.c
index 87cbdc64d272..a79b985e917e 100644
--- a/security/lockdown/lockdown.c
+++ b/security/lockdown/lockdown.c
@@ -63,7 +63,7 @@ static int lockdown_is_locked_down(enum lockdown_reason what)
 
 	if (kernel_locked_down >= what) {
 		if (lockdown_reasons[what])
-			pr_notice("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n",
+			pr_notice_ratelimited("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n",
 				  current->comm, lockdown_reasons[what]);
 		return -EPERM;
 	}
-- 
cgit v1.2.3


From 09b71adab09570fcc715ff31a6835d0e445e4a54 Mon Sep 17 00:00:00 2001
From: Xu Panda <xu.panda@zte.com.cn>
Date: Mon, 12 Sep 2022 09:17:19 +0000
Subject: selinux: remove the unneeded result variable

Return the value avc_has_perm() directly instead of storing it in
another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Xu Panda <xu.panda@zte.com.cn>
[PM: subject line tweak]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 security/selinux/hooks.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 79573504783b..6e87abbae47c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -5986,7 +5986,6 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
 	struct ipc_security_struct *isec;
 	struct common_audit_data ad;
 	u32 sid = current_sid();
-	int rc;
 
 	isec = selinux_ipc(msq);
 	ipc_init_security(isec, SECCLASS_MSGQ);
@@ -5994,10 +5993,9 @@ static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
 	ad.type = LSM_AUDIT_DATA_IPC;
 	ad.u.ipc_id = msq->key;
 
-	rc = avc_has_perm(&selinux_state,
-			  sid, isec->sid, SECCLASS_MSGQ,
-			  MSGQ__CREATE, &ad);
-	return rc;
+	return avc_has_perm(&selinux_state,
+			    sid, isec->sid, SECCLASS_MSGQ,
+			    MSGQ__CREATE, &ad);
 }
 
 static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
@@ -6125,7 +6123,6 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
 	struct ipc_security_struct *isec;
 	struct common_audit_data ad;
 	u32 sid = current_sid();
-	int rc;
 
 	isec = selinux_ipc(shp);
 	ipc_init_security(isec, SECCLASS_SHM);
@@ -6133,10 +6130,9 @@ static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
 	ad.type = LSM_AUDIT_DATA_IPC;
 	ad.u.ipc_id = shp->key;
 
-	rc = avc_has_perm(&selinux_state,
-			  sid, isec->sid, SECCLASS_SHM,
-			  SHM__CREATE, &ad);
-	return rc;
+	return avc_has_perm(&selinux_state,
+			    sid, isec->sid, SECCLASS_SHM,
+			    SHM__CREATE, &ad);
 }
 
 static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
@@ -6210,7 +6206,6 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
 	struct ipc_security_struct *isec;
 	struct common_audit_data ad;
 	u32 sid = current_sid();
-	int rc;
 
 	isec = selinux_ipc(sma);
 	ipc_init_security(isec, SECCLASS_SEM);
@@ -6218,10 +6213,9 @@ static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
 	ad.type = LSM_AUDIT_DATA_IPC;
 	ad.u.ipc_id = sma->key;
 
-	rc = avc_has_perm(&selinux_state,
-			  sid, isec->sid, SECCLASS_SEM,
-			  SEM__CREATE, &ad);
-	return rc;
+	return avc_has_perm(&selinux_state,
+			    sid, isec->sid, SECCLASS_SEM,
+			    SEM__CREATE, &ad);
 }
 
 static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
-- 
cgit v1.2.3


From 06c1c49d0cd1d6cec5b78963109ba728e49e0063 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 13 Sep 2022 10:28:56 -0700
Subject: fortify: Adjust KUnit test for modular build

A much better "unknown size" string pointer is available directly from
struct test, so use that instead of a global that isn't shared with
modules.

Reported-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/lkml/YyCOHOchVuE/E7vS@dev-arch.thelio-3990X
Fixes: 875bfd5276f3 ("fortify: Add KUnit test for FORTIFY_SOURCE internals")
Cc: linux-hardening@vger.kernel.org
Build-tested-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: David Gow <davidgow@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 lib/fortify_kunit.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c
index 99bc0ea60d27..409af07f340a 100644
--- a/lib/fortify_kunit.c
+++ b/lib/fortify_kunit.c
@@ -17,7 +17,6 @@
 
 #include <kunit/test.h>
 #include <linux/string.h>
-#include <linux/init.h>
 
 static const char array_of_10[] = "this is 10";
 static const char *ptr_of_11 = "this is 11!";
@@ -31,7 +30,7 @@ static void known_sizes_test(struct kunit *test)
 
 	KUNIT_EXPECT_EQ(test, __compiletime_strlen(array_unknown), SIZE_MAX);
 	/* Externally defined and dynamically sized string pointer: */
-	KUNIT_EXPECT_EQ(test, __compiletime_strlen(saved_command_line), SIZE_MAX);
+	KUNIT_EXPECT_EQ(test, __compiletime_strlen(test->name), SIZE_MAX);
 }
 
 /* This is volatile so the optimizer can't perform DCE below. */
-- 
cgit v1.2.3


From f74ca25d6d6629ffd4fd80a1a73037253b57d06b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 12 Sep 2022 01:21:42 +0900
Subject: Bluetooth: avoid hci_dev_test_and_set_flag() in mgmt_init_hdev()

syzbot is again reporting attempt to cancel uninitialized work
at mgmt_index_removed() [1], for setting of HCI_MGMT flag from
mgmt_init_hdev() from hci_mgmt_cmd() from hci_sock_sendmsg() can
race with testing of HCI_MGMT flag from mgmt_index_removed() from
hci_sock_bind() due to lack of serialization via hci_dev_lock().

Since mgmt_init_hdev() is called with mgmt_chan_list_lock held, we can
safely split hci_dev_test_and_set_flag() into hci_dev_test_flag() and
hci_dev_set_flag(). Thus, in order to close this race, set HCI_MGMT flag
after INIT_DELAYED_WORK() completed.

This is a local fix based on mgmt_chan_list_lock. Lack of serialization
via hci_dev_lock() might be causing different race conditions somewhere
else. But a global fix based on hci_dev_lock() should deserve a future
patch.

Link: https://syzkaller.appspot.com/bug?extid=844c7bf1b1aa4119c5de
Reported-by: syzbot+844c7bf1b1aa4119c5de@syzkaller.appspotmail.com
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 3f2893d3c142986a ("Bluetooth: don't try to cancel uninitialized works at mgmt_index_removed()")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 4c421ebac669..b9b64030a7b0 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -1111,7 +1111,7 @@ static void mesh_send_done(struct work_struct *work)
 
 static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
 {
-	if (hci_dev_test_and_set_flag(hdev, HCI_MGMT))
+	if (hci_dev_test_flag(hdev, HCI_MGMT))
 		return;
 
 	BT_INFO("MGMT ver %d.%d", MGMT_VERSION, MGMT_REVISION);
@@ -1127,6 +1127,8 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
 	 * it
 	 */
 	hci_dev_clear_flag(hdev, HCI_BONDABLE);
+
+	hci_dev_set_flag(hdev, HCI_MGMT);
 }
 
 static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
-- 
cgit v1.2.3


From f0ad26ee822b197f2421462df9c358a5687fddfd Mon Sep 17 00:00:00 2001
From: Sean Wang <sean.wang@mediatek.com>
Date: Wed, 31 Aug 2022 12:53:28 +0800
Subject: Bluetooth: btusb: Add a new PID/VID 13d3/3583 for MT7921

Add VID 13D3 & PID 3583 for MediaTek MT7921 USB Bluetooth chip.

The information in /sys/kernel/debug/usb/devices about the Bluetooth
device is listed as the below.

T:  Bus=03 Lev=02 Prnt=02 Port=02 Cnt=01 Dev#=  3 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=13d3 ProdID=3583 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS= 512 Ivl=125us

Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index c3daba17de7f..2167398e819c 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -488,6 +488,9 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x13d3, 0x3578), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
+	{ USB_DEVICE(0x13d3, 0x3583), .driver_info = BTUSB_MEDIATEK |
+						     BTUSB_WIDEBAND_SPEECH |
+						     BTUSB_VALID_LE_STATES },
 	{ USB_DEVICE(0x0489, 0xe0cd), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
-- 
cgit v1.2.3


From be55622ce673f9692cc15d26d77a050cda42a3d3 Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Fri, 9 Sep 2022 21:00:30 +0100
Subject: Bluetooth: btusb: Add a new VID/PID 0e8d/0608 for MT7921

Add a new PID/VID 0e8d/0608 for MT7921K chip found on AMD RZ608 module.

From /sys/kernel/debug/usb/devices:
T:  Bus=01 Lev=02 Prnt=02 Port=01 Cnt=01 Dev#=  3 Spd=480  MxCh= 0
D:  Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0e8d ProdID=0608 Rev= 1.00
S:  Manufacturer=MediaTek Inc.
S:  Product=Wireless_Device
S:  SerialNumber=000000000
C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA
A:  FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=125us
E:  Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms
I:  If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  63 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  63 Ivl=1ms
I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us
I:  If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none)
E:  Ad=8a(I) Atr=03(Int.) MxPS=  64 Ivl=125us
E:  Ad=0a(O) Atr=03(Int.) MxPS=  64 Ivl=125us

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btusb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 2167398e819c..271963805a38 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -494,6 +494,9 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x0489, 0xe0cd), .driver_info = BTUSB_MEDIATEK |
 						     BTUSB_WIDEBAND_SPEECH |
 						     BTUSB_VALID_LE_STATES },
+	{ USB_DEVICE(0x0e8d, 0x0608), .driver_info = BTUSB_MEDIATEK |
+						     BTUSB_WIDEBAND_SPEECH |
+						     BTUSB_VALID_LE_STATES },
 
 	/* MediaTek MT7922A Bluetooth devices */
 	{ USB_DEVICE(0x0489, 0xe0d8), .driver_info = BTUSB_MEDIATEK |
-- 
cgit v1.2.3


From 9afc675edeeb34d281675f1d5a217d27c5a1a3db Mon Sep 17 00:00:00 2001
From: Zhengping Jiang <jiangzp@google.com>
Date: Tue, 13 Sep 2022 16:37:15 -0700
Subject: Bluetooth: hci_sync: allow advertise when scan without RPA

Address resolution will be paused during active scan to allow any
advertising reports reach the host. If LL privacy is enabled,
advertising will rely on the controller to generate new RPA.

If host is not using RPA, there is no need to stop advertising during
active scan because there is no need to generate RPA in the controller.

Signed-off-by: Zhengping Jiang <jiangzp@google.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 41b6d19c70b0..422f7c6911d9 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -5351,7 +5351,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval)
 	/* Pause advertising since active scanning disables address resolution
 	 * which advertising depend on in order to generate its RPAs.
 	 */
-	if (use_ll_privacy(hdev)) {
+	if (use_ll_privacy(hdev) && hci_dev_test_flag(hdev, HCI_PRIVACY)) {
 		err = hci_pause_advertising_sync(hdev);
 		if (err) {
 			bt_dev_err(hdev, "pause advertising failed: %d", err);
-- 
cgit v1.2.3


From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001
From: Dave Marchevsky <davemarchevsky@fb.com>
Date: Mon, 12 Sep 2022 08:45:44 -0700
Subject: bpf: Add verifier check for BPF_PTR_POISON retval and arg

BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing
referenced kptr in map") to denote a bpf_func_proto btf_id which the
verifier will replace with a dynamically-determined btf_id at verification
time.

This patch adds verifier 'poison' functionality to BPF_PTR_POISON in
order to prepare for expanded use of the value to poison ret- and
arg-btf_id in ongoing work, namely rbtree and linked list patchsets
[0, 1]. Specifically, when the verifier checks helper calls, it assumes
that BPF_PTR_POISON'ed ret type will be replaced with a valid type before
- or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id.

If poisoned btf_id reaches default handling block for either, consider
this a verifier internal error and fail verification. Otherwise a helper
w/ poisoned btf_id but no verifier logic replacing the type will cause a
crash as the invalid pointer is dereferenced.

Also move BPF_PTR_POISON to existing include/linux/posion.h header and
remove unnecessary shift.

  [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com
  [1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com

Signed-off-by: Dave Marchevsky <davemarchevsky@fb.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/poison.h |  3 +++
 kernel/bpf/helpers.c   |  6 +++---
 kernel/bpf/verifier.c  | 30 +++++++++++++++++++++++-------
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/linux/poison.h b/include/linux/poison.h
index d62ef5a6b4e9..2d3249eb0e62 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -81,4 +81,7 @@
 /********** net/core/page_pool.c **********/
 #define PP_SIGNATURE		(0x40 + POISON_POINTER_DELTA)
 
+/********** kernel/bpf/ **********/
+#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA))
+
 #endif
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index fc08035f14ed..41aeaf3862ec 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -15,6 +15,7 @@
 #include <linux/ctype.h>
 #include <linux/jiffies.h>
 #include <linux/pid_namespace.h>
+#include <linux/poison.h>
 #include <linux/proc_ns.h>
 #include <linux/security.h>
 #include <linux/btf_ids.h>
@@ -1376,10 +1377,9 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
 }
 
 /* Unlike other PTR_TO_BTF_ID helpers the btf_id in bpf_kptr_xchg()
- * helper is determined dynamically by the verifier.
+ * helper is determined dynamically by the verifier. Use BPF_PTR_POISON to
+ * denote type that verifier will determine.
  */
-#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-
 static const struct bpf_func_proto bpf_kptr_xchg_proto = {
 	.func         = bpf_kptr_xchg,
 	.gpl_only     = false,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c259d734f863..8c6fbcd0afaf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,7 @@
 #include <linux/error-injection.h>
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
+#include <linux/poison.h>
 
 #include "disasm.h"
 
@@ -5782,13 +5783,22 @@ found:
 		if (meta->func_id == BPF_FUNC_kptr_xchg) {
 			if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
 				return -EACCES;
-		} else if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
-						 btf_vmlinux, *arg_btf_id,
-						 strict_type_match)) {
-			verbose(env, "R%d is of type %s but %s is expected\n",
-				regno, kernel_type_name(reg->btf, reg->btf_id),
-				kernel_type_name(btf_vmlinux, *arg_btf_id));
-			return -EACCES;
+		} else {
+			if (arg_btf_id == BPF_PTR_POISON) {
+				verbose(env, "verifier internal error:");
+				verbose(env, "R%d has non-overwritten BPF_PTR_POISON type\n",
+					regno);
+				return -EACCES;
+			}
+
+			if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
+						  btf_vmlinux, *arg_btf_id,
+						  strict_type_match)) {
+				verbose(env, "R%d is of type %s but %s is expected\n",
+					regno, kernel_type_name(reg->btf, reg->btf_id),
+					kernel_type_name(btf_vmlinux, *arg_btf_id));
+				return -EACCES;
+			}
 		}
 	}
 
@@ -7457,6 +7467,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			ret_btf = meta.kptr_off_desc->kptr.btf;
 			ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
 		} else {
+			if (fn->ret_btf_id == BPF_PTR_POISON) {
+				verbose(env, "verifier internal error:");
+				verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n",
+					func_id_name(func_id));
+				return -EINVAL;
+			}
 			ret_btf = btf_vmlinux;
 			ret_btf_id = *fn->ret_btf_id;
 		}
-- 
cgit v1.2.3


From 8c03af3e090e9d57d90f482d344563dd4bae1e66 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 8 Sep 2022 12:04:50 +0200
Subject: x86,retpoline: Be sure to emit INT3 after JMP *%\reg

Both AMD and Intel recommend using INT3 after an indirect JMP. Make sure
to emit one when rewriting the retpoline JMP irrespective of compiler
SLS options or even CONFIG_SLS.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Link: https://lkml.kernel.org/r/Yxm+QkFPOhrVSH6q@hirez.programming.kicks-ass.net
---
 arch/x86/kernel/alternative.c | 9 +++++++++
 arch/x86/net/bpf_jit_comp.c   | 4 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 62f6b8b7c4a5..68d84cf8e001 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -453,6 +453,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
 		return ret;
 	i += ret;
 
+	/*
+	 * The compiler is supposed to EMIT an INT3 after every unconditional
+	 * JMP instruction due to AMD BTC. However, if the compiler is too old
+	 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
+	 * even on Intel.
+	 */
+	if (op == JMP32_INSN_OPCODE && i < insn->length)
+		bytes[i++] = INT3_INSN_OPCODE;
+
 	for (; i < insn->length;)
 		bytes[i++] = BYTES_NOP1;
 
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index c1f6c1c51d99..4922517ddb0d 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -419,7 +419,9 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
 		OPTIMIZER_HIDE_VAR(reg);
 		emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
 	} else {
-		EMIT2(0xFF, 0xE0 + reg);
+		EMIT2(0xFF, 0xE0 + reg);	/* jmp *%\reg */
+		if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
+			EMIT1(0xCC);		/* int3 */
 	}
 
 	*pprog = prog;
-- 
cgit v1.2.3


From 812e92b824c1db16c9519f8624d48a9901a0d38f Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Tue, 13 Sep 2022 16:08:13 -0700
Subject: Bluetooth: RFCOMM: Fix possible deadlock on socket shutdown/release

Due to change to switch to use lock_sock inside rfcomm_sk_state_change
the socket shutdown/release procedure can cause a deadlock:

    rfcomm_sock_shutdown():
      lock_sock();
      __rfcomm_sock_close():
        rfcomm_dlc_close():
          __rfcomm_dlc_close():
            rfcomm_dlc_lock();
            rfcomm_sk_state_change():
              lock_sock();

To fix this when the call __rfcomm_sock_close is now done without
holding the lock_sock since rfcomm_dlc_lock exists to protect
the dlc data there is no need to use lock_sock in that code path.

Link: https://lore.kernel.org/all/CAD+dNTsbuU4w+Y_P7o+VEN7BYCAbZuwZx2+tH+OTzCdcZF82YA@mail.gmail.com/
Fixes: b7ce436a5d79 ("Bluetooth: switch to lock_sock in RFCOMM")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/rfcomm/sock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 4bf4ea6cbb5e..21e24da4847f 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -902,7 +902,10 @@ static int rfcomm_sock_shutdown(struct socket *sock, int how)
 	lock_sock(sk);
 	if (!sk->sk_shutdown) {
 		sk->sk_shutdown = SHUTDOWN_MASK;
+
+		release_sock(sk);
 		__rfcomm_sock_close(sk);
+		lock_sock(sk);
 
 		if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
 		    !(current->flags & PF_EXITING))
-- 
cgit v1.2.3


From 83c10cc362d91c0d8d25e60779ee52fdbbf3894d Mon Sep 17 00:00:00 2001
From: Lee Jones <lee@kernel.org>
Date: Mon, 12 Sep 2022 14:38:55 +0100
Subject: bpf: Ensure correct locking around vulnerable function find_vpid()

The documentation for find_vpid() clearly states:

  "Must be called with the tasklist_lock or rcu_read_lock() held."

Presently we do neither for find_vpid() instance in bpf_task_fd_query().
Add proper rcu_read_lock/unlock() to fix the issue.

Fixes: 41bdc4b40ed6f ("bpf: introduce bpf subcommand BPF_TASK_FD_QUERY")
Signed-off-by: Lee Jones <lee@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/20220912133855.1218900-1-lee@kernel.org
---
 kernel/bpf/syscall.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 27760627370d..1bd18af8af83 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4395,7 +4395,9 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	if (attr->task_fd_query.flags != 0)
 		return -EINVAL;
 
+	rcu_read_lock();
 	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	rcu_read_unlock();
 	if (!task)
 		return -ENOENT;
 
-- 
cgit v1.2.3


From a02c118ee9e898612cbae42121b9e8663455b515 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Tue, 13 Sep 2022 16:40:33 +0800
Subject: bpf: use kvmemdup_bpfptr helper

Use kvmemdup_bpfptr helper instead of open-coding to
simplify the code.

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/r/1663058433-14089-1-git-send-email-wangyufen@huawei.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/syscall.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 69be1c612daa..dab156f09f8d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1417,19 +1417,14 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 	}
 
 	value_size = bpf_map_value_size(map);
-
-	err = -ENOMEM;
-	value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
-	if (!value)
+	value = kvmemdup_bpfptr(uvalue, value_size);
+	if (IS_ERR(value)) {
+		err = PTR_ERR(value);
 		goto free_key;
-
-	err = -EFAULT;
-	if (copy_from_bpfptr(value, uvalue, value_size) != 0)
-		goto free_value;
+	}
 
 	err = bpf_map_update_value(map, f, key, value, attr->flags);
 
-free_value:
 	kvfree(value);
 free_key:
 	kvfree(key);
-- 
cgit v1.2.3


From bfeb7e399bacae4ee46ad978f5fce3e47f0978d6 Mon Sep 17 00:00:00 2001
From: Yauheni Kaliuta <ykaliuta@redhat.com>
Date: Mon, 5 Sep 2022 12:01:49 +0300
Subject: bpf: Use bpf_capable() instead of CAP_SYS_ADMIN for blinding decision

The full CAP_SYS_ADMIN requirement for blinding looks too strict nowadays.
These days given unprivileged BPF is disabled by default, the main users
for constant blinding coming from unprivileged in particular via cBPF -> eBPF
migration (e.g. old-style socket filters).

Signed-off-by: Yauheni Kaliuta <ykaliuta@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220831090655.156434-1-ykaliuta@redhat.com
Link: https://lore.kernel.org/bpf/20220905090149.61221-1-ykaliuta@redhat.com
---
 Documentation/admin-guide/sysctl/net.rst | 3 +++
 include/linux/filter.h                   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 555681ef6195..6394f5dc2303 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -102,6 +102,9 @@ Values:
 	- 1 - enable JIT hardening for unprivileged users only
 	- 2 - enable JIT hardening for all users
 
+where "privileged user" in this context means a process having
+CAP_BPF or CAP_SYS_ADMIN in the root user name space.
+
 bpf_jit_kallsyms
 ----------------
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 527ae1d64e27..75335432fcbc 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1099,7 +1099,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
-	if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN))
+	if (bpf_jit_harden == 1 && bpf_capable())
 		return false;
 
 	return true;
-- 
cgit v1.2.3


From 9440155ccb948f8e3ce5308907a2e7378799be60 Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Sat, 3 Sep 2022 15:11:53 +0200
Subject: ftrace: Add HAVE_DYNAMIC_FTRACE_NO_PATCHABLE

x86 will shortly start using -fpatchable-function-entry for purposes
other than ftrace, make sure the __patchable_function_entry section
isn't merged in the mcount_loc section.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220903131154.420467-2-jolsa@kernel.org
---
 include/asm-generic/vmlinux.lds.h | 11 ++++++++++-
 kernel/trace/Kconfig              |  6 ++++++
 tools/objtool/check.c             |  3 ++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7515a465ec03..13b197ef0d63 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -154,6 +154,14 @@
 #define MEM_DISCARD(sec) *(.mem##sec)
 #endif
 
+#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+#define KEEP_PATCHABLE		KEEP(*(__patchable_function_entries))
+#define PATCHABLE_DISCARDS
+#else
+#define KEEP_PATCHABLE
+#define PATCHABLE_DISCARDS	*(__patchable_function_entries)
+#endif
+
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 /*
  * The ftrace call sites are logged to a section whose name depends on the
@@ -172,7 +180,7 @@
 #define MCOUNT_REC()	. = ALIGN(8);				\
 			__start_mcount_loc = .;			\
 			KEEP(*(__mcount_loc))			\
-			KEEP(*(__patchable_function_entries))	\
+			KEEP_PATCHABLE				\
 			__stop_mcount_loc = .;			\
 			ftrace_stub_graph = ftrace_stub;	\
 			ftrace_ops_list_func = arch_ftrace_ops_list_func;
@@ -1024,6 +1032,7 @@
 
 #define COMMON_DISCARDS							\
 	SANITIZER_DISCARDS						\
+	PATCHABLE_DISCARDS						\
 	*(.discard)							\
 	*(.discard.*)							\
 	*(.modinfo)							\
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1052126bdca2..e9e95c790b8e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -51,6 +51,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
 	 This allows for use of regs_get_kernel_argument() and
 	 kernel_stack_pointer().
 
+config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
+	bool
+	help
+	  If the architecture generates __patchable_function_entries sections
+	  but does not want them included in the ftrace locations.
+
 config HAVE_FTRACE_MCOUNT_RECORD
 	bool
 	help
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index e55fdf952a3a..9216060c3408 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -4113,7 +4113,8 @@ static int validate_ibt(struct objtool_file *file)
 		    !strcmp(sec->name, "__bug_table")			||
 		    !strcmp(sec->name, "__ex_table")			||
 		    !strcmp(sec->name, "__jump_table")			||
-		    !strcmp(sec->name, "__mcount_loc"))
+		    !strcmp(sec->name, "__mcount_loc")			||
+		    strstr(sec->name, "__patchable_function_entries"))
 			continue;
 
 		list_for_each_entry(reloc, &sec->reloc->reloc_list, list)
-- 
cgit v1.2.3


From ceea991a019c57a1fb0edd12a5f836a0fa431aee Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Sat, 3 Sep 2022 15:11:54 +0200
Subject: bpf: Move bpf_dispatcher function out of ftrace locations

The dispatcher function is attached/detached to trampoline by
dispatcher update function. At the same time it's available as
ftrace attachable function.

After discussion [1] the proposed solution is to use compiler
attributes to alter bpf_dispatcher_##name##_func function:

  - remove it from being instrumented with __no_instrument_function__
    attribute, so ftrace has no track of it

  - but still generate 5 nop instructions with patchable_function_entry(5)
    attribute, which are expected by bpf_arch_text_poke used by
    dispatcher update function

Enabling HAVE_DYNAMIC_FTRACE_NO_PATCHABLE option for x86, so
__patchable_function_entries functions are not part of ftrace/mcount
locations.

Adding attributes to bpf_dispatcher_XXX function on x86_64 so it's
kept out of ftrace locations and has 5 byte nop generated at entry.

These attributes need to be arch specific as pointed out by Ilya
Leoshkevic in here [2].

The dispatcher image is generated only for x86_64 arch, so the
code can stay as is for other archs.

  [1] https://lore.kernel.org/bpf/20220722110811.124515-1-jolsa@kernel.org/
  [2] https://lore.kernel.org/bpf/969a14281a7791c334d476825863ee449964dd0c.camel@linux.ibm.com/

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/bpf/20220903131154.420467-3-jolsa@kernel.org
---
 arch/x86/Kconfig    | 1 +
 include/linux/bpf.h | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..089c20cefd2b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -284,6 +284,7 @@ config X86
 	select PROC_PID_ARCH_STATUS		if PROC_FS
 	select HAVE_ARCH_NODE_DEV_GROUP		if X86_SGX
 	imply IMA_SECURE_AND_OR_TRUSTED_BOOT    if EFI
+	select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 54178b9e9c3a..e0dbe0c0a17e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -977,7 +977,14 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs);
 	},							\
 }
 
+#ifdef CONFIG_X86_64
+#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5)))
+#else
+#define BPF_DISPATCHER_ATTRIBUTES
+#endif
+
 #define DEFINE_BPF_DISPATCHER(name)					\
+	notrace BPF_DISPATCHER_ATTRIBUTES				\
 	noinline __nocfi unsigned int bpf_dispatcher_##name##_func(	\
 		const void *ctx,					\
 		const struct bpf_insn *insnsi,				\
-- 
cgit v1.2.3


From cf060c2c399fa457569123bb9806b455ff53e64c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 9 Sep 2022 12:30:51 -0700
Subject: selftests/bpf: Fix test_verif_scale{1,3} SEC() annotations

Use proper SEC("tc") for test_verif_scale{1,3} programs. It's not
a problem for selftests right now because we manually set type
programmatically, but not having correct SEC() definitions makes it
harded to generically load BPF object files.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220909193053.577111-2-andrii@kernel.org
---
 tools/testing/selftests/bpf/progs/test_verif_scale1.c | 2 +-
 tools/testing/selftests/bpf/progs/test_verif_scale3.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale1.c b/tools/testing/selftests/bpf/progs/test_verif_scale1.c
index d38153dab3dd..ac6135d9374c 100644
--- a/tools/testing/selftests/bpf/progs/test_verif_scale1.c
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale1.c
@@ -5,7 +5,7 @@
 #define ATTR __attribute__((noinline))
 #include "test_jhash.h"
 
-SEC("scale90_noinline")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale3.c b/tools/testing/selftests/bpf/progs/test_verif_scale3.c
index 9beb5bf80373..ca33a9b711c4 100644
--- a/tools/testing/selftests/bpf/progs/test_verif_scale3.c
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale3.c
@@ -5,7 +5,7 @@
 #define ATTR __attribute__((noinline))
 #include "test_jhash.h"
 
-SEC("scale90_noinline32")
+SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
 	void *data_end = (void *)(long)ctx->data_end;
-- 
cgit v1.2.3


From 749c202cb6ea40f4d7ac95c4a1217a7b506f43a8 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 9 Sep 2022 12:30:52 -0700
Subject: libbpf: Fix crash if SEC("freplace") programs don't have
 attach_prog_fd set

Fix SIGSEGV caused by libbpf trying to find attach type in vmlinux BTF
for freplace programs. It's wrong to search in vmlinux BTF and libbpf
doesn't even mark vmlinux BTF as required for freplace programs. So
trying to search anything in obj->vmlinux_btf might cause NULL
dereference if nothing else in BPF object requires vmlinux BTF.

Instead, error out if freplace (EXT) program doesn't specify
attach_prog_fd during at the load time.

Fixes: 91abb4a6d79d ("libbpf: Support attachment of BPF tracing programs to kernel modules")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220909193053.577111-3-andrii@kernel.org
---
 tools/lib/bpf/libbpf.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 3ad139285fad..2ca30ccc774c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -9084,11 +9084,15 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac
 	int err = 0;
 
 	/* BPF program's BTF ID */
-	if (attach_prog_fd) {
+	if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) {
+		if (!attach_prog_fd) {
+			pr_warn("prog '%s': attach program FD is not set\n", prog->name);
+			return -EINVAL;
+		}
 		err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd);
 		if (err < 0) {
-			pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n",
-				 attach_prog_fd, attach_name, err);
+			pr_warn("prog '%s': failed to find BPF program (FD %d) BTF ID for '%s': %d\n",
+				 prog->name, attach_prog_fd, attach_name, err);
 			return err;
 		}
 		*btf_obj_fd = 0;
@@ -9105,7 +9109,8 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac
 		err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
 	}
 	if (err) {
-		pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err);
+		pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n",
+			prog->name, attach_name, err);
 		return err;
 	}
 	return 0;
-- 
cgit v1.2.3


From c8bc5e0509767e51b35ae2f4af6ff90fa6a5f27f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 9 Sep 2022 12:30:53 -0700
Subject: selftests/bpf: Add veristat tool for mass-verifying BPF object files

Add a small tool, veristat, that allows mass-verification of
a set of *libbpf-compatible* BPF ELF object files. For each such object
file, veristat will attempt to verify each BPF program *individually*.
Regardless of success or failure, it parses BPF verifier stats and
outputs them in human-readable table format. In the future we can also
add CSV and JSON output for more scriptable post-processing, if necessary.

veristat allows to specify a set of stats that should be output and
ordering between multiple objects and files (e.g., so that one can
easily order by total instructions processed, instead of default file
name, prog name, verdict, total instructions order).

This tool should be useful for validating various BPF verifier changes
or even validating different kernel versions for regressions.

Here's an example for some of the heaviest selftests/bpf BPF object
files:

  $ sudo ./veristat -s insns,file,prog {pyperf,loop,test_verif_scale,strobemeta,test_cls_redirect,profiler}*.linked3.o
  File                                  Program                               Verdict  Duration, us  Total insns  Total states  Peak states
  ------------------------------------  ------------------------------------  -------  ------------  -----------  ------------  -----------
  loop3.linked3.o                       while_true                            failure        350990      1000001          9663         9663
  test_verif_scale3.linked3.o           balancer_ingress                      success        115244       845499          8636         2141
  test_verif_scale2.linked3.o           balancer_ingress                      success         77688       773445          3048          788
  pyperf600.linked3.o                   on_event                              success       2079872       624585         30335        30241
  pyperf600_nounroll.linked3.o          on_event                              success        353972       568128         37101         2115
  strobemeta.linked3.o                  on_event                              success        455230       557149         15915        13537
  test_verif_scale1.linked3.o           balancer_ingress                      success         89880       554754          8636         2141
  strobemeta_nounroll2.linked3.o        on_event                              success        433906       501725         17087         1912
  loop6.linked3.o                       trace_virtqueue_add_sgs               success        282205       398057          8717          919
  loop1.linked3.o                       nested_loops                          success        125630       361349          5504         5504
  pyperf180.linked3.o                   on_event                              success       2511740       160398         11470        11446
  pyperf100.linked3.o                   on_event                              success        744329        87681          6213         6191
  test_cls_redirect.linked3.o           cls_redirect                          success         54087        78925          4782          903
  strobemeta_subprogs.linked3.o         on_event                              success         57898        65420          1954          403
  test_cls_redirect_subprogs.linked3.o  cls_redirect                          success         54522        64965          4619          958
  strobemeta_nounroll1.linked3.o        on_event                              success         43313        57240          1757          382
  pyperf50.linked3.o                    on_event                              success        194355        46378          3263         3241
  profiler2.linked3.o                   tracepoint__syscalls__sys_enter_kill  success         23869        43372          1423          542
  pyperf_subprogs.linked3.o             on_event                              success         29179        36358          2499         2499
  profiler1.linked3.o                   tracepoint__syscalls__sys_enter_kill  success         13052        27036          1946          936
  profiler3.linked3.o                   tracepoint__syscalls__sys_enter_kill  success         21023        26016          2186          915
  profiler2.linked3.o                   kprobe__vfs_link                      success          5255        13896           303          271
  profiler1.linked3.o                   kprobe__vfs_link                      success          7792        12687          1042         1041
  profiler3.linked3.o                   kprobe__vfs_link                      success          7332        10601           865          865
  profiler2.linked3.o                   kprobe_ret__do_filp_open              success          3417         8900           216          199
  profiler2.linked3.o                   kprobe__vfs_symlink                   success          3548         8775           203          186
  pyperf_global.linked3.o               on_event                              success         10007         7563           520          520
  profiler3.linked3.o                   kprobe_ret__do_filp_open              success          4708         6464           532          532
  profiler1.linked3.o                   kprobe_ret__do_filp_open              success          3090         6445           508          508
  profiler3.linked3.o                   kprobe__vfs_symlink                   success          4477         6358           521          521
  profiler1.linked3.o                   kprobe__vfs_symlink                   success          3381         6347           507          507
  profiler2.linked3.o                   raw_tracepoint__sched_process_exec    success          2464         5874           292          189
  profiler3.linked3.o                   raw_tracepoint__sched_process_exec    success          2677         4363           397          283
  profiler2.linked3.o                   kprobe__proc_sys_write                success          1800         4355           143          138
  profiler1.linked3.o                   raw_tracepoint__sched_process_exec    success          1649         4019           333          240
  pyperf600_bpf_loop.linked3.o          on_event                              success          2711         3966           306          306
  profiler2.linked3.o                   raw_tracepoint__sched_process_exit    success          1234         3138            83           66
  profiler3.linked3.o                   kprobe__proc_sys_write                success          1755         2623           223          223
  profiler1.linked3.o                   kprobe__proc_sys_write                success          1222         2456           193          193
  loop2.linked3.o                       while_true                            success           608         1783            57           30
  profiler3.linked3.o                   raw_tracepoint__sched_process_exit    success           789         1680           146          146
  profiler1.linked3.o                   raw_tracepoint__sched_process_exit    success           592         1526           133          133
  strobemeta_bpf_loop.linked3.o         on_event                              success          1015         1512           106          106
  loop4.linked3.o                       combinations                          success           165          524            18           17
  profiler3.linked3.o                   raw_tracepoint__sched_process_fork    success           196          299            25           25
  profiler1.linked3.o                   raw_tracepoint__sched_process_fork    success           109          265            19           19
  profiler2.linked3.o                   raw_tracepoint__sched_process_fork    success           111          265            19           19
  loop5.linked3.o                       while_true                            success            47           84             9            9
  ------------------------------------  ------------------------------------  -------  ------------  -----------  ------------  -----------

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220909193053.577111-4-andrii@kernel.org
---
 tools/testing/selftests/bpf/.gitignore |   1 +
 tools/testing/selftests/bpf/Makefile   |   7 +-
 tools/testing/selftests/bpf/veristat.c | 537 +++++++++++++++++++++++++++++++++
 3 files changed, 544 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/veristat.c

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 3a8cb2404ea6..3b288562963e 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -39,6 +39,7 @@ test_cpp
 /tools
 /runqslower
 /bench
+/veristat
 *.ko
 *.tmp
 xskxceiver
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 6cd327f1f216..1a0296bd744a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -82,7 +82,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
-	xskxceiver xdp_redirect_multi xdp_synproxy
+	xskxceiver xdp_redirect_multi xdp_synproxy veristat
 
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
 
@@ -595,6 +595,11 @@ $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
 
+$(OUTPUT)/veristat.o: $(BPFOBJ)
+$(OUTPUT)/veristat: $(OUTPUT)/veristat.o
+	$(call msg,BINARY,,$@)
+	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.a %.o,$^) $(LDLIBS) -o $@
+
 EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) $(HOST_SCRATCH_DIR)	\
 	prog_tests/tests.h map_tests/tests.h verifier/tests.h		\
 	feature bpftool							\
diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
new file mode 100644
index 000000000000..39e6dc41e504
--- /dev/null
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#define _GNU_SOURCE
+#include <argp.h>
+#include <string.h>
+#include <stdlib.h>
+#include <linux/compiler.h>
+#include <sched.h>
+#include <pthread.h>
+#include <dirent.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/sysinfo.h>
+#include <sys/stat.h>
+#include <bpf/libbpf.h>
+
+enum stat_id {
+	VERDICT,
+	DURATION,
+	TOTAL_INSNS,
+	TOTAL_STATES,
+	PEAK_STATES,
+	MAX_STATES_PER_INSN,
+	MARK_READ_MAX_LEN,
+
+	FILE_NAME,
+	PROG_NAME,
+
+	ALL_STATS_CNT,
+	NUM_STATS_CNT = FILE_NAME - VERDICT,
+};
+
+struct verif_stats {
+	char *file_name;
+	char *prog_name;
+
+	long stats[NUM_STATS_CNT];
+};
+
+struct stat_specs {
+	int spec_cnt;
+	enum stat_id ids[ALL_STATS_CNT];
+	bool asc[ALL_STATS_CNT];
+	int lens[ALL_STATS_CNT];
+};
+
+static struct env {
+	char **filenames;
+	int filename_cnt;
+	bool verbose;
+
+	struct verif_stats *prog_stats;
+	int prog_stat_cnt;
+
+	struct stat_specs output_spec;
+	struct stat_specs sort_spec;
+} env;
+
+static int libbpf_print_fn(enum libbpf_print_level level,
+		    const char *format, va_list args)
+{
+	if (!env.verbose)
+		return 0;
+	if (level == LIBBPF_DEBUG /* && !env.verbose */)
+		return 0;
+	return vfprintf(stderr, format, args);
+}
+
+const char *argp_program_version = "veristat";
+const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
+const char argp_program_doc[] =
+"veristat    BPF verifier stats collection tool.\n"
+"\n"
+"USAGE: veristat <obj-file> [<obj-file>...]\n";
+
+static const struct argp_option opts[] = {
+	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
+	{ "verbose", 'v', NULL, 0, "Verbose mode" },
+	{ "output", 'o', "SPEC", 0, "Specify output stats" },
+	{ "sort", 's', "SPEC", 0, "Specify sort order" },
+	{},
+};
+
+static int parse_stats(const char *stats_str, struct stat_specs *specs);
+
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	void *tmp;
+	int err;
+
+	switch (key) {
+	case 'h':
+		argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
+		break;
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'o':
+		err = parse_stats(arg, &env.output_spec);
+		if (err)
+			return err;
+		break;
+	case 's':
+		err = parse_stats(arg, &env.sort_spec);
+		if (err)
+			return err;
+		break;
+	case ARGP_KEY_ARG:
+		tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames));
+		if (!tmp)
+			return -ENOMEM;
+		env.filenames = tmp;
+		env.filenames[env.filename_cnt] = strdup(arg);
+		if (!env.filenames[env.filename_cnt])
+			return -ENOMEM;
+		env.filename_cnt++;
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+static const struct stat_specs default_output_spec = {
+	.spec_cnt = 7,
+	.ids = {
+		FILE_NAME, PROG_NAME, VERDICT, DURATION,
+		TOTAL_INSNS, TOTAL_STATES, PEAK_STATES,
+	},
+};
+
+static const struct stat_specs default_sort_spec = {
+	.spec_cnt = 2,
+	.ids = {
+		FILE_NAME, PROG_NAME,
+	},
+	.asc = { true, true, },
+};
+
+static struct stat_def {
+	const char *header;
+	const char *names[4];
+	bool asc_by_default;
+} stat_defs[] = {
+	[FILE_NAME] = { "File", {"file_name", "filename", "file"}, true /* asc */ },
+	[PROG_NAME] = { "Program", {"prog_name", "progname", "prog"}, true /* asc */ },
+	[VERDICT] = { "Verdict", {"verdict"}, true /* asc: failure, success */ },
+	[DURATION] = { "Duration, us", {"duration", "dur"}, },
+	[TOTAL_INSNS] = { "Total insns", {"total_insns", "insns"}, },
+	[TOTAL_STATES] = { "Total states", {"total_states", "states"}, },
+	[PEAK_STATES] = { "Peak states", {"peak_states"}, },
+	[MAX_STATES_PER_INSN] = { "Max states per insn", {"max_states_per_insn"}, },
+	[MARK_READ_MAX_LEN] = { "Max mark read length", {"max_mark_read_len", "mark_read"}, },
+};
+
+static int parse_stat(const char *stat_name, struct stat_specs *specs)
+{
+	int id, i;
+
+	if (specs->spec_cnt >= ARRAY_SIZE(specs->ids)) {
+		fprintf(stderr, "Can't specify more than %zd stats\n", ARRAY_SIZE(specs->ids));
+		return -E2BIG;
+	}
+
+	for (id = 0; id < ARRAY_SIZE(stat_defs); id++) {
+		struct stat_def *def = &stat_defs[id];
+
+		for (i = 0; i < ARRAY_SIZE(stat_defs[id].names); i++) {
+			if (!def->names[i] || strcmp(def->names[i], stat_name) != 0)
+				continue;
+
+			specs->ids[specs->spec_cnt] = id;
+			specs->asc[specs->spec_cnt] = def->asc_by_default;
+			specs->spec_cnt++;
+
+			return 0;
+		}
+	}
+
+	fprintf(stderr, "Unrecognized stat name '%s'\n", stat_name);
+	return -ESRCH;
+}
+
+static int parse_stats(const char *stats_str, struct stat_specs *specs)
+{
+	char *input, *state = NULL, *next;
+	int err;
+
+	input = strdup(stats_str);
+	if (!input)
+		return -ENOMEM;
+
+	while ((next = strtok_r(state ? NULL : input, ",", &state))) {
+		err = parse_stat(next, specs);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static char verif_log_buf[64 * 1024];
+
+static int parse_verif_log(const char *buf, size_t buf_sz, struct verif_stats *s)
+{
+	const char *next;
+	int pos;
+
+	for (pos = 0; buf[0]; buf = next) {
+		if (buf[0] == '\n')
+			buf++;
+		next = strchrnul(&buf[pos], '\n');
+
+		if (1 == sscanf(buf, "verification time %ld usec\n", &s->stats[DURATION]))
+			continue;
+		if (6 == sscanf(buf, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld",
+				&s->stats[TOTAL_INSNS],
+				&s->stats[MAX_STATES_PER_INSN],
+				&s->stats[TOTAL_STATES],
+				&s->stats[PEAK_STATES],
+				&s->stats[MARK_READ_MAX_LEN]))
+			continue;
+	}
+
+	return 0;
+}
+
+static int process_prog(const char *filename, struct bpf_object *obj, struct bpf_program *prog)
+{
+	const char *prog_name = bpf_program__name(prog);
+	size_t buf_sz = sizeof(verif_log_buf);
+	char *buf = verif_log_buf;
+	struct verif_stats *stats;
+	int err = 0;
+	void *tmp;
+
+	tmp = realloc(env.prog_stats, (env.prog_stat_cnt + 1) * sizeof(*env.prog_stats));
+	if (!tmp)
+		return -ENOMEM;
+	env.prog_stats = tmp;
+	stats = &env.prog_stats[env.prog_stat_cnt++];
+	memset(stats, 0, sizeof(*stats));
+
+	if (env.verbose) {
+		buf_sz = 16 * 1024 * 1024;
+		buf = malloc(buf_sz);
+		if (!buf)
+			return -ENOMEM;
+		bpf_program__set_log_buf(prog, buf, buf_sz);
+		bpf_program__set_log_level(prog, 1 | 4); /* stats + log */
+	} else {
+		bpf_program__set_log_buf(prog, buf, buf_sz);
+		bpf_program__set_log_level(prog, 4); /* only verifier stats */
+	}
+	verif_log_buf[0] = '\0';
+
+	err = bpf_object__load(obj);
+
+	stats->file_name = strdup(basename(filename));
+	stats->prog_name = strdup(bpf_program__name(prog));
+	stats->stats[VERDICT] = err == 0; /* 1 - success, 0 - failure */
+	parse_verif_log(buf, buf_sz, stats);
+
+	if (env.verbose) {
+		printf("PROCESSING %s/%s, DURATION US: %ld, VERDICT: %s, VERIFIER LOG:\n%s\n",
+		       filename, prog_name, stats->stats[DURATION],
+		       err ? "failure" : "success", buf);
+	}
+
+	if (verif_log_buf != buf)
+		free(buf);
+
+	return 0;
+};
+
+static int process_obj(const char *filename)
+{
+	struct bpf_object *obj = NULL, *tobj;
+	struct bpf_program *prog, *tprog, *lprog;
+	libbpf_print_fn_t old_libbpf_print_fn;
+	LIBBPF_OPTS(bpf_object_open_opts, opts);
+	int err = 0, prog_cnt = 0;
+
+	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
+
+	obj = bpf_object__open_file(filename, &opts);
+	if (!obj) {
+		err = -errno;
+		fprintf(stderr, "Failed to open '%s': %d\n", filename, err);
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		prog_cnt++;
+	}
+
+	if (prog_cnt == 1) {
+		prog = bpf_object__next_program(obj, NULL);
+		bpf_program__set_autoload(prog, true);
+		process_prog(filename, obj, prog);
+		bpf_object__close(obj);
+		goto cleanup;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		const char *prog_name = bpf_program__name(prog);
+
+		tobj = bpf_object__open_file(filename, &opts);
+		if (!tobj) {
+			err = -errno;
+			fprintf(stderr, "Failed to open '%s': %d\n", filename, err);
+			goto cleanup;
+		}
+
+		bpf_object__for_each_program(tprog, tobj) {
+			const char *tprog_name = bpf_program__name(tprog);
+
+			if (strcmp(prog_name, tprog_name) == 0) {
+				bpf_program__set_autoload(tprog, true);
+				lprog = tprog;
+			} else {
+				bpf_program__set_autoload(tprog, false);
+			}
+		}
+
+		process_prog(filename, tobj, lprog);
+		bpf_object__close(tobj);
+	}
+
+cleanup:
+	bpf_object__close(obj);
+	libbpf_set_print(old_libbpf_print_fn);
+	return err;
+}
+
+static int cmp_stat(const struct verif_stats *s1, const struct verif_stats *s2,
+		    enum stat_id id, bool asc)
+{
+	int cmp = 0;
+
+	switch (id) {
+	case FILE_NAME:
+		cmp = strcmp(s1->file_name, s2->file_name);
+		break;
+	case PROG_NAME:
+		cmp = strcmp(s1->prog_name, s2->prog_name);
+		break;
+	case VERDICT:
+	case DURATION:
+	case TOTAL_INSNS:
+	case TOTAL_STATES:
+	case PEAK_STATES:
+	case MAX_STATES_PER_INSN:
+	case MARK_READ_MAX_LEN: {
+		long v1 = s1->stats[id];
+		long v2 = s2->stats[id];
+
+		if (v1 != v2)
+			cmp = v1 < v2 ? -1 : 1;
+		break;
+	}
+	default:
+		fprintf(stderr, "Unrecognized stat #%d\n", id);
+		exit(1);
+	}
+
+	return asc ? cmp : -cmp;
+}
+
+static int cmp_prog_stats(const void *v1, const void *v2)
+{
+	const struct verif_stats *s1 = v1, *s2 = v2;
+	int i, cmp;
+
+	for (i = 0; i < env.sort_spec.spec_cnt; i++) {
+		cmp = cmp_stat(s1, s2, env.sort_spec.ids[i], env.sort_spec.asc[i]);
+		if (cmp != 0)
+			return cmp;
+	}
+
+	return 0;
+}
+
+#define HEADER_CHAR '-'
+#define COLUMN_SEP "  "
+
+static void output_headers(bool calc_len)
+{
+	int i, len;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		int id = env.output_spec.ids[i];
+		int *max_len = &env.output_spec.lens[i];
+
+		if (calc_len) {
+			len = snprintf(NULL, 0, "%s", stat_defs[id].header);
+			if (len > *max_len)
+				*max_len = len;
+		} else {
+			printf("%s%-*s", i == 0 ? "" : COLUMN_SEP,  *max_len, stat_defs[id].header);
+		}
+	}
+
+	if (!calc_len)
+		printf("\n");
+}
+
+static void output_header_underlines(void)
+{
+	int i, j, len;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		len = env.output_spec.lens[i];
+
+		printf("%s", i == 0 ? "" : COLUMN_SEP);
+		for (j = 0; j < len; j++)
+			printf("%c", HEADER_CHAR);
+	}
+	printf("\n");
+}
+
+static void output_stats(const struct verif_stats *s, bool calc_len)
+{
+	int i;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		int id = env.output_spec.ids[i];
+		int *max_len = &env.output_spec.lens[i], len;
+		const char *str = NULL;
+		long val = 0;
+
+		switch (id) {
+		case FILE_NAME:
+			str = s->file_name;
+			break;
+		case PROG_NAME:
+			str = s->prog_name;
+			break;
+		case VERDICT:
+			str = s->stats[VERDICT] ? "success" : "failure";
+			break;
+		case DURATION:
+		case TOTAL_INSNS:
+		case TOTAL_STATES:
+		case PEAK_STATES:
+		case MAX_STATES_PER_INSN:
+		case MARK_READ_MAX_LEN:
+			val = s->stats[id];
+			break;
+		default:
+			fprintf(stderr, "Unrecognized stat #%d\n", id);
+			exit(1);
+		}
+
+		if (calc_len) {
+			if (str)
+				len = snprintf(NULL, 0, "%s", str);
+			else
+				len = snprintf(NULL, 0, "%ld", val);
+			if (len > *max_len)
+				*max_len = len;
+		} else {
+			if (str)
+				printf("%s%-*s", i == 0 ? "" : COLUMN_SEP, *max_len, str);
+			else
+				printf("%s%*ld", i == 0 ? "" : COLUMN_SEP,  *max_len, val);
+		}
+	}
+
+	if (!calc_len)
+		printf("\n");
+}
+
+int main(int argc, char **argv)
+{
+	static const struct argp argp = {
+		.options = opts,
+		.parser = parse_arg,
+		.doc = argp_program_doc,
+	};
+	int err = 0, i;
+
+	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
+		return 1;
+
+	if (env.filename_cnt == 0) {
+		fprintf(stderr, "Please provide path to BPF object file!\n");
+		argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat");
+		return 1;
+	}
+
+	if (env.output_spec.spec_cnt == 0)
+		env.output_spec = default_output_spec;
+	if (env.sort_spec.spec_cnt == 0)
+		env.sort_spec = default_sort_spec;
+
+	for (i = 0; i < env.filename_cnt; i++) {
+		err = process_obj(env.filenames[i]);
+		if (err) {
+			fprintf(stderr, "Failed to process '%s': %d\n", env.filenames[i], err);
+			goto cleanup;
+		}
+	}
+
+	qsort(env.prog_stats, env.prog_stat_cnt, sizeof(*env.prog_stats), cmp_prog_stats);
+
+	/* calculate column widths */
+	output_headers(true);
+	for (i = 0; i < env.prog_stat_cnt; i++) {
+		output_stats(&env.prog_stats[i], true);
+	}
+
+	/* actually output the table */
+	output_headers(false);
+	output_header_underlines();
+	for (i = 0; i < env.prog_stat_cnt; i++) {
+		output_stats(&env.prog_stats[i], false);
+	}
+	output_header_underlines();
+	printf("\n");
+
+	printf("Done. Processed %d object files, %d programs.\n",
+	       env.filename_cnt, env.prog_stat_cnt);
+
+cleanup:
+	for (i = 0; i < env.prog_stat_cnt; i++) {
+		free(env.prog_stats[i].file_name);
+		free(env.prog_stats[i].prog_name);
+	}
+	free(env.prog_stats);
+	for (i = 0; i < env.filename_cnt; i++)
+		free(env.filenames[i]);
+	free(env.filenames);
+	return -err;
+}
-- 
cgit v1.2.3


From dc567045f1590f6460d3e9a6ea6ad5e600b58b84 Mon Sep 17 00:00:00 2001
From: Xin Liu <liuxin350@huawei.com>
Date: Tue, 13 Sep 2022 15:36:43 +0800
Subject: libbpf: Clean up legacy bpf maps declaration in bpf_helpers

Legacy BPF map declarations are no longer supported in libbpf v1.0 [0].
Only BTF-defined maps are supported starting from v1.0, so it is time to
remove the definition of bpf_map_def in bpf_helpers.h.

  [0] https://github.com/libbpf/libbpf/wiki/Libbpf:-the-road-to-v1.0

Signed-off-by: Xin Liu <liuxin350@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/bpf/20220913073643.19960-1-liuxin350@huawei.com
---
 tools/lib/bpf/bpf_helpers.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h
index 7349b16b8e2f..d37c4fe2849d 100644
--- a/tools/lib/bpf/bpf_helpers.h
+++ b/tools/lib/bpf/bpf_helpers.h
@@ -160,18 +160,6 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
 }
 #endif
 
-/*
- * Helper structure used by eBPF C program
- * to describe BPF map attributes to libbpf loader
- */
-struct bpf_map_def {
-	unsigned int type;
-	unsigned int key_size;
-	unsigned int value_size;
-	unsigned int max_entries;
-	unsigned int map_flags;
-} __attribute__((deprecated("use BTF-defined maps in .maps section")));
-
 enum libbpf_pin_type {
 	LIBBPF_PIN_NONE,
 	/* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */
-- 
cgit v1.2.3


From 571f9738bfb3d4b42253c1d0ad26da9fede85f36 Mon Sep 17 00:00:00 2001
From: Peilin Ye <peilin.ye@bytedance.com>
Date: Fri, 16 Sep 2022 13:28:00 -0700
Subject: bpf/btf: Use btf_type_str() whenever possible

We have btf_type_str().  Use it whenever possible in btf.c, instead of
"btf_kind_str[BTF_INFO_KIND(t->info)]".

Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
Link: https://lore.kernel.org/r/20220916202800.31421-1-yepeilin.cs@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/btf.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 98be25d13325..b3940c605aac 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1397,7 +1397,6 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
 						   const char *fmt, ...)
 {
 	struct bpf_verifier_log *log = &env->log;
-	u8 kind = BTF_INFO_KIND(t->info);
 	struct btf *btf = env->btf;
 	va_list args;
 
@@ -1413,7 +1412,7 @@ __printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
 
 	__btf_verifier_log(log, "[%u] %s %s%s",
 			   env->log_type_id,
-			   btf_kind_str[kind],
+			   btf_type_str(t),
 			   __btf_name_by_offset(btf, t->name_off),
 			   log_details ? " " : "");
 
@@ -5427,7 +5426,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			if (!btf_type_is_small_int(t)) {
 				bpf_log(log,
 					"ret type %s not allowed for fmod_ret\n",
-					btf_kind_str[BTF_INFO_KIND(t->info)]);
+					btf_type_str(t));
 				return false;
 			}
 			break;
@@ -5454,7 +5453,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			"func '%s' arg%d '%s' has type %s. Only pointer access is allowed\n",
 			tname, arg,
 			__btf_name_by_offset(btf, t->name_off),
-			btf_kind_str[BTF_INFO_KIND(t->info)]);
+			btf_type_str(t));
 		return false;
 	}
 
@@ -5538,11 +5537,11 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 	if (!btf_type_is_struct(t)) {
 		bpf_log(log,
 			"func '%s' arg%d type %s is not a struct\n",
-			tname, arg, btf_kind_str[BTF_INFO_KIND(t->info)]);
+			tname, arg, btf_type_str(t));
 		return false;
 	}
 	bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n",
-		tname, arg, info->btf_id, btf_kind_str[BTF_INFO_KIND(t->info)],
+		tname, arg, info->btf_id, btf_type_str(t),
 		__btf_name_by_offset(btf, t->name_off));
 	return true;
 }
@@ -5950,7 +5949,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 	if (ret < 0 || __btf_type_is_struct(t)) {
 		bpf_log(log,
 			"The function %s return type %s is unsupported.\n",
-			tname, btf_kind_str[BTF_INFO_KIND(t->info)]);
+			tname, btf_type_str(t));
 		return -EINVAL;
 	}
 	m->ret_size = ret;
@@ -5968,7 +5967,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
 		if (ret < 0 || ret > 16) {
 			bpf_log(log,
 				"The function %s arg%d type %s is unsupported.\n",
-				tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
+				tname, i, btf_type_str(t));
 			return -EINVAL;
 		}
 		if (ret == 0) {
@@ -6727,7 +6726,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
 			continue;
 		}
 		bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
-			i, btf_kind_str[BTF_INFO_KIND(t->info)], tname);
+			i, btf_type_str(t), tname);
 		return -EINVAL;
 	}
 	return 0;
-- 
cgit v1.2.3


From 0978c7c41fe2a3735f8776dc27cf1641bd916773 Mon Sep 17 00:00:00 2001
From: Deming Wang <wangdeming@inspur.com>
Date: Wed, 14 Sep 2022 21:48:30 -0400
Subject: acl: fix the comments of posix_acl_xattr_set

remove the double world of 'in'.

Signed-off-by: Deming Wang <wangdeming@inspur.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 fs/posix_acl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index c759b8eef62e..b4f109875e79 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1176,7 +1176,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 		 * ACL_{GROUP,USER} have already been mapped according to the
 		 * caller's idmapping. The vfs_set_acl_prepare() helper will
 		 * recover them and take idmapped mounts into account. The
-		 * filesystem will receive the POSIX ACLs in in the correct
+		 * filesystem will receive the POSIX ACLs in the correct
 		 * format ready to be cached or written to the backing store
 		 * taking the filesystem idmapping into account.
 		 */
-- 
cgit v1.2.3


From d5e4eeea0c20b7467458cf3f3d887f59075db93e Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 18 Sep 2022 01:20:13 +0200
Subject: EDAC/ppc_4xx: Reorder symbols to get rid of a few forward
 declarations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When moving the definition of ppc4xx_edac_driver further down, the
forward declarations can just be dropped.

Do this to reduce needless line repetition.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220917232013.489931-1-u.kleine-koenig@pengutronix.de
---
 drivers/edac/ppc4xx_edac.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/drivers/edac/ppc4xx_edac.c b/drivers/edac/ppc4xx_edac.c
index 0bc670778c99..046969b4e82e 100644
--- a/drivers/edac/ppc4xx_edac.c
+++ b/drivers/edac/ppc4xx_edac.c
@@ -178,11 +178,6 @@ struct ppc4xx_ecc_status {
 	u32 wmirq;
 };
 
-/* Function Prototypes */
-
-static int ppc4xx_edac_probe(struct platform_device *device);
-static int ppc4xx_edac_remove(struct platform_device *device);
-
 /* Global Variables */
 
 /*
@@ -197,15 +192,6 @@ static const struct of_device_id ppc4xx_edac_match[] = {
 };
 MODULE_DEVICE_TABLE(of, ppc4xx_edac_match);
 
-static struct platform_driver ppc4xx_edac_driver = {
-	.probe			= ppc4xx_edac_probe,
-	.remove			= ppc4xx_edac_remove,
-	.driver = {
-		.name = PPC4XX_EDAC_MODULE_NAME,
-		.of_match_table = ppc4xx_edac_match,
-	},
-};
-
 /*
  * TODO: The row and channel parameters likely need to be dynamically
  * set based on the aforementioned variant controller realizations.
@@ -1391,6 +1377,15 @@ ppc4xx_edac_opstate_init(void)
 			     EDAC_OPSTATE_UNKNOWN_STR)));
 }
 
+static struct platform_driver ppc4xx_edac_driver = {
+	.probe			= ppc4xx_edac_probe,
+	.remove			= ppc4xx_edac_remove,
+	.driver = {
+		.name = PPC4XX_EDAC_MODULE_NAME,
+		.of_match_table = ppc4xx_edac_match,
+	},
+};
+
 /**
  * ppc4xx_edac_init - driver/module insertion entry point
  *
-- 
cgit v1.2.3


From c01b5a33bf0614e61e8074a51976134f62a64201 Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 22:59:41 +0200
Subject: m68k: Move from strlcpy with unused retval to strscpy

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Link: https://lore.kernel.org/r/20220818205941.6240-1-wsa+renesas@sang-engineering.com
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/kernel/setup_mm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index e62fa8f2149b..06ee394db610 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -148,7 +148,7 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record)
 			break;
 
 		case BI_COMMAND_LINE:
-			strlcpy(m68k_command_line, data,
+			strscpy(m68k_command_line, data,
 				sizeof(m68k_command_line));
 			break;
 
-- 
cgit v1.2.3


From d355a08e28691cc057f851a3cf9bf016bdfb3c39 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@linux-m68k.org>
Date: Wed, 31 Aug 2022 11:25:39 +1000
Subject: m68k: Allow kexec on M68KCLASSIC with MMU enabled only

None of the no-MMU CPU types (classic or ColdFire) or ColdFire with
MMU enabled have support for kexec yet, so the configuration as it
stands is not quite right, and it will fail to compile on them.

Make CONFIG_KEXEC depend on both CONFIG_M68KCLASSIC and CONFIG_MMU.

Signed-off-by: Greg Ungerer <gerg@linux-m68k.org>
Link: https://lore.kernel.org/r/20220831012539.3416470-1-gerg@linux-m68k.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index b06faf6c0b27..7bff88118507 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -87,7 +87,7 @@ config MMU_SUN3
 
 config KEXEC
 	bool "kexec system call"
-	depends on M68KCLASSIC
+	depends on M68KCLASSIC && MMU
 	select KEXEC_CORE
 	help
 	  kexec is a system call that implements the ability to shutdown your
-- 
cgit v1.2.3


From 15f9042311b0ab9c2904d50428f304a6229d6a02 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 29 Aug 2022 12:00:39 +0200
Subject: m68k: defconfig: Update defconfigs for v6.0-rc2

  - Drop CONFIG_NF_CONNTRACK_PROCFS=n (no longer auto-enabled since
    commit aa5762c34213aba7 ("netfilter: conntrack: NF_CONNTRACK_PROCFS
    should no longer default to y")),
  - Enable modular build of the new HCTR2 and ARIA crypto algorithms,
  - Drop CONFIG_CRYPTO_BLAKE2S=m (removed in commit 2d16803c562ecc64
    ("crypto blake2s - remove shash module")).

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/01a6abf7bc23a478d6a6c6801c6812ae96714620.1660559393.git.geert@linux-m68k.org
Link: https://lore.kernel.org/r/1fbe623e5d69bdce0352445c06e1b2c9fb8d123e.1661767119.git.geert@linux-m68k.org
---
 arch/m68k/configs/amiga_defconfig    | 4 ++--
 arch/m68k/configs/apollo_defconfig   | 4 ++--
 arch/m68k/configs/atari_defconfig    | 4 ++--
 arch/m68k/configs/bvme6000_defconfig | 4 ++--
 arch/m68k/configs/hp300_defconfig    | 4 ++--
 arch/m68k/configs/mac_defconfig      | 4 ++--
 arch/m68k/configs/multi_defconfig    | 4 ++--
 arch/m68k/configs/mvme147_defconfig  | 4 ++--
 arch/m68k/configs/mvme16x_defconfig  | 4 ++--
 arch/m68k/configs/q40_defconfig      | 4 ++--
 arch/m68k/configs/sun3_defconfig     | 4 ++--
 arch/m68k/configs/sun3x_defconfig    | 4 ++--
 12 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index a6a886a89be2..e2038d9499e4 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -84,7 +84,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -573,9 +572,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -594,6 +593,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index bffd24c2755e..ddd201259e43 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -80,7 +80,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -530,9 +529,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -551,6 +550,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index 0013425b1e08..d9f783707387 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -87,7 +87,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -550,9 +549,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -571,6 +570,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 42d969697f7f..68957c6bcff1 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -77,7 +77,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -522,9 +521,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -543,6 +542,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 97d6d9acb395..825c6a02fa9d 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -79,7 +79,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -532,9 +531,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -553,6 +552,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 8cbfc1c659a3..17f64c562bf1 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -552,9 +551,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -573,6 +572,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 9f45fe60757f..f5f4c572b694 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -98,7 +98,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -638,9 +637,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -659,6 +658,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 4736cfacf6a2..b4a0bbef7e39 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -76,7 +76,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -521,9 +520,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -542,6 +541,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 638cd38aa7d2..c6a6d5926793 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -77,7 +77,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -522,9 +521,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -543,6 +542,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index ec8b6bb70ebd..49c9c89f0caf 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -78,7 +78,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -539,9 +538,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -560,6 +559,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 7d8dc578d59c..9b44eeb9c07f 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -74,7 +74,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -521,9 +520,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -542,6 +541,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 96290aee5302..d2ffb0a65b44 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -74,7 +74,6 @@ CONFIG_NETFILTER=y
 CONFIG_NETFILTER_NETLINK_HOOK=m
 CONFIG_NF_CONNTRACK=m
 CONFIG_NF_CONNTRACK_ZONES=y
-# CONFIG_NF_CONNTRACK_PROCFS is not set
 # CONFIG_NF_CT_PROTO_DCCP is not set
 CONFIG_NF_CONNTRACK_AMANDA=m
 CONFIG_NF_CONNTRACK_FTP=m
@@ -520,9 +519,9 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_KEYWRAP=m
 CONFIG_CRYPTO_ADIANTUM=m
+CONFIG_CRYPTO_HCTR2=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
-CONFIG_CRYPTO_BLAKE2S=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_RMD160=m
@@ -541,6 +540,7 @@ CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_ARIA=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_SM4_GENERIC=m
 CONFIG_CRYPTO_TEA=m
-- 
cgit v1.2.3


From 1e6d11fe72e311c1989991ee318d239f650fa318 Mon Sep 17 00:00:00 2001
From: Lin Yujun <linyujun809@huawei.com>
Date: Wed, 14 Sep 2022 11:28:07 +0800
Subject: MIPS: SGI-IP30: Fix platform-device leak in bridge_platform_create()

In error case in bridge_platform_create after calling
platform_device_add()/platform_device_add_data()/
platform_device_add_resources(), release the failed
'pdev' or it will be leak, call platform_device_put()
to fix this problem.

Besides, 'pdev' is divided into 'pdev_wd' and 'pdev_bd',
use platform_device_unregister() to release sgi_w1
resources when xtalk-bridge registration fails.

Fixes: fd27234f24ae ("MIPS: add support for SGI Octane (IP30)")
Signed-off-by: Lin Yujun <linyujun809@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/sgi-ip30/ip30-xtalk.c | 70 +++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/arch/mips/sgi-ip30/ip30-xtalk.c b/arch/mips/sgi-ip30/ip30-xtalk.c
index 8129524421cb..7ceb2b23ea1c 100644
--- a/arch/mips/sgi-ip30/ip30-xtalk.c
+++ b/arch/mips/sgi-ip30/ip30-xtalk.c
@@ -40,12 +40,15 @@ static void bridge_platform_create(int widget, int masterwid)
 {
 	struct xtalk_bridge_platform_data *bd;
 	struct sgi_w1_platform_data *wd;
-	struct platform_device *pdev;
+	struct platform_device *pdev_wd;
+	struct platform_device *pdev_bd;
 	struct resource w1_res;
 
 	wd = kzalloc(sizeof(*wd), GFP_KERNEL);
-	if (!wd)
-		goto no_mem;
+	if (!wd) {
+		pr_warn("xtalk:%x bridge create out of memory\n", widget);
+		return;
+	}
 
 	snprintf(wd->dev_id, sizeof(wd->dev_id), "bridge-%012lx",
 		 IP30_SWIN_BASE(widget));
@@ -56,24 +59,35 @@ static void bridge_platform_create(int widget, int masterwid)
 	w1_res.end = w1_res.start + 3;
 	w1_res.flags = IORESOURCE_MEM;
 
-	pdev = platform_device_alloc("sgi_w1", PLATFORM_DEVID_AUTO);
-	if (!pdev) {
-		kfree(wd);
-		goto no_mem;
+	pdev_wd = platform_device_alloc("sgi_w1", PLATFORM_DEVID_AUTO);
+	if (!pdev_wd) {
+		pr_warn("xtalk:%x bridge create out of memory\n", widget);
+		goto err_kfree_wd;
+	}
+	if (platform_device_add_resources(pdev_wd, &w1_res, 1)) {
+		pr_warn("xtalk:%x bridge failed to add platform resources.\n", widget);
+		goto err_put_pdev_wd;
+	}
+	if (platform_device_add_data(pdev_wd, wd, sizeof(*wd))) {
+		pr_warn("xtalk:%x bridge failed to add platform data.\n", widget);
+		goto err_put_pdev_wd;
+	}
+	if (platform_device_add(pdev_wd)) {
+		pr_warn("xtalk:%x bridge failed to add platform device.\n", widget);
+		goto err_put_pdev_wd;
 	}
-	platform_device_add_resources(pdev, &w1_res, 1);
-	platform_device_add_data(pdev, wd, sizeof(*wd));
 	/* platform_device_add_data() duplicates the data */
 	kfree(wd);
-	platform_device_add(pdev);
 
 	bd = kzalloc(sizeof(*bd), GFP_KERNEL);
-	if (!bd)
-		goto no_mem;
-	pdev = platform_device_alloc("xtalk-bridge", PLATFORM_DEVID_AUTO);
-	if (!pdev) {
-		kfree(bd);
-		goto no_mem;
+	if (!bd) {
+		pr_warn("xtalk:%x bridge create out of memory\n", widget);
+		goto err_unregister_pdev_wd;
+	}
+	pdev_bd = platform_device_alloc("xtalk-bridge", PLATFORM_DEVID_AUTO);
+	if (!pdev_bd) {
+		pr_warn("xtalk:%x bridge create out of memory\n", widget);
+		goto err_kfree_bd;
 	}
 
 	bd->bridge_addr	= IP30_RAW_SWIN_BASE(widget);
@@ -93,15 +107,31 @@ static void bridge_platform_create(int widget, int masterwid)
 	bd->io.flags	= IORESOURCE_IO;
 	bd->io_offset	= IP30_SWIN_BASE(widget);
 
-	platform_device_add_data(pdev, bd, sizeof(*bd));
+	if (platform_device_add_data(pdev_bd, bd, sizeof(*bd))) {
+		pr_warn("xtalk:%x bridge failed to add platform data.\n", widget);
+		goto err_put_pdev_bd;
+	}
+	if (platform_device_add(pdev_bd)) {
+		pr_warn("xtalk:%x bridge failed to add platform device.\n", widget);
+		goto err_put_pdev_bd;
+	}
 	/* platform_device_add_data() duplicates the data */
 	kfree(bd);
-	platform_device_add(pdev);
 	pr_info("xtalk:%x bridge widget\n", widget);
 	return;
 
-no_mem:
-	pr_warn("xtalk:%x bridge create out of memory\n", widget);
+err_put_pdev_bd:
+	platform_device_put(pdev_bd);
+err_kfree_bd:
+	kfree(bd);
+err_unregister_pdev_wd:
+	platform_device_unregister(pdev_wd);
+	return;
+err_put_pdev_wd:
+	platform_device_put(pdev_wd);
+err_kfree_wd:
+	kfree(wd);
+	return;
 }
 
 static unsigned int __init xbow_widget_active(s8 wid)
-- 
cgit v1.2.3


From 11bec9cba4de06b3c0e9e4041453c2caaa1cbec1 Mon Sep 17 00:00:00 2001
From: Lin Yujun <linyujun809@huawei.com>
Date: Wed, 14 Sep 2022 11:29:17 +0800
Subject: MIPS: SGI-IP27: Fix platform-device leak in bridge_platform_create()

In error case in bridge_platform_create after calling
platform_device_add()/platform_device_add_data()/
platform_device_add_resources(), release the failed
'pdev' or it will be leak, call platform_device_put()
to fix this problem.

Besides, 'pdev' is divided into 'pdev_wd' and 'pdev_bd',
use platform_device_unregister() to release sgi_w1
resources when xtalk-bridge registration fails.

Fixes: 5dc76a96e95a ("MIPS: PCI: use information from 1-wire PROM for IOC3 detection")
Signed-off-by: Lin Yujun <linyujun809@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/sgi-ip27/ip27-xtalk.c | 70 +++++++++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/arch/mips/sgi-ip27/ip27-xtalk.c b/arch/mips/sgi-ip27/ip27-xtalk.c
index e762886d1dda..5143d1cf8984 100644
--- a/arch/mips/sgi-ip27/ip27-xtalk.c
+++ b/arch/mips/sgi-ip27/ip27-xtalk.c
@@ -27,15 +27,18 @@ static void bridge_platform_create(nasid_t nasid, int widget, int masterwid)
 {
 	struct xtalk_bridge_platform_data *bd;
 	struct sgi_w1_platform_data *wd;
-	struct platform_device *pdev;
+	struct platform_device *pdev_wd;
+	struct platform_device *pdev_bd;
 	struct resource w1_res;
 	unsigned long offset;
 
 	offset = NODE_OFFSET(nasid);
 
 	wd = kzalloc(sizeof(*wd), GFP_KERNEL);
-	if (!wd)
-		goto no_mem;
+	if (!wd) {
+		pr_warn("xtalk:n%d/%x bridge create out of memory\n", nasid, widget);
+		return;
+	}
 
 	snprintf(wd->dev_id, sizeof(wd->dev_id), "bridge-%012lx",
 		 offset + (widget << SWIN_SIZE_BITS));
@@ -46,24 +49,35 @@ static void bridge_platform_create(nasid_t nasid, int widget, int masterwid)
 	w1_res.end = w1_res.start + 3;
 	w1_res.flags = IORESOURCE_MEM;
 
-	pdev = platform_device_alloc("sgi_w1", PLATFORM_DEVID_AUTO);
-	if (!pdev) {
-		kfree(wd);
-		goto no_mem;
+	pdev_wd = platform_device_alloc("sgi_w1", PLATFORM_DEVID_AUTO);
+	if (!pdev_wd) {
+		pr_warn("xtalk:n%d/%x bridge create out of memory\n", nasid, widget);
+		goto err_kfree_wd;
+	}
+	if (platform_device_add_resources(pdev_wd, &w1_res, 1)) {
+		pr_warn("xtalk:n%d/%x bridge failed to add platform resources.\n", nasid, widget);
+		goto err_put_pdev_wd;
+	}
+	if (platform_device_add_data(pdev_wd, wd, sizeof(*wd))) {
+		pr_warn("xtalk:n%d/%x bridge failed to add platform data.\n", nasid, widget);
+		goto err_put_pdev_wd;
+	}
+	if (platform_device_add(pdev_wd)) {
+		pr_warn("xtalk:n%d/%x bridge failed to add platform device.\n", nasid, widget);
+		goto err_put_pdev_wd;
 	}
-	platform_device_add_resources(pdev, &w1_res, 1);
-	platform_device_add_data(pdev, wd, sizeof(*wd));
 	/* platform_device_add_data() duplicates the data */
 	kfree(wd);
-	platform_device_add(pdev);
 
 	bd = kzalloc(sizeof(*bd), GFP_KERNEL);
-	if (!bd)
-		goto no_mem;
-	pdev = platform_device_alloc("xtalk-bridge", PLATFORM_DEVID_AUTO);
-	if (!pdev) {
-		kfree(bd);
-		goto no_mem;
+	if (!bd) {
+		pr_warn("xtalk:n%d/%x bridge create out of memory\n", nasid, widget);
+		goto err_unregister_pdev_wd;
+	}
+	pdev_bd = platform_device_alloc("xtalk-bridge", PLATFORM_DEVID_AUTO);
+	if (!pdev_bd) {
+		pr_warn("xtalk:n%d/%x bridge create out of memory\n", nasid, widget);
+		goto err_kfree_bd;
 	}
 
 
@@ -84,15 +98,31 @@ static void bridge_platform_create(nasid_t nasid, int widget, int masterwid)
 	bd->io.flags	= IORESOURCE_IO;
 	bd->io_offset	= offset;
 
-	platform_device_add_data(pdev, bd, sizeof(*bd));
+	if (platform_device_add_data(pdev_bd, bd, sizeof(*bd))) {
+		pr_warn("xtalk:n%d/%x bridge failed to add platform data.\n", nasid, widget);
+		goto err_put_pdev_bd;
+	}
+	if (platform_device_add(pdev_bd)) {
+		pr_warn("xtalk:n%d/%x bridge failed to add platform device.\n", nasid, widget);
+		goto err_put_pdev_bd;
+	}
 	/* platform_device_add_data() duplicates the data */
 	kfree(bd);
-	platform_device_add(pdev);
 	pr_info("xtalk:n%d/%x bridge widget\n", nasid, widget);
 	return;
 
-no_mem:
-	pr_warn("xtalk:n%d/%x bridge create out of memory\n", nasid, widget);
+err_put_pdev_bd:
+	platform_device_put(pdev_bd);
+err_kfree_bd:
+	kfree(bd);
+err_unregister_pdev_wd:
+	platform_device_unregister(pdev_wd);
+	return;
+err_put_pdev_wd:
+	platform_device_put(pdev_wd);
+err_kfree_wd:
+	kfree(wd);
+	return;
 }
 
 static int probe_one_port(nasid_t nasid, int widget, int masterwid)
-- 
cgit v1.2.3


From b971612cc193b524b554fe53821f4dbb2a774c01 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Wed, 14 Sep 2022 11:10:45 +0200
Subject: mips: bmips: bcm63268: add TWD block binding
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TWD is an MFD block that contains timers, watchdog & some clocks / reset
controller.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/boot/dts/brcm/bcm63268.dtsi | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/mips/boot/dts/brcm/bcm63268.dtsi b/arch/mips/boot/dts/brcm/bcm63268.dtsi
index c3ce49ec675f..8926417a8fbc 100644
--- a/arch/mips/boot/dts/brcm/bcm63268.dtsi
+++ b/arch/mips/boot/dts/brcm/bcm63268.dtsi
@@ -105,14 +105,20 @@
 			interrupts = <2>, <3>;
 		};
 
-		wdt: watchdog@1000009c {
-			compatible = "brcm,bcm7038-wdt";
-			reg = <0x1000009c 0xc>;
+		timer-mfd@10000080 {
+			compatible = "brcm,bcm7038-twd", "simple-mfd", "syscon";
+			reg = <0x10000080 0x30>;
+			ranges = <0x0 0x10000080 0x30>;
 
-			clocks = <&periph_osc>;
-			clock-names = "refclk";
+			wdt: watchdog@1c {
+				compatible = "brcm,bcm7038-wdt";
+				reg = <0x1c 0xc>;
 
-			timeout-sec = <30>;
+				clocks = <&periph_osc>;
+				clock-names = "refclk";
+
+				timeout-sec = <30>;
+			};
 		};
 
 		uart0: serial@10000180 {
-- 
cgit v1.2.3


From aaec03d331303494227dc1e208eb212682fe925d Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:51 +0800
Subject: MIPS: Loongson2ef: remove orphan sbx00_acpi_init() declaration

All uses of sbx00_acpi_init() in the "pci.c" file were removed by
commit 5831fdb099dd ("MIPS: Loongson2ef: clean up loongson64
related code"), so remove the orphan declaration, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/loongson2ef/common/pci.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mips/loongson2ef/common/pci.c b/arch/mips/loongson2ef/common/pci.c
index 200916925e95..7d9ea51e8c01 100644
--- a/arch/mips/loongson2ef/common/pci.c
+++ b/arch/mips/loongson2ef/common/pci.c
@@ -73,8 +73,6 @@ static void __init setup_pcimap(void)
 #endif
 }
 
-extern int sbx00_acpi_init(void);
-
 static int __init pcibios_init(void)
 {
 	setup_pcimap();
-- 
cgit v1.2.3


From df1a727fa2f67369115574b02a01c140440c5a72 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:52 +0800
Subject: MIPS: Octeon: remove orphan octeon_swiotlb declaration

All uses of octeon_swiotlb were removed by
commit f39f8d0eb081 ("MIPS/octeon: use swiotlb_init instead
of open coding it"), so remove the orphan declaration, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/octeon/pci-octeon.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mips/include/asm/octeon/pci-octeon.h b/arch/mips/include/asm/octeon/pci-octeon.h
index b12d9a3fbfb6..2f46f6c6e3d0 100644
--- a/arch/mips/include/asm/octeon/pci-octeon.h
+++ b/arch/mips/include/asm/octeon/pci-octeon.h
@@ -64,6 +64,4 @@ enum octeon_dma_bar_type {
 extern enum octeon_dma_bar_type octeon_dma_bar_type;
 
 void octeon_pci_dma_init(void);
-extern char *octeon_swiotlb;
-
 #endif
-- 
cgit v1.2.3


From c5f60e1cbb7a4f37bd158f4046f9c049ee235230 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:53 +0800
Subject: MIPS: Octeon: remove orphan cvmx_fpa_setup_pool() declaration

All uses of cvmx_fpa_setup_pool() were removed by
commit a03822ea5df6 ("MIPS: OCTEON: Remove some unused files."),
so remove the orphan declaration, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/octeon/cvmx-fpa.h | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/arch/mips/include/asm/octeon/cvmx-fpa.h b/arch/mips/include/asm/octeon/cvmx-fpa.h
index 29ae63606ab4..f6dfcca97f19 100644
--- a/arch/mips/include/asm/octeon/cvmx-fpa.h
+++ b/arch/mips/include/asm/octeon/cvmx-fpa.h
@@ -263,26 +263,6 @@ static inline void cvmx_fpa_free(void *ptr, uint64_t pool,
 	cvmx_write_io(newptr.u64, num_cache_lines);
 }
 
-/**
- * Setup a FPA pool to control a new block of memory.
- * This can only be called once per pool. Make sure proper
- * locking enforces this.
- *
- * @pool:	Pool to initialize
- *		     0 <= pool < 8
- * @name:	Constant character string to name this pool.
- *		     String is not copied.
- * @buffer:	Pointer to the block of memory to use. This must be
- *		     accessible by all processors and external hardware.
- * @block_size: Size for each block controlled by the FPA
- * @num_blocks: Number of blocks
- *
- * Returns 0 on Success,
- *	   -1 on failure
- */
-extern int cvmx_fpa_setup_pool(uint64_t pool, const char *name, void *buffer,
-			       uint64_t block_size, uint64_t num_blocks);
-
 /**
  * Shutdown a Memory pool and validate that it had all of
  * the buffers originally placed in it. This should only be
-- 
cgit v1.2.3


From 2af06ba7ee2fb80e1ededade7cda85cc04c51109 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:54 +0800
Subject: MIPS: Octeon: remove orphan octeon_hal_setup_reserved32() declaration

All uses of octeon_hal_setup_reserved32() were removed by
commit 1ef2887030db ("MIPS: Octeon: Remove vestiges of
CONFIG_CAVIUM_RESERVE32_USE_WIRED_TLB"), so remove the
orphan declaration, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/octeon/octeon.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/mips/include/asm/octeon/octeon.h b/arch/mips/include/asm/octeon/octeon.h
index 7e714aefc76d..5c1d726c702f 100644
--- a/arch/mips/include/asm/octeon/octeon.h
+++ b/arch/mips/include/asm/octeon/octeon.h
@@ -43,7 +43,6 @@ extern int octeon_get_southbridge_interrupt(void);
 extern int octeon_get_boot_coremask(void);
 extern int octeon_get_boot_num_arguments(void);
 extern const char *octeon_get_boot_argument(int arg);
-extern void octeon_hal_setup_reserved32(void);
 extern void octeon_user_io_init(void);
 
 extern void octeon_init_cvmcount(void);
-- 
cgit v1.2.3


From 0b8b94c384e7eb2363cd90f4d438c41ce93ae530 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:55 +0800
Subject: MIPS: IRQ: remove orphan declarations from
 arch/mips/include/asm/irq.h

All uses of free_irqno() and alloc_legacy_irqno() were removed
by commit 69a07a41d908 ("MIPS: SGI-IP27: rework HUB interrupts"),
so remove the orphan declarations, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/irq.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 57561e0e6e8d..76941ae572b2 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -64,8 +64,6 @@ extern void arch_init_irq(void);
 extern void spurious_interrupt(void);
 
 extern int allocate_irqno(void);
-extern void alloc_legacy_irqno(void);
-extern void free_irqno(unsigned int irq);
 
 /*
  * Before R2 the timer and performance counter interrupts were both fixed to
-- 
cgit v1.2.3


From b7dc37fcf67b6191da5049938c8a1c881b4992df Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:56 +0800
Subject: MIPS: remove orphan sni_cpu_time_init() declaration

All uses of sni_cpu_time_init() were removed by
commit 4b550488f894 ("[MIPS] Deforest the function pointer jungle
in the time code."), so remove the orphan declaration, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/sni.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/mips/include/asm/sni.h b/arch/mips/include/asm/sni.h
index 7dfa297ce597..7fb6656a6bfd 100644
--- a/arch/mips/include/asm/sni.h
+++ b/arch/mips/include/asm/sni.h
@@ -226,9 +226,6 @@ extern void sni_pcit_cplus_irq_init(void);
 extern void sni_rm200_irq_init(void);
 extern void sni_pcimt_irq_init(void);
 
-/* timer inits */
-extern void sni_cpu_time_init(void);
-
 /* eisa init for RM200/400 */
 #ifdef CONFIG_EISA
 extern int sni_eisa_root_init(void);
-- 
cgit v1.2.3


From 5c5a4682e96a424c7d6b071b5832b5068ce29107 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Thu, 15 Sep 2022 21:21:57 +0800
Subject: MIPS: AR7: remove orphan declarations from
 arch/mips/include/asm/mach-ar7/ar7.h

All uses of ar7_cpu_clock, ar7_bus_clock and ar7_dsp_clock were
removed by commit 780019ddf02f ("MIPS: AR7: Implement clock API"),
so remove the orphan declarations, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/mach-ar7/ar7.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mips/include/asm/mach-ar7/ar7.h b/arch/mips/include/asm/mach-ar7/ar7.h
index cbe75ade3277..1e8621a6afa3 100644
--- a/arch/mips/include/asm/mach-ar7/ar7.h
+++ b/arch/mips/include/asm/mach-ar7/ar7.h
@@ -104,8 +104,6 @@ struct plat_dsl_data {
 	int reset_bit_sar;
 };
 
-extern int ar7_cpu_clock, ar7_bus_clock, ar7_dsp_clock;
-
 static inline int ar7_is_titan(void)
 {
 	return (readl((void *)KSEG1ADDR(AR7_REGS_GPIO + 0x24)) & 0xffff) ==
-- 
cgit v1.2.3


From 6f674034b467dc231c99ea34abf4a3f6434ff766 Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Tue, 13 Sep 2022 18:10:36 +0800
Subject: mips: cavium: convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/cavium-octeon/oct_ilm.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/arch/mips/cavium-octeon/oct_ilm.c b/arch/mips/cavium-octeon/oct_ilm.c
index 6a4694538bb6..dc05262e85ff 100644
--- a/arch/mips/cavium-octeon/oct_ilm.c
+++ b/arch/mips/cavium-octeon/oct_ilm.c
@@ -28,7 +28,7 @@ struct latency_info {
 static struct latency_info li;
 static struct dentry *dir;
 
-static int show_latency(struct seq_file *m, void *v)
+static int oct_ilm_show(struct seq_file *m, void *v)
 {
 	u64 cpuclk, avg, max, min;
 	struct latency_info curr_li = li;
@@ -43,18 +43,7 @@ static int show_latency(struct seq_file *m, void *v)
 		   curr_li.interrupt_cnt, avg, max, min);
 	return 0;
 }
-
-static int oct_ilm_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_latency, NULL);
-}
-
-static const struct file_operations oct_ilm_ops = {
-	.open = oct_ilm_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(oct_ilm);
 
 static int reset_statistics(void *data, u64 value)
 {
@@ -67,7 +56,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(reset_statistics_ops, NULL, reset_statistics, "%llu\n")
 static void init_debugfs(void)
 {
 	dir = debugfs_create_dir("oct_ilm", 0);
-	debugfs_create_file("statistics", 0222, dir, NULL, &oct_ilm_ops);
+	debugfs_create_file("statistics", 0222, dir, NULL, &oct_ilm_fops);
 	debugfs_create_file("reset", 0222, dir, NULL, &reset_statistics_ops);
 }
 
-- 
cgit v1.2.3


From 27a43a85bcfe5cd9ee0fa561d7ec9aead462166d Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Tue, 13 Sep 2022 18:10:37 +0800
Subject: mips: kernel: convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/segment.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/arch/mips/kernel/segment.c b/arch/mips/kernel/segment.c
index 0a9bd7b0983b..24560501c70d 100644
--- a/arch/mips/kernel/segment.c
+++ b/arch/mips/kernel/segment.c
@@ -46,7 +46,7 @@ static void build_segment_config(char *str, unsigned int cfg)
 		((cfg & MIPS_SEGCFG_EU) >> MIPS_SEGCFG_EU_SHIFT));
 }
 
-static int show_segments(struct seq_file *m, void *v)
+static int segments_show(struct seq_file *m, void *v)
 {
 	unsigned int segcfg;
 	char str[42];
@@ -80,18 +80,7 @@ static int show_segments(struct seq_file *m, void *v)
 
 	return 0;
 }
-
-static int segments_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, show_segments, NULL);
-}
-
-static const struct file_operations segments_fops = {
-	.open		= segments_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(segments);
 
 static int __init segments_info(void)
 {
-- 
cgit v1.2.3


From 53f9cd5c16a57076ba1ef01c62299fa899bd83b3 Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Tue, 13 Sep 2022 18:10:38 +0800
Subject: mips: ralink: convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/ralink/bootrom.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/arch/mips/ralink/bootrom.c b/arch/mips/ralink/bootrom.c
index 94ca8379b83c..8c8cc0a81ed8 100644
--- a/arch/mips/ralink/bootrom.c
+++ b/arch/mips/ralink/bootrom.c
@@ -18,22 +18,11 @@ static int bootrom_show(struct seq_file *s, void *unused)
 
 	return 0;
 }
-
-static int bootrom_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, bootrom_show, NULL);
-}
-
-static const struct file_operations bootrom_file_ops = {
-	.open		= bootrom_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bootrom);
 
 static int __init bootrom_setup(void)
 {
-	debugfs_create_file("bootrom", 0444, NULL, NULL, &bootrom_file_ops);
+	debugfs_create_file("bootrom", 0444, NULL, NULL, &bootrom_fops);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0dedcf6e3301836eb70cfa649052e7ce4fcd13ba Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 7 Sep 2022 16:05:56 -0700
Subject: MIPS: BCM47XX: Cast memcmp() of function to (void *)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Clang is especially sensitive about argument type matching when using
__overloaded functions (like memcmp(), etc). Help it see that function
pointers are just "void *". Avoids this error:

arch/mips/bcm47xx/prom.c:89:8: error: no matching function for call to 'memcmp'
                   if (!memcmp(prom_init, prom_init + mem, 32))
                        ^~~~~~
include/linux/string.h:156:12: note: candidate function not viable: no known conversion from 'void (void)' to 'const void *' for 1st argument extern int memcmp(const void *,const void *,__kernel_size_t);

Cc: Hauke Mehrtens <hauke@hauke-m.de>
Cc: "Rafał Miłecki" <zajec5@gmail.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: linux-mips@vger.kernel.org
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: llvm@lists.linux.dev
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/lkml/202209080652.sz2d68e5-lkp@intel.com
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/bcm47xx/prom.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/bcm47xx/prom.c b/arch/mips/bcm47xx/prom.c
index ab203e66ba0d..a9bea411d928 100644
--- a/arch/mips/bcm47xx/prom.c
+++ b/arch/mips/bcm47xx/prom.c
@@ -86,7 +86,7 @@ static __init void prom_init_mem(void)
 			pr_debug("Assume 128MB RAM\n");
 			break;
 		}
-		if (!memcmp(prom_init, prom_init + mem, 32))
+		if (!memcmp((void *)prom_init, (void *)prom_init + mem, 32))
 			break;
 	}
 	lowmem = mem;
@@ -159,7 +159,7 @@ void __init bcm47xx_prom_highmem_init(void)
 
 	off = EXTVBASE + __pa(off);
 	for (extmem = 128 << 20; extmem < 512 << 20; extmem <<= 1) {
-		if (!memcmp(prom_init, (void *)(off + extmem), 16))
+		if (!memcmp((void *)prom_init, (void *)(off + extmem), 16))
 			break;
 	}
 	extmem -= lowmem;
-- 
cgit v1.2.3


From 5db72fdb74983a1e81331aadf99ae2305f277562 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:40 +0300
Subject: ACPI: utils: Add acpi_dev_uid_to_integer() helper to get _UID as
 integer

Some users interpret _UID only as integer and for them it's easier to
have an integer representation of _UID. Add respective helper for that.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c    | 24 ++++++++++++++++++++++++
 include/acpi/acpi_bus.h |  1 +
 include/linux/acpi.h    |  5 +++++
 3 files changed, 30 insertions(+)

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 5a7b8065e77f..febf9b8da3a0 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -793,6 +793,30 @@ bool acpi_dev_hid_uid_match(struct acpi_device *adev,
 }
 EXPORT_SYMBOL(acpi_dev_hid_uid_match);
 
+/**
+ * acpi_dev_uid_to_integer - treat ACPI device _UID as integer
+ * @adev: ACPI device to get _UID from
+ * @integer: output buffer for integer
+ *
+ * Considers _UID as integer and converts it to @integer.
+ *
+ * Returns 0 on success, or negative error code otherwise.
+ */
+int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer)
+{
+	const char *uid;
+
+	if (!adev)
+		return -ENODEV;
+
+	uid = acpi_device_uid(adev);
+	if (!uid)
+		return -ENODATA;
+
+	return kstrtou64(uid, 0, integer);
+}
+EXPORT_SYMBOL(acpi_dev_uid_to_integer);
+
 /**
  * acpi_dev_found - Detect presence of a given ACPI device in the namespace.
  * @hid: Hardware ID of the device.
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index e7d27373ff71..bd0db916f330 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -733,6 +733,7 @@ static inline bool acpi_device_can_poweroff(struct acpi_device *adev)
 }
 
 bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2);
+int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer);
 
 void acpi_dev_clear_dependencies(struct acpi_device *supplier);
 bool acpi_dev_ready_for_enumeration(const struct acpi_device *device);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6f64b2f3dc54..9434db02cb60 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -798,6 +798,11 @@ acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *u
 	return false;
 }
 
+static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer)
+{
+	return -ENODEV;
+}
+
 static inline struct acpi_device *
 acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv)
 {
-- 
cgit v1.2.3


From 2a036e489eb1571810126d6fa47bd8af1e237c08 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:41 +0300
Subject: ACPI: LPSS: Refactor _UID handling to use acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_lpss.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index c4d4d21391d7..4d415e210c32 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -167,10 +167,10 @@ static struct pwm_lookup byt_pwm_lookup[] = {
 
 static void byt_pwm_setup(struct lpss_private_data *pdata)
 {
-	struct acpi_device *adev = pdata->adev;
+	u64 uid;
 
 	/* Only call pwm_add_table for the first PWM controller */
-	if (!adev->pnp.unique_id || strcmp(adev->pnp.unique_id, "1"))
+	if (acpi_dev_uid_to_integer(pdata->adev, &uid) || uid != 1)
 		return;
 
 	pwm_add_table(byt_pwm_lookup, ARRAY_SIZE(byt_pwm_lookup));
@@ -180,14 +180,13 @@ static void byt_pwm_setup(struct lpss_private_data *pdata)
 
 static void byt_i2c_setup(struct lpss_private_data *pdata)
 {
-	const char *uid_str = acpi_device_uid(pdata->adev);
 	acpi_handle handle = pdata->adev->handle;
 	unsigned long long shared_host = 0;
 	acpi_status status;
-	long uid = 0;
+	u64 uid;
 
-	/* Expected to always be true, but better safe then sorry */
-	if (uid_str && !kstrtol(uid_str, 10, &uid) && uid) {
+	/* Expected to always be successfull, but better safe then sorry */
+	if (!acpi_dev_uid_to_integer(pdata->adev, &uid) && uid) {
 		/* Detect I2C bus shared with PUNIT and ignore its d3 status */
 		status = acpi_evaluate_integer(handle, "_SEM", NULL, &shared_host);
 		if (ACPI_SUCCESS(status) && shared_host)
@@ -211,10 +210,10 @@ static struct pwm_lookup bsw_pwm_lookup[] = {
 
 static void bsw_pwm_setup(struct lpss_private_data *pdata)
 {
-	struct acpi_device *adev = pdata->adev;
+	u64 uid;
 
 	/* Only call pwm_add_table for the first PWM controller */
-	if (!adev->pnp.unique_id || strcmp(adev->pnp.unique_id, "1"))
+	if (acpi_dev_uid_to_integer(pdata->adev, &uid) || uid != 1)
 		return;
 
 	pwm_add_table(bsw_pwm_lookup, ARRAY_SIZE(bsw_pwm_lookup));
-- 
cgit v1.2.3


From 197a5aeaf6cd475720ca2f6ff1e4ed720da8657f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:42 +0300
Subject: ACPI: x86: Refactor _UID handling to use acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/utils.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c
index 664070fc8349..2764b4778ce7 100644
--- a/drivers/acpi/x86/utils.c
+++ b/drivers/acpi/x86/utils.c
@@ -351,11 +351,17 @@ int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *s
 	struct acpi_device *adev = ACPI_COMPANION(controller_parent);
 	const struct dmi_system_id *dmi_id;
 	long quirks = 0;
+	u64 uid;
+	int ret;
 
 	*skip = false;
 
-	/* !dev_is_platform() to not match on PNP enumerated debug UARTs */
-	if (!adev || !adev->pnp.unique_id || !dev_is_platform(controller_parent))
+	ret = acpi_dev_uid_to_integer(adev, &uid);
+	if (ret)
+		return 0;
+
+	/* to not match on PNP enumerated debug UARTs */
+	if (!dev_is_platform(controller_parent))
 		return 0;
 
 	dmi_id = dmi_first_match(acpi_quirk_skip_dmi_ids);
@@ -363,10 +369,10 @@ int acpi_quirk_skip_serdev_enumeration(struct device *controller_parent, bool *s
 		quirks = (unsigned long)dmi_id->driver_data;
 
 	if (quirks & ACPI_QUIRK_UART1_TTY_UART2_SKIP) {
-		if (!strcmp(adev->pnp.unique_id, "1"))
+		if (uid == 1)
 			return -ENODEV; /* Create tty cdev instead of serdev */
 
-		if (!strcmp(adev->pnp.unique_id, "2"))
+		if (uid == 2)
 			*skip = true;
 	}
 
-- 
cgit v1.2.3


From f4c6752df92af2d4d67bcbdb259b4b387109477c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:43 +0300
Subject: i2c: amd-mp2-plat: Refactor _UID handling to use
 acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Wolfram Sang <wsa@kernel.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/i2c/busses/i2c-amd-mp2-plat.c | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/i2c/busses/i2c-amd-mp2-plat.c b/drivers/i2c/busses/i2c-amd-mp2-plat.c
index 84b7e6cbc67b..423fe0c8a471 100644
--- a/drivers/i2c/busses/i2c-amd-mp2-plat.c
+++ b/drivers/i2c/busses/i2c-amd-mp2-plat.c
@@ -244,14 +244,18 @@ static const struct i2c_adapter_quirks amd_i2c_dev_quirks = {
 
 static int i2c_amd_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	int ret;
 	struct amd_i2c_dev *i2c_dev;
-	struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);
 	struct amd_mp2_dev *mp2_dev;
-	const char *uid;
+	u64 uid;
 
-	if (!adev)
-		return -ENODEV;
+	ret = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &uid);
+	if (ret)
+		return dev_err_probe(dev, ret, "missing UID/bus id!\n");
+	if (uid >= 2)
+		return dev_err_probe(dev, -EINVAL, "incorrect UID/bus id \"%llu\"!\n", uid);
+	dev_dbg(dev, "bus id is %llu\n", uid);
 
 	/* The ACPI namespace doesn't contain information about which MP2 PCI
 	 * device an AMDI0011 ACPI device is related to, so assume that there's
@@ -266,6 +270,7 @@ static int i2c_amd_probe(struct platform_device *pdev)
 	if (!i2c_dev)
 		return -ENOMEM;
 
+	i2c_dev->common.bus_id = uid;
 	i2c_dev->common.mp2_dev = mp2_dev;
 	i2c_dev->pdev = pdev;
 	platform_set_drvdata(pdev, i2c_dev);
@@ -276,20 +281,6 @@ static int i2c_amd_probe(struct platform_device *pdev)
 	i2c_dev->common.resume = &i2c_amd_resume;
 #endif
 
-	uid = adev->pnp.unique_id;
-	if (!uid) {
-		dev_err(&pdev->dev, "missing UID/bus id!\n");
-		return -EINVAL;
-	} else if (strcmp(uid, "0") == 0) {
-		i2c_dev->common.bus_id = 0;
-	} else if (strcmp(uid, "1") == 0) {
-		i2c_dev->common.bus_id = 1;
-	} else {
-		dev_err(&pdev->dev, "incorrect UID/bus id \"%s\"!\n", uid);
-		return -EINVAL;
-	}
-	dev_dbg(&pdev->dev, "bus id is %u\n", i2c_dev->common.bus_id);
-
 	/* Register the adapter */
 	amd_mp2_pm_runtime_get(mp2_dev);
 
-- 
cgit v1.2.3


From 3ddaf13982a06d57347052f1fc386f70ef5b086f Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:44 +0300
Subject: i2c: mlxbf: Refactor _UID handling to use acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Wolfram Sang <wsa@kernel.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/i2c/busses/i2c-mlxbf.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/drivers/i2c/busses/i2c-mlxbf.c b/drivers/i2c/busses/i2c-mlxbf.c
index 8716032f030a..32235c62f3d2 100644
--- a/drivers/i2c/busses/i2c-mlxbf.c
+++ b/drivers/i2c/busses/i2c-mlxbf.c
@@ -2229,35 +2229,27 @@ MODULE_DEVICE_TABLE(acpi, mlxbf_i2c_acpi_ids);
 static int mlxbf_i2c_acpi_probe(struct device *dev, struct mlxbf_i2c_priv *priv)
 {
 	const struct acpi_device_id *aid;
-	struct acpi_device *adev;
-	unsigned long bus_id = 0;
-	const char *uid;
+	u64 bus_id;
 	int ret;
 
 	if (acpi_disabled)
 		return -ENOENT;
 
-	adev = ACPI_COMPANION(dev);
-	if (!adev)
-		return -ENXIO;
-
 	aid = acpi_match_device(mlxbf_i2c_acpi_ids, dev);
 	if (!aid)
 		return -ENODEV;
 
 	priv->chip = (struct mlxbf_i2c_chip_info *)aid->driver_data;
 
-	uid = acpi_device_uid(adev);
-	if (!uid || !(*uid)) {
+	ret = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &bus_id);
+	if (ret) {
 		dev_err(dev, "Cannot retrieve UID\n");
-		return -ENODEV;
+		return ret;
 	}
 
-	ret = kstrtoul(uid, 0, &bus_id);
-	if (!ret)
-		priv->bus = bus_id;
+	priv->bus = bus_id;
 
-	return ret;
+	return 0;
 }
 #else
 static int mlxbf_i2c_acpi_probe(struct device *dev, struct mlxbf_i2c_priv *priv)
-- 
cgit v1.2.3


From 9cde62517f2eba00afb19f096dd783974896b9d9 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:45 +0300
Subject: perf: qcom_l2_pmu: Refactor _UID handling to use
 acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/perf/qcom_l2_pmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/perf/qcom_l2_pmu.c b/drivers/perf/qcom_l2_pmu.c
index 30234c261b05..aaca6db7d8f6 100644
--- a/drivers/perf/qcom_l2_pmu.c
+++ b/drivers/perf/qcom_l2_pmu.c
@@ -840,16 +840,16 @@ static int l2_cache_pmu_probe_cluster(struct device *dev, void *data)
 {
 	struct platform_device *pdev = to_platform_device(dev->parent);
 	struct platform_device *sdev = to_platform_device(dev);
-	struct acpi_device *adev = ACPI_COMPANION(dev);
 	struct l2cache_pmu *l2cache_pmu = data;
 	struct cluster_pmu *cluster;
-	unsigned long fw_cluster_id;
+	u64 fw_cluster_id;
 	int err;
 	int irq;
 
-	if (!adev || kstrtoul(adev->pnp.unique_id, 10, &fw_cluster_id) < 0) {
+	err = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &fw_cluster_id);
+	if (err) {
 		dev_err(&pdev->dev, "unable to read ACPI uid\n");
-		return -ENODEV;
+		return err;
 	}
 
 	cluster = devm_kzalloc(&pdev->dev, sizeof(*cluster), GFP_KERNEL);
@@ -879,7 +879,7 @@ static int l2_cache_pmu_probe_cluster(struct device *dev, void *data)
 	}
 
 	dev_info(&pdev->dev,
-		"Registered L2 cache PMU cluster %ld\n", fw_cluster_id);
+		 "Registered L2 cache PMU cluster %lld\n", fw_cluster_id);
 
 	spin_lock_init(&cluster->pmu_lock);
 
-- 
cgit v1.2.3


From 2990f3a87e94b19b66c3faa16f9dd1d649b93d12 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:46 +0300
Subject: spi: pxa2xx: Refactor _UID handling to use acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Mark Brown <broonie@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/spi/spi-pxa2xx.c | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 838d12e65144..c8e079d7e541 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -1441,31 +1441,6 @@ static const struct of_device_id pxa2xx_spi_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, pxa2xx_spi_of_match);
 
-#ifdef CONFIG_ACPI
-
-static int pxa2xx_spi_get_port_id(struct device *dev)
-{
-	struct acpi_device *adev;
-	unsigned int devid;
-	int port_id = -1;
-
-	adev = ACPI_COMPANION(dev);
-	if (adev && adev->pnp.unique_id &&
-	    !kstrtouint(adev->pnp.unique_id, 0, &devid))
-		port_id = devid;
-	return port_id;
-}
-
-#else /* !CONFIG_ACPI */
-
-static int pxa2xx_spi_get_port_id(struct device *dev)
-{
-	return -1;
-}
-
-#endif /* CONFIG_ACPI */
-
-
 #ifdef CONFIG_PCI
 
 static bool pxa2xx_spi_idma_filter(struct dma_chan *chan, void *param)
@@ -1479,13 +1454,16 @@ static struct pxa2xx_spi_controller *
 pxa2xx_spi_init_pdata(struct platform_device *pdev)
 {
 	struct pxa2xx_spi_controller *pdata;
+	struct device *dev = &pdev->dev;
+	struct device *parent = dev->parent;
 	struct ssp_device *ssp;
 	struct resource *res;
-	struct device *parent = pdev->dev.parent;
 	struct pci_dev *pcidev = dev_is_pci(parent) ? to_pci_dev(parent) : NULL;
 	const struct pci_device_id *pcidev_id = NULL;
 	enum pxa_ssp_type type;
 	const void *match;
+	int status;
+	u64 uid;
 
 	if (pcidev)
 		pcidev_id = pci_match_id(pxa2xx_spi_pci_compound_match, pcidev);
@@ -1529,7 +1507,12 @@ pxa2xx_spi_init_pdata(struct platform_device *pdev)
 
 	ssp->type = type;
 	ssp->dev = &pdev->dev;
-	ssp->port_id = pxa2xx_spi_get_port_id(&pdev->dev);
+
+	status = acpi_dev_uid_to_integer(ACPI_COMPANION(dev), &uid);
+	if (status)
+		ssp->port_id = -1;
+	else
+		ssp->port_id = uid;
 
 	pdata->is_slave = device_property_read_bool(&pdev->dev, "spi-slave");
 	pdata->num_chipselect = 1;
-- 
cgit v1.2.3


From 7fc90e8617095742ec0d948bfee10231c7a09be5 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 13 Sep 2022 19:31:47 +0300
Subject: efi/dev-path-parser: Refactor _UID handling to use
 acpi_dev_uid_to_integer()

ACPI utils provide acpi_dev_uid_to_integer() helper to extract _UID as
an integer. Use it instead of custom approach.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/firmware/efi/dev-path-parser.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/firmware/efi/dev-path-parser.c b/drivers/firmware/efi/dev-path-parser.c
index eb9c65f97841..f80d87c199c3 100644
--- a/drivers/firmware/efi/dev-path-parser.c
+++ b/drivers/firmware/efi/dev-path-parser.c
@@ -15,9 +15,11 @@
 static long __init parse_acpi_path(const struct efi_dev_path *node,
 				   struct device *parent, struct device **child)
 {
-	char hid[ACPI_ID_LEN], uid[11]; /* UINT_MAX + null byte */
 	struct acpi_device *adev;
 	struct device *phys_dev;
+	char hid[ACPI_ID_LEN];
+	u64 uid;
+	int ret;
 
 	if (node->header.length != 12)
 		return -EINVAL;
@@ -27,12 +29,12 @@ static long __init parse_acpi_path(const struct efi_dev_path *node,
 		'A' + ((node->acpi.hid >>  5) & 0x1f) - 1,
 		'A' + ((node->acpi.hid >>  0) & 0x1f) - 1,
 			node->acpi.hid >> 16);
-	sprintf(uid, "%u", node->acpi.uid);
 
 	for_each_acpi_dev_match(adev, hid, NULL, -1) {
-		if (adev->pnp.unique_id && !strcmp(adev->pnp.unique_id, uid))
+		ret = acpi_dev_uid_to_integer(adev, &uid);
+		if (ret == 0 && node->acpi.uid == uid)
 			break;
-		if (!adev->pnp.unique_id && node->acpi.uid == 0)
+		if (ret == -ENODATA && node->acpi.uid == 0)
 			break;
 	}
 	if (!adev)
-- 
cgit v1.2.3


From 2d2cb3066f2c90cd8ca540b36ba7a55e7f2406e0 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 4 Sep 2022 00:32:56 +0900
Subject: Bluetooth: L2CAP: initialize delayed works at l2cap_chan_create()

syzbot is reporting cancel_delayed_work() without INIT_DELAYED_WORK() at
l2cap_chan_del() [1], for CONF_NOT_COMPLETE flag (which meant to prevent
l2cap_chan_del() from calling cancel_delayed_work()) is cleared by timer
which fires before l2cap_chan_del() is called by closing file descriptor
created by socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_L2CAP).

l2cap_bredr_sig_cmd(L2CAP_CONF_REQ) and l2cap_bredr_sig_cmd(L2CAP_CONF_RSP)
are calling l2cap_ertm_init(chan), and they call l2cap_chan_ready() (which
clears CONF_NOT_COMPLETE flag) only when l2cap_ertm_init(chan) succeeded.

l2cap_sock_init() does not call l2cap_ertm_init(chan), and it instead sets
CONF_NOT_COMPLETE flag by calling l2cap_chan_set_defaults(). However, when
connect() is requested, "command 0x0409 tx timeout" happens after 2 seconds
 from connect() request, and CONF_NOT_COMPLETE flag is cleared after 4
seconds from connect() request, for l2cap_conn_start() from
l2cap_info_timeout() callback scheduled by

  schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT);

in l2cap_connect() is calling l2cap_chan_ready().

Fix this problem by initializing delayed works used by L2CAP_MODE_ERTM
mode as soon as l2cap_chan_create() allocates a channel, like I did in
commit be8597239379f0f5 ("Bluetooth: initialize skb_queue_head at
l2cap_chan_create()").

Link: https://syzkaller.appspot.com/bug?extid=83672956c7aa6af698b3 [1]
Reported-by: syzbot <syzbot+83672956c7aa6af698b3@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/l2cap_core.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 2c9de67daadc..770891f68703 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -61,6 +61,9 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err);
 
 static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
 		     struct sk_buff_head *skbs, u8 event);
+static void l2cap_retrans_timeout(struct work_struct *work);
+static void l2cap_monitor_timeout(struct work_struct *work);
+static void l2cap_ack_timeout(struct work_struct *work);
 
 static inline u8 bdaddr_type(u8 link_type, u8 bdaddr_type)
 {
@@ -476,6 +479,9 @@ struct l2cap_chan *l2cap_chan_create(void)
 	write_unlock(&chan_list_lock);
 
 	INIT_DELAYED_WORK(&chan->chan_timer, l2cap_chan_timeout);
+	INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout);
+	INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout);
+	INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout);
 
 	chan->state = BT_OPEN;
 
@@ -3320,10 +3326,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan)
 	chan->rx_state = L2CAP_RX_STATE_RECV;
 	chan->tx_state = L2CAP_TX_STATE_XMIT;
 
-	INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout);
-	INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout);
-	INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout);
-
 	skb_queue_head_init(&chan->srej_q);
 
 	err = l2cap_seq_list_init(&chan->srej_list, chan->tx_win);
-- 
cgit v1.2.3


From deee93d13d385103205879a8a0915036ecd83261 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 2 Sep 2022 20:23:48 +0900
Subject: Bluetooth: use hdev->workqueue when queuing hdev->{cmd,ncmd}_timer
 works

syzbot is reporting attempt to schedule hdev->cmd_work work from system_wq
WQ into hdev->workqueue WQ which is under draining operation [1], for
commit c8efcc2589464ac7 ("workqueue: allow chained queueing during
destruction") does not allow such operation.

The check introduced by commit 877afadad2dce8aa ("Bluetooth: When HCI work
queue is drained, only queue chained work") was incomplete.

Use hdev->workqueue WQ when queuing hdev->{cmd,ncmd}_timer works because
hci_{cmd,ncmd}_timeout() calls queue_work(hdev->workqueue). Also, protect
the queuing operation with RCU read lock in order to avoid calling
queue_delayed_work() after cancel_delayed_work() completed.

Link: https://syzkaller.appspot.com/bug?extid=243b7d89777f90f7613b [1]
Reported-by: syzbot <syzbot+243b7d89777f90f7613b@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 877afadad2dce8aa ("Bluetooth: When HCI work queue is drained, only queue chained work")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_core.c  | 15 +++++++++++++--
 net/bluetooth/hci_event.c |  6 ++++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 3803e54f23c0..66c7cdba0d32 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -597,6 +597,15 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
 
 	/* Cancel these to avoid queueing non-chained pending work */
 	hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+	/* Wait for
+	 *
+	 *    if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+	 *        queue_delayed_work(&hdev->{cmd,ncmd}_timer)
+	 *
+	 * inside RCU section to see the flag or complete scheduling.
+	 */
+	synchronize_rcu();
+	/* Explicitly cancel works in case scheduled after setting the flag. */
 	cancel_delayed_work(&hdev->cmd_timer);
 	cancel_delayed_work(&hdev->ncmd_timer);
 
@@ -4063,12 +4072,14 @@ static void hci_cmd_work(struct work_struct *work)
 			if (res < 0)
 				__hci_cmd_sync_cancel(hdev, -res);
 
+			rcu_read_lock();
 			if (test_bit(HCI_RESET, &hdev->flags) ||
 			    hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
 				cancel_delayed_work(&hdev->cmd_timer);
 			else
-				schedule_delayed_work(&hdev->cmd_timer,
-						      HCI_CMD_TIMEOUT);
+				queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
+						   HCI_CMD_TIMEOUT);
+			rcu_read_unlock();
 		} else {
 			skb_queue_head(&hdev->cmd_q, skb);
 			queue_work(hdev->workqueue, &hdev->cmd_work);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 5acb6fa6d676..ed3e5b251af1 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -3767,16 +3767,18 @@ static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd)
 {
 	cancel_delayed_work(&hdev->cmd_timer);
 
+	rcu_read_lock();
 	if (!test_bit(HCI_RESET, &hdev->flags)) {
 		if (ncmd) {
 			cancel_delayed_work(&hdev->ncmd_timer);
 			atomic_set(&hdev->cmd_cnt, 1);
 		} else {
 			if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
-				schedule_delayed_work(&hdev->ncmd_timer,
-						      HCI_NCMD_TIMEOUT);
+				queue_delayed_work(hdev->workqueue, &hdev->ncmd_timer,
+						   HCI_NCMD_TIMEOUT);
 		}
 	}
+	rcu_read_unlock();
 }
 
 static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data,
-- 
cgit v1.2.3


From 3124d320c22f3f4388d9ac5c8f37eaad0cefd6b1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 29 Aug 2022 23:58:12 +0900
Subject: Bluetooth: hci_{ldisc,serdev}: check percpu_init_rwsem() failure

syzbot is reporting NULL pointer dereference at hci_uart_tty_close() [1],
for rcu_sync_enter() is called without rcu_sync_init() due to
hci_uart_tty_open() ignoring percpu_init_rwsem() failure.

While we are at it, fix that hci_uart_register_device() ignores
percpu_init_rwsem() failure and hci_uart_unregister_device() does not
call percpu_free_rwsem().

Link: https://syzkaller.appspot.com/bug?extid=576dfca25381fb6fbc5f [1]
Reported-by: syzbot <syzbot+576dfca25381fb6fbc5f@syzkaller.appspotmail.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: 67d2f8781b9f00d1 ("Bluetooth: hci_ldisc: Allow sleeping while proto locks are held.")
Fixes: d73e172816652772 ("Bluetooth: hci_serdev: Init hci_uart proto_lock to avoid oops")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/hci_ldisc.c  |  7 +++++--
 drivers/bluetooth/hci_serdev.c | 10 +++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index f537673ede17..865112e96ff9 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -493,6 +493,11 @@ static int hci_uart_tty_open(struct tty_struct *tty)
 		BT_ERR("Can't allocate control structure");
 		return -ENFILE;
 	}
+	if (percpu_init_rwsem(&hu->proto_lock)) {
+		BT_ERR("Can't allocate semaphore structure");
+		kfree(hu);
+		return -ENOMEM;
+	}
 
 	tty->disc_data = hu;
 	hu->tty = tty;
@@ -505,8 +510,6 @@ static int hci_uart_tty_open(struct tty_struct *tty)
 	INIT_WORK(&hu->init_ready, hci_uart_init_work);
 	INIT_WORK(&hu->write_work, hci_uart_write_work);
 
-	percpu_init_rwsem(&hu->proto_lock);
-
 	/* Flush any pending characters in the driver */
 	tty_driver_flush_buffer(tty);
 
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index c0e5f42ec6b7..f16fd79bc02b 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -310,11 +310,12 @@ int hci_uart_register_device(struct hci_uart *hu,
 
 	serdev_device_set_client_ops(hu->serdev, &hci_serdev_client_ops);
 
+	if (percpu_init_rwsem(&hu->proto_lock))
+		return -ENOMEM;
+
 	err = serdev_device_open(hu->serdev);
 	if (err)
-		return err;
-
-	percpu_init_rwsem(&hu->proto_lock);
+		goto err_rwsem;
 
 	err = p->open(hu);
 	if (err)
@@ -389,6 +390,8 @@ err_alloc:
 	p->close(hu);
 err_open:
 	serdev_device_close(hu->serdev);
+err_rwsem:
+	percpu_free_rwsem(&hu->proto_lock);
 	return err;
 }
 EXPORT_SYMBOL_GPL(hci_uart_register_device);
@@ -410,5 +413,6 @@ void hci_uart_unregister_device(struct hci_uart *hu)
 		clear_bit(HCI_UART_PROTO_READY, &hu->flags);
 		serdev_device_close(hu->serdev);
 	}
+	percpu_free_rwsem(&hu->proto_lock);
 }
 EXPORT_SYMBOL_GPL(hci_uart_unregister_device);
-- 
cgit v1.2.3


From a7e85406bdbd0c376f3997e571f7073b9527272e Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 19 Sep 2022 11:57:14 +0800
Subject: selftests/bpf: Add test result messages for
 test_task_storage_map_stress_lookup

Add test result message when test_task_storage_map_stress_lookup()
succeeds or is skipped. The test case can be skipped due to the choose
of preemption model in kernel config, so export skips in test_maps.c and
increase it when needed.

The following is the output of test_maps when the test case succeeds or
is skipped:

  test_task_storage_map_stress_lookup:PASS
  test_maps: OK, 0 SKIPPED

  test_task_storage_map_stress_lookup SKIP (no CONFIG_PREEMPT)
  test_maps: OK, 1 SKIPPED

Fixes: 73b97bc78b32 ("selftests/bpf: Test concurrent updates on bpf_task_storage_busy")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20220919035714.2195144-1-houtao@huaweicloud.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/map_tests/task_storage_map.c | 6 +++++-
 tools/testing/selftests/bpf/test_maps.c                  | 2 +-
 tools/testing/selftests/bpf/test_maps.h                  | 2 ++
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
index 1adc9c292eb2..aac08c85240b 100644
--- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c
+++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
@@ -77,8 +77,11 @@ void test_task_storage_map_stress_lookup(void)
 	CHECK(err, "open_and_load", "error %d\n", err);
 
 	/* Only for a fully preemptible kernel */
-	if (!skel->kconfig->CONFIG_PREEMPT)
+	if (!skel->kconfig->CONFIG_PREEMPT) {
+		printf("%s SKIP (no CONFIG_PREEMPT)\n", __func__);
+		skips++;
 		return;
+	}
 
 	/* Save the old affinity setting */
 	sched_getaffinity(getpid(), sizeof(old), &old);
@@ -119,4 +122,5 @@ out:
 	read_bpf_task_storage_busy__destroy(skel);
 	/* Restore affinity setting */
 	sched_setaffinity(getpid(), sizeof(old), &old);
+	printf("%s:PASS\n", __func__);
 }
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 00b9cc305e58..289ff310e283 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -30,7 +30,7 @@
 #define ENOTSUPP 524
 #endif
 
-static int skips;
+int skips;
 
 static struct bpf_map_create_opts map_opts = { .sz = sizeof(map_opts) };
 
diff --git a/tools/testing/selftests/bpf/test_maps.h b/tools/testing/selftests/bpf/test_maps.h
index 77d8587ac4ed..f6fbca761732 100644
--- a/tools/testing/selftests/bpf/test_maps.h
+++ b/tools/testing/selftests/bpf/test_maps.h
@@ -14,4 +14,6 @@
 	}								\
 })
 
+extern int skips;
+
 #endif
-- 
cgit v1.2.3


From 1dd73601a1cba37a0ed5f89a8662c90191df5873 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Fri, 9 Sep 2022 10:39:48 +0800
Subject: erofs: fix order >= MAX_ORDER warning due to crafted negative i_size

As syzbot reported [1], the root cause is that i_size field is a
signed type, and negative i_size is also less than EROFS_BLKSIZ.
As a consequence, it's handled as fast symlink unexpectedly.

Let's fall back to the generic path to deal with such unusual i_size.

[1] https://lore.kernel.org/r/000000000000ac8efa05e7feaa1f@google.com

Reported-by: syzbot+f966c13b1b4fc0403b19@syzkaller.appspotmail.com
Fixes: 431339ba9042 ("staging: erofs: add inode operations")
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Link: https://lore.kernel.org/r/20220909023948.28925-1-hsiangkao@linux.alibaba.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 95a403720e8c..16cf9a283557 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -214,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
 
 	/* if it cannot be handled with fast symlink scheme */
 	if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
-	    inode->i_size >= EROFS_BLKSIZ) {
+	    inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) {
 		inode->i_op = &erofs_symlink_iops;
 		return 0;
 	}
-- 
cgit v1.2.3


From 1015c1016c231b26d4e2c9b3da65b6c043eb97a3 Mon Sep 17 00:00:00 2001
From: Jia Zhu <zhujia.zj@bytedance.com>
Date: Sun, 18 Sep 2022 12:34:51 +0800
Subject: erofs: use kill_anon_super() to kill super in fscache mode

Use kill_anon_super() instead of generic_shutdown_super() since the
mount() in erofs fscache mode uses get_tree_nodev() and associated
anon bdev needs to be freed.

Fixes: 9c0cc9c729657 ("erofs: add 'fsid' mount option")
Suggested-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220918043456.147-2-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3173debeaa5a..9716d355a63e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -879,7 +879,7 @@ static void erofs_kill_sb(struct super_block *sb)
 	WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
 
 	if (erofs_is_fscache_mode(sb))
-		generic_shutdown_super(sb);
+		kill_anon_super(sb);
 	else
 		kill_block_super(sb);
 
-- 
cgit v1.2.3


From e1de2da0b7ac2dc0120c2ba8c7044788611933ea Mon Sep 17 00:00:00 2001
From: Jia Zhu <zhujia.zj@bytedance.com>
Date: Sun, 18 Sep 2022 12:34:52 +0800
Subject: erofs: code clean up for fscache

Some cleanups. No logic changes.

Suggested-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220918043456.147-3-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/fscache.c  | 39 +++++++++++++++++++--------------------
 fs/erofs/internal.h | 19 +++++++++----------
 fs/erofs/super.c    | 21 ++++++++-------------
 3 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index b5fd9d71e67f..1eb63987e815 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -421,9 +421,8 @@ const struct address_space_operations erofs_fscache_access_aops = {
 	.readahead = erofs_fscache_readahead,
 };
 
-int erofs_fscache_register_cookie(struct super_block *sb,
-				  struct erofs_fscache **fscache,
-				  char *name, bool need_inode)
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+						     char *name, bool need_inode)
 {
 	struct fscache_volume *volume = EROFS_SB(sb)->volume;
 	struct erofs_fscache *ctx;
@@ -432,7 +431,7 @@ int erofs_fscache_register_cookie(struct super_block *sb,
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
 					name, strlen(name), NULL, 0, 0);
@@ -462,42 +461,33 @@ int erofs_fscache_register_cookie(struct super_block *sb,
 		ctx->inode = inode;
 	}
 
-	*fscache = ctx;
-	return 0;
+	return ctx;
 
 err_cookie:
 	fscache_unuse_cookie(ctx->cookie, NULL, NULL);
 	fscache_relinquish_cookie(ctx->cookie, false);
-	ctx->cookie = NULL;
 err:
 	kfree(ctx);
-	return ret;
+	return ERR_PTR(ret);
 }
 
-void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
 {
-	struct erofs_fscache *ctx = *fscache;
-
 	if (!ctx)
 		return;
 
 	fscache_unuse_cookie(ctx->cookie, NULL, NULL);
 	fscache_relinquish_cookie(ctx->cookie, false);
-	ctx->cookie = NULL;
-
 	iput(ctx->inode);
-	ctx->inode = NULL;
-
 	kfree(ctx);
-	*fscache = NULL;
 }
 
 int erofs_fscache_register_fs(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
 	struct fscache_volume *volume;
+	struct erofs_fscache *fscache;
 	char *name;
-	int ret = 0;
 
 	name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid);
 	if (!name)
@@ -506,19 +496,28 @@ int erofs_fscache_register_fs(struct super_block *sb)
 	volume = fscache_acquire_volume(name, NULL, NULL, 0);
 	if (IS_ERR_OR_NULL(volume)) {
 		erofs_err(sb, "failed to register volume for %s", name);
-		ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
-		volume = NULL;
+		kfree(name);
+		return volume ? PTR_ERR(volume) : -EOPNOTSUPP;
 	}
 
 	sbi->volume = volume;
 	kfree(name);
-	return ret;
+
+	fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true);
+	/* acquired volume will be relinquished in kill_sb() */
+	if (IS_ERR(fscache))
+		return PTR_ERR(fscache);
+
+	sbi->s_fscache = fscache;
+	return 0;
 }
 
 void erofs_fscache_unregister_fs(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
 
+	erofs_fscache_unregister_cookie(sbi->s_fscache);
 	fscache_relinquish_volume(sbi->volume, NULL, false);
+	sbi->s_fscache = NULL;
 	sbi->volume = NULL;
 }
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index a01cc82795a2..39fb124ce670 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -581,27 +581,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
 int erofs_fscache_register_fs(struct super_block *sb);
 void erofs_fscache_unregister_fs(struct super_block *sb);
 
-int erofs_fscache_register_cookie(struct super_block *sb,
-				  struct erofs_fscache **fscache,
-				  char *name, bool need_inode);
-void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache);
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+						     char *name, bool need_inode);
+void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
 
 extern const struct address_space_operations erofs_fscache_access_aops;
 #else
 static inline int erofs_fscache_register_fs(struct super_block *sb)
 {
-	return 0;
+	return -EOPNOTSUPP;
 }
 static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
 
-static inline int erofs_fscache_register_cookie(struct super_block *sb,
-						struct erofs_fscache **fscache,
-						char *name, bool need_inode)
+static inline
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+						     char *name, bool need_inode)
 {
-	return -EOPNOTSUPP;
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline void erofs_fscache_unregister_cookie(struct erofs_fscache **fscache)
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
 {
 }
 #endif
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9716d355a63e..884e7ed3d760 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -224,10 +224,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 			     struct erofs_device_info *dif, erofs_off_t *pos)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	struct erofs_fscache *fscache;
 	struct erofs_deviceslot *dis;
 	struct block_device *bdev;
 	void *ptr;
-	int ret;
 
 	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
 	if (IS_ERR(ptr))
@@ -245,10 +245,10 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 	}
 
 	if (erofs_is_fscache_mode(sb)) {
-		ret = erofs_fscache_register_cookie(sb, &dif->fscache,
-				dif->path, false);
-		if (ret)
-			return ret;
+		fscache = erofs_fscache_register_cookie(sb, dif->path, false);
+		if (IS_ERR(fscache))
+			return PTR_ERR(fscache);
+		dif->fscache = fscache;
 	} else {
 		bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
 					  sb->s_type);
@@ -706,11 +706,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 		if (err)
 			return err;
 
-		err = erofs_fscache_register_cookie(sb, &sbi->s_fscache,
-						    sbi->opt.fsid, true);
-		if (err)
-			return err;
-
 		err = super_setup_bdi(sb);
 		if (err)
 			return err;
@@ -817,7 +812,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 	fs_put_dax(dif->dax_dev, NULL);
 	if (dif->bdev)
 		blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
-	erofs_fscache_unregister_cookie(&dif->fscache);
+	erofs_fscache_unregister_cookie(dif->fscache);
+	dif->fscache = NULL;
 	kfree(dif->path);
 	kfree(dif);
 	return 0;
@@ -889,7 +885,6 @@ static void erofs_kill_sb(struct super_block *sb)
 
 	erofs_free_dev_context(sbi->devs);
 	fs_put_dax(sbi->dax_dev, NULL);
-	erofs_fscache_unregister_cookie(&sbi->s_fscache);
 	erofs_fscache_unregister_fs(sb);
 	kfree(sbi->opt.fsid);
 	kfree(sbi);
@@ -909,7 +904,7 @@ static void erofs_put_super(struct super_block *sb)
 	iput(sbi->managed_cache);
 	sbi->managed_cache = NULL;
 #endif
-	erofs_fscache_unregister_cookie(&sbi->s_fscache);
+	erofs_fscache_unregister_fs(sb);
 }
 
 static struct file_system_type erofs_fs_type = {
-- 
cgit v1.2.3


From 8b7adf1dff3d5baf687acda936f193f80b7e0179 Mon Sep 17 00:00:00 2001
From: Jia Zhu <zhujia.zj@bytedance.com>
Date: Sun, 18 Sep 2022 12:34:53 +0800
Subject: erofs: introduce fscache-based domain

A new fscache-based shared domain mode is going to be introduced for
erofs. In which case, same data blobs in same domain will be shared
and reused to reduce on-disk space usage.

The implementation of sharing blobs will be introduced in subsequent
patches.

Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220918043456.147-4-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/fscache.c  | 129 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/erofs/internal.h |   9 ++++
 2 files changed, 121 insertions(+), 17 deletions(-)

diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 1eb63987e815..8a1be6077ca4 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -1,10 +1,14 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * Copyright (C) 2022, Alibaba Cloud
+ * Copyright (C) 2022, Bytedance Inc. All rights reserved.
  */
 #include <linux/fscache.h>
 #include "internal.h"
 
+static DEFINE_MUTEX(erofs_domain_list_lock);
+static LIST_HEAD(erofs_domain_list);
+
 static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
 					     loff_t start, size_t len)
 {
@@ -421,6 +425,99 @@ const struct address_space_operations erofs_fscache_access_aops = {
 	.readahead = erofs_fscache_readahead,
 };
 
+static void erofs_fscache_domain_put(struct erofs_domain *domain)
+{
+	if (!domain)
+		return;
+	mutex_lock(&erofs_domain_list_lock);
+	if (refcount_dec_and_test(&domain->ref)) {
+		list_del(&domain->list);
+		mutex_unlock(&erofs_domain_list_lock);
+		fscache_relinquish_volume(domain->volume, NULL, false);
+		kfree(domain->domain_id);
+		kfree(domain);
+		return;
+	}
+	mutex_unlock(&erofs_domain_list_lock);
+}
+
+static int erofs_fscache_register_volume(struct super_block *sb)
+{
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	char *domain_id = sbi->opt.domain_id;
+	struct fscache_volume *volume;
+	char *name;
+	int ret = 0;
+
+	name = kasprintf(GFP_KERNEL, "erofs,%s",
+			 domain_id ? domain_id : sbi->opt.fsid);
+	if (!name)
+		return -ENOMEM;
+
+	volume = fscache_acquire_volume(name, NULL, NULL, 0);
+	if (IS_ERR_OR_NULL(volume)) {
+		erofs_err(sb, "failed to register volume for %s", name);
+		ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+		volume = NULL;
+	}
+
+	sbi->volume = volume;
+	kfree(name);
+	return ret;
+}
+
+static int erofs_fscache_init_domain(struct super_block *sb)
+{
+	int err;
+	struct erofs_domain *domain;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL);
+	if (!domain)
+		return -ENOMEM;
+
+	domain->domain_id = kstrdup(sbi->opt.domain_id, GFP_KERNEL);
+	if (!domain->domain_id) {
+		kfree(domain);
+		return -ENOMEM;
+	}
+
+	err = erofs_fscache_register_volume(sb);
+	if (err)
+		goto out;
+
+	domain->volume = sbi->volume;
+	refcount_set(&domain->ref, 1);
+	list_add(&domain->list, &erofs_domain_list);
+	sbi->domain = domain;
+	return 0;
+out:
+	kfree(domain->domain_id);
+	kfree(domain);
+	return err;
+}
+
+static int erofs_fscache_register_domain(struct super_block *sb)
+{
+	int err;
+	struct erofs_domain *domain;
+	struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+	mutex_lock(&erofs_domain_list_lock);
+	list_for_each_entry(domain, &erofs_domain_list, list) {
+		if (!strcmp(domain->domain_id, sbi->opt.domain_id)) {
+			sbi->domain = domain;
+			sbi->volume = domain->volume;
+			refcount_inc(&domain->ref);
+			mutex_unlock(&erofs_domain_list_lock);
+			return 0;
+		}
+	}
+	err = erofs_fscache_init_domain(sb);
+	mutex_unlock(&erofs_domain_list_lock);
+	return err;
+}
+
 struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
 						     char *name, bool need_inode)
 {
@@ -484,27 +581,19 @@ void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
 
 int erofs_fscache_register_fs(struct super_block *sb)
 {
+	int ret;
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
-	struct fscache_volume *volume;
 	struct erofs_fscache *fscache;
-	char *name;
-
-	name = kasprintf(GFP_KERNEL, "erofs,%s", sbi->opt.fsid);
-	if (!name)
-		return -ENOMEM;
 
-	volume = fscache_acquire_volume(name, NULL, NULL, 0);
-	if (IS_ERR_OR_NULL(volume)) {
-		erofs_err(sb, "failed to register volume for %s", name);
-		kfree(name);
-		return volume ? PTR_ERR(volume) : -EOPNOTSUPP;
-	}
-
-	sbi->volume = volume;
-	kfree(name);
+	if (sbi->opt.domain_id)
+		ret = erofs_fscache_register_domain(sb);
+	else
+		ret = erofs_fscache_register_volume(sb);
+	if (ret)
+		return ret;
 
+	/* acquired domain/volume will be relinquished in kill_sb() on error */
 	fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true);
-	/* acquired volume will be relinquished in kill_sb() */
 	if (IS_ERR(fscache))
 		return PTR_ERR(fscache);
 
@@ -517,7 +606,13 @@ void erofs_fscache_unregister_fs(struct super_block *sb)
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
 
 	erofs_fscache_unregister_cookie(sbi->s_fscache);
-	fscache_relinquish_volume(sbi->volume, NULL, false);
+
+	if (sbi->domain)
+		erofs_fscache_domain_put(sbi->domain);
+	else
+		fscache_relinquish_volume(sbi->volume, NULL, false);
+
 	sbi->s_fscache = NULL;
 	sbi->volume = NULL;
+	sbi->domain = NULL;
 }
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 39fb124ce670..93fc4ad61cfb 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -76,6 +76,7 @@ struct erofs_mount_opts {
 #endif
 	unsigned int mount_opt;
 	char *fsid;
+	char *domain_id;
 };
 
 struct erofs_dev_context {
@@ -98,6 +99,13 @@ struct erofs_sb_lz4_info {
 	u16 max_pclusterblks;
 };
 
+struct erofs_domain {
+	refcount_t ref;
+	struct list_head list;
+	struct fscache_volume *volume;
+	char *domain_id;
+};
+
 struct erofs_fscache {
 	struct fscache_cookie *cookie;
 	struct inode *inode;
@@ -157,6 +165,7 @@ struct erofs_sb_info {
 	/* fscache support */
 	struct fscache_volume *volume;
 	struct erofs_fscache *s_fscache;
+	struct erofs_domain *domain;
 };
 
 #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
-- 
cgit v1.2.3


From a9849560c55e9e4ab9c53d073363dd6e19ec06ef Mon Sep 17 00:00:00 2001
From: Jia Zhu <zhujia.zj@bytedance.com>
Date: Sun, 18 Sep 2022 12:34:54 +0800
Subject: erofs: introduce a pseudo mnt to manage shared cookies

Use a pseudo mnt to manage shared cookies.

Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220918043456.147-5-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/fscache.c  | 13 +++++++++++++
 fs/erofs/internal.h |  1 +
 fs/erofs/super.c    | 33 +++++++++++++++++++++++++++++++--
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 8a1be6077ca4..0480aaf44cb9 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -8,6 +8,7 @@
 
 static DEFINE_MUTEX(erofs_domain_list_lock);
 static LIST_HEAD(erofs_domain_list);
+static struct vfsmount *erofs_pseudo_mnt;
 
 static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
 					     loff_t start, size_t len)
@@ -432,6 +433,10 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain)
 	mutex_lock(&erofs_domain_list_lock);
 	if (refcount_dec_and_test(&domain->ref)) {
 		list_del(&domain->list);
+		if (list_empty(&erofs_domain_list)) {
+			kern_unmount(erofs_pseudo_mnt);
+			erofs_pseudo_mnt = NULL;
+		}
 		mutex_unlock(&erofs_domain_list_lock);
 		fscache_relinquish_volume(domain->volume, NULL, false);
 		kfree(domain->domain_id);
@@ -486,6 +491,14 @@ static int erofs_fscache_init_domain(struct super_block *sb)
 	if (err)
 		goto out;
 
+	if (!erofs_pseudo_mnt) {
+		erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
+		if (IS_ERR(erofs_pseudo_mnt)) {
+			err = PTR_ERR(erofs_pseudo_mnt);
+			goto out;
+		}
+	}
+
 	domain->volume = sbi->volume;
 	refcount_set(&domain->ref, 1);
 	list_add(&domain->list, &erofs_domain_list);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 93fc4ad61cfb..0db56259160c 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -373,6 +373,7 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
 }
 
 extern const struct super_operations erofs_sops;
+extern struct file_system_type erofs_fs_type;
 
 extern const struct address_space_operations erofs_raw_access_aops;
 extern const struct address_space_operations z_erofs_aops;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 884e7ed3d760..ab746181ae08 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -676,6 +676,13 @@ static const struct export_operations erofs_export_ops = {
 	.get_parent = erofs_get_parent,
 };
 
+static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+{
+	static const struct tree_descr empty_descr = {""};
+
+	return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+}
+
 static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct inode *inode;
@@ -776,6 +783,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 	return 0;
 }
 
+static int erofs_fc_anon_get_tree(struct fs_context *fc)
+{
+	return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
+}
+
 static int erofs_fc_get_tree(struct fs_context *fc)
 {
 	struct erofs_fs_context *ctx = fc->fs_private;
@@ -844,10 +856,21 @@ static const struct fs_context_operations erofs_context_ops = {
 	.free		= erofs_fc_free,
 };
 
+static const struct fs_context_operations erofs_anon_context_ops = {
+	.get_tree       = erofs_fc_anon_get_tree,
+};
+
 static int erofs_init_fs_context(struct fs_context *fc)
 {
-	struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	struct erofs_fs_context *ctx;
+
+	/* pseudo mount for anon inodes */
+	if (fc->sb_flags & SB_KERNMOUNT) {
+		fc->ops = &erofs_anon_context_ops;
+		return 0;
+	}
 
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
 	ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
@@ -874,6 +897,12 @@ static void erofs_kill_sb(struct super_block *sb)
 
 	WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
 
+	/* pseudo mount for anon inodes */
+	if (sb->s_flags & SB_KERNMOUNT) {
+		kill_anon_super(sb);
+		return;
+	}
+
 	if (erofs_is_fscache_mode(sb))
 		kill_anon_super(sb);
 	else
@@ -907,7 +936,7 @@ static void erofs_put_super(struct super_block *sb)
 	erofs_fscache_unregister_fs(sb);
 }
 
-static struct file_system_type erofs_fs_type = {
+struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
 	.init_fs_context = erofs_init_fs_context,
-- 
cgit v1.2.3


From 7d41963759feb3cfa4c1164b8b9db5d1f055932d Mon Sep 17 00:00:00 2001
From: Jia Zhu <zhujia.zj@bytedance.com>
Date: Sun, 18 Sep 2022 19:01:50 +0800
Subject: erofs: Support sharing cookies in the same domain

Several erofs filesystems can belong to one domain, and data blobs can
be shared among these erofs filesystems of same domain.

Users could specify domain_id mount option to create or join into a
domain.

Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220918110150.6338-1-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/fscache.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/erofs/internal.h |  3 ++
 2 files changed, 96 insertions(+), 6 deletions(-)

diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 0480aaf44cb9..bc3556e77b93 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -7,6 +7,7 @@
 #include "internal.h"
 
 static DEFINE_MUTEX(erofs_domain_list_lock);
+static DEFINE_MUTEX(erofs_domain_cookies_lock);
 static LIST_HEAD(erofs_domain_list);
 static struct vfsmount *erofs_pseudo_mnt;
 
@@ -531,8 +532,9 @@ static int erofs_fscache_register_domain(struct super_block *sb)
 	return err;
 }
 
-struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
-						     char *name, bool need_inode)
+static
+struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
+						    char *name, bool need_inode)
 {
 	struct fscache_volume *volume = EROFS_SB(sb)->volume;
 	struct erofs_fscache *ctx;
@@ -581,17 +583,102 @@ err:
 	return ERR_PTR(ret);
 }
 
-void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
+static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
 {
-	if (!ctx)
-		return;
-
 	fscache_unuse_cookie(ctx->cookie, NULL, NULL);
 	fscache_relinquish_cookie(ctx->cookie, false);
 	iput(ctx->inode);
+	kfree(ctx->name);
 	kfree(ctx);
 }
 
+static
+struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
+		char *name, bool need_inode)
+{
+	int err;
+	struct inode *inode;
+	struct erofs_fscache *ctx;
+	struct erofs_domain *domain = EROFS_SB(sb)->domain;
+
+	ctx = erofs_fscache_acquire_cookie(sb, name, need_inode);
+	if (IS_ERR(ctx))
+		return ctx;
+
+	ctx->name = kstrdup(name, GFP_KERNEL);
+	if (!ctx->name) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	inode = new_inode(erofs_pseudo_mnt->mnt_sb);
+	if (!inode) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	ctx->domain = domain;
+	ctx->anon_inode = inode;
+	inode->i_private = ctx;
+	refcount_inc(&domain->ref);
+	return ctx;
+out:
+	erofs_fscache_relinquish_cookie(ctx);
+	return ERR_PTR(err);
+}
+
+static
+struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
+						   char *name, bool need_inode)
+{
+	struct inode *inode;
+	struct erofs_fscache *ctx;
+	struct erofs_domain *domain = EROFS_SB(sb)->domain;
+	struct super_block *psb = erofs_pseudo_mnt->mnt_sb;
+
+	mutex_lock(&erofs_domain_cookies_lock);
+	list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
+		ctx = inode->i_private;
+		if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
+			continue;
+		igrab(inode);
+		mutex_unlock(&erofs_domain_cookies_lock);
+		return ctx;
+	}
+	ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
+	mutex_unlock(&erofs_domain_cookies_lock);
+	return ctx;
+}
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+						    char *name, bool need_inode)
+{
+	if (EROFS_SB(sb)->opt.domain_id)
+		return erofs_domain_register_cookie(sb, name, need_inode);
+	return erofs_fscache_acquire_cookie(sb, name, need_inode);
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
+{
+	bool drop;
+	struct erofs_domain *domain;
+
+	if (!ctx)
+		return;
+	domain = ctx->domain;
+	if (domain) {
+		mutex_lock(&erofs_domain_cookies_lock);
+		drop = atomic_read(&ctx->anon_inode->i_count) == 1;
+		iput(ctx->anon_inode);
+		mutex_unlock(&erofs_domain_cookies_lock);
+		if (!drop)
+			return;
+	}
+
+	erofs_fscache_relinquish_cookie(ctx);
+	erofs_fscache_domain_put(domain);
+}
+
 int erofs_fscache_register_fs(struct super_block *sb)
 {
 	int ret;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 0db56259160c..ef3f7982b92d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -109,6 +109,9 @@ struct erofs_domain {
 struct erofs_fscache {
 	struct fscache_cookie *cookie;
 	struct inode *inode;
+	struct inode *anon_inode;
+	struct erofs_domain *domain;
+	char *name;
 };
 
 struct erofs_sb_info {
-- 
cgit v1.2.3


From 2ef164414123fcf574aff7a0be5f71f7e60a3fec Mon Sep 17 00:00:00 2001
From: Jia Zhu <zhujia.zj@bytedance.com>
Date: Sun, 18 Sep 2022 12:34:56 +0800
Subject: erofs: introduce 'domain_id' mount option

Introduce 'domain_id' mount option to enable shared domain sementics.
In which case, the related cookie is shared if two mountpoints in the
same domain have the same data blob. Users could specify the name of
domain by this mount option.

Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
Reviewed-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220918043456.147-7-zhujia.zj@bytedance.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/super.c | 17 +++++++++++++++++
 fs/erofs/sysfs.c | 19 +++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ab746181ae08..9f7fe6c04e65 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -440,6 +440,7 @@ enum {
 	Opt_dax_enum,
 	Opt_device,
 	Opt_fsid,
+	Opt_domain_id,
 	Opt_err
 };
 
@@ -465,6 +466,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
 	fsparam_enum("dax",		Opt_dax_enum, erofs_dax_param_enums),
 	fsparam_string("device",	Opt_device),
 	fsparam_string("fsid",		Opt_fsid),
+	fsparam_string("domain_id",	Opt_domain_id),
 	{}
 };
 
@@ -568,6 +570,16 @@ static int erofs_fc_parse_param(struct fs_context *fc,
 			return -ENOMEM;
 #else
 		errorfc(fc, "fsid option not supported");
+#endif
+		break;
+	case Opt_domain_id:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+		kfree(ctx->opt.domain_id);
+		ctx->opt.domain_id = kstrdup(param->string, GFP_KERNEL);
+		if (!ctx->opt.domain_id)
+			return -ENOMEM;
+#else
+		errorfc(fc, "domain_id option not supported");
 #endif
 		break;
 	default:
@@ -702,6 +714,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_fs_info = sbi;
 	sbi->opt = ctx->opt;
 	ctx->opt.fsid = NULL;
+	ctx->opt.domain_id = NULL;
 	sbi->devs = ctx->devs;
 	ctx->devs = NULL;
 
@@ -846,6 +859,7 @@ static void erofs_fc_free(struct fs_context *fc)
 
 	erofs_free_dev_context(ctx->devs);
 	kfree(ctx->opt.fsid);
+	kfree(ctx->opt.domain_id);
 	kfree(ctx);
 }
 
@@ -916,6 +930,7 @@ static void erofs_kill_sb(struct super_block *sb)
 	fs_put_dax(sbi->dax_dev, NULL);
 	erofs_fscache_unregister_fs(sb);
 	kfree(sbi->opt.fsid);
+	kfree(sbi->opt.domain_id);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
 }
@@ -1068,6 +1083,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
 #ifdef CONFIG_EROFS_FS_ONDEMAND
 	if (opt->fsid)
 		seq_printf(seq, ",fsid=%s", opt->fsid);
+	if (opt->domain_id)
+		seq_printf(seq, ",domain_id=%s", opt->domain_id);
 #endif
 	return 0;
 }
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index c1383e508bbe..341fb43ad587 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -201,12 +201,27 @@ static struct kobject erofs_feat = {
 int erofs_register_sysfs(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
+	char *name;
+	char *str = NULL;
 	int err;
 
+	if (erofs_is_fscache_mode(sb)) {
+		if (sbi->opt.domain_id) {
+			str = kasprintf(GFP_KERNEL, "%s,%s", sbi->opt.domain_id,
+					sbi->opt.fsid);
+			if (!str)
+				return -ENOMEM;
+			name = str;
+		} else {
+			name = sbi->opt.fsid;
+		}
+	} else {
+		name = sb->s_id;
+	}
 	sbi->s_kobj.kset = &erofs_root;
 	init_completion(&sbi->s_kobj_unregister);
-	err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s",
-			erofs_is_fscache_mode(sb) ? sbi->opt.fsid : sb->s_id);
+	err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
+	kfree(str);
 	if (err)
 		goto put_sb_kobj;
 	return 0;
-- 
cgit v1.2.3


From 41d27f518b955ef4b75b02cc67392aef0809a78d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 9 Sep 2022 11:30:19 +0200
Subject: fat: port to vfs{g,u}id_t and associated helpers

A while ago we introduced a dedicated vfs{g,u}id_t type in commit
1e5267cd0895 ("mnt_idmapping: add vfs{g,u}id_t"). We already switched
over a good part of the VFS. Ultimately we will remove all legacy
idmapped mount helpers that operate only on k{g,u}id_t in favor of the
new type safe helpers that operate on vfs{g,u}id_t.

Cc: Seth Forshee (Digital Ocean) <sforshee@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/file.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3e4eb3467cb4..8a6b493b5b5f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -461,8 +461,9 @@ static int fat_allow_set_time(struct user_namespace *mnt_userns,
 {
 	umode_t allow_utime = sbi->options.allow_utime;
 
-	if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
-		if (in_group_p(i_gid_into_mnt(mnt_userns, inode)))
+	if (!vfsuid_eq_kuid(i_uid_into_vfsuid(mnt_userns, inode),
+			    current_fsuid())) {
+		if (vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)))
 			allow_utime >>= 3;
 		if (allow_utime & MAY_WRITE)
 			return 1;
-- 
cgit v1.2.3


From 146034fed6ee75ec09cf8f996165e2296ceae0bb Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 7 Sep 2022 18:09:34 +0900
Subject: x86/asm/bitops: Use __builtin_ffs() to evaluate constant expressions

For x86_64, the current ffs() implementation does not produce optimized
code when called with a constant expression. On the contrary, the
__builtin_ffs() functions of both GCC and clang are able to fold the
expression into a single instruction.

** Example **

Consider two dummy functions foo() and bar() as below:

  #include <linux/bitops.h>
  #define CONST 0x01000000

  unsigned int foo(void)
  {
  	return ffs(CONST);
  }

  unsigned int bar(void)
  {
  	return __builtin_ffs(CONST);
  }

GCC would produce below assembly code:

  0000000000000000 <foo>:
     0:	ba 00 00 00 01       	mov    $0x1000000,%edx
     5:	b8 ff ff ff ff       	mov    $0xffffffff,%eax
     a:	0f bc c2             	bsf    %edx,%eax
     d:	83 c0 01             	add    $0x1,%eax
    10:	c3                   	ret
  <Instructions after ret and before next function were redacted>

  0000000000000020 <bar>:
    20:	b8 19 00 00 00       	mov    $0x19,%eax
    25:	c3                   	ret

And clang would produce:

  0000000000000000 <foo>:
     0:	b8 ff ff ff ff       	mov    $0xffffffff,%eax
     5:	0f bc 05 00 00 00 00 	bsf    0x0(%rip),%eax        # c <foo+0xc>
     c:	83 c0 01             	add    $0x1,%eax
     f:	c3                   	ret

  0000000000000010 <bar>:
    10:	b8 19 00 00 00       	mov    $0x19,%eax
    15:	c3                   	ret

Both examples clearly demonstrate the benefit of using __builtin_ffs()
instead of the kernel's asm implementation for constant expressions.

However, for non constant expressions, the kernel's ffs() asm version
remains better for x86_64 because, contrary to GCC, it doesn't emit the
CMOV assembly instruction, c.f. [1] (noticeably, clang is able optimize
out the CMOV call).

Use __builtin_constant_p() to select between the kernel's ffs() and
the __builtin_ffs() depending on whether the argument is constant or
not.

As a side benefit, replacing the ffs() function declaration by a macro
also removes below -Wshadow warning:

  ./arch/x86/include/asm/bitops.h:283:28: warning: declaration of 'ffs' shadows a built-in function [-Wshadow]
    283 | static __always_inline int ffs(int x)

** Statistics **

On a allyesconfig, before...:

  $ objdump -d vmlinux.o | grep bsf | wc -l
  1081

...and after:

  $ objdump -d vmlinux.o | grep bsf | wc -l
  792

So, roughly 26.7% of the calls to ffs() were using constant
expressions and could be optimized out.

(tests done on linux v5.18-rc5 x86_64 using GCC 11.2.1)

[1] commit ca3d30cc02f7 ("x86_64, asm: Optimise fls(), ffs() and fls64()")

  [ bp: Massage commit message. ]

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/r/20220511160319.1045812-1-mailhol.vincent@wanadoo.fr
---
 arch/x86/include/asm/bitops.h | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 0fe9de58af31..879238e5a6a0 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -292,18 +292,7 @@ static __always_inline unsigned long __fls(unsigned long word)
 #undef ADDR
 
 #ifdef __KERNEL__
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-static __always_inline int ffs(int x)
+static __always_inline int variable_ffs(int x)
 {
 	int r;
 
@@ -333,6 +322,19 @@ static __always_inline int ffs(int x)
 	return r + 1;
 }
 
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
+
 /**
  * fls - find last set bit in word
  * @x: the word to search
-- 
cgit v1.2.3


From fdb6649ab7c142e497539a471e573c2593b9c923 Mon Sep 17 00:00:00 2001
From: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Date: Wed, 7 Sep 2022 18:09:35 +0900
Subject: x86/asm/bitops: Use __builtin_ctzl() to evaluate constant expressions

If x is not 0, __ffs(x) is equivalent to:
  (unsigned long)__builtin_ctzl(x)
And if x is not ~0UL, ffz(x) is equivalent to:
  (unsigned long)__builtin_ctzl(~x)
Because __builting_ctzl() returns an int, a cast to (unsigned long) is
necessary to avoid potential warnings on implicit casts.

Concerning the edge cases, __builtin_ctzl(0) is always undefined,
whereas __ffs(0) and ffz(~0UL) may or may not be defined, depending on
the processor. Regardless, for both functions, developers are asked to
check against 0 or ~0UL so replacing __ffs() or ffz() by
__builting_ctzl() is safe.

For x86_64, the current __ffs() and ffz() implementations do not
produce optimized code when called with a constant expression. On the
contrary, the __builtin_ctzl() folds into a single instruction.

However, for non constant expressions, the __ffs() and ffz() asm
versions of the kernel remains slightly better than the code produced
by GCC (it produces a useless instruction to clear eax).

Use __builtin_constant_p() to select between the kernel's
__ffs()/ffz() and the __builtin_ctzl() depending on whether the
argument is constant or not.

** Statistics **

On a allyesconfig, before...:

  $ objdump -d vmlinux.o | grep tzcnt | wc -l
  3607

...and after:

  $ objdump -d vmlinux.o | grep tzcnt | wc -l
  2600

So, roughly 27.9% of the calls to either __ffs() or ffz() were using
constant expressions and could be optimized out.

(tests done on linux v5.18-rc5 x86_64 using GCC 11.2.1)

Note: on x86_64, the BSF instruction produces TZCNT when used with the
REP prefix (which explain the use of `grep tzcnt' instead of `grep bsf'
in above benchmark). c.f. [1]

[1] e26a44a2d618 ("x86: Use REP BSF unconditionally")

  [ bp: Massage commit message. ]

Signed-off-by: Vincent Mailhol <mailhol.vincent@wanadoo.fr>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/r/20220511160319.1045812-1-mailhol.vincent@wanadoo.fr
---
 arch/x86/include/asm/bitops.h | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 879238e5a6a0..2edf68475fec 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -247,17 +247,30 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
 					  variable_test_bit(nr, addr);
 }
 
+static __always_inline unsigned long variable__ffs(unsigned long word)
+{
+	asm("rep; bsf %1,%0"
+		: "=r" (word)
+		: "rm" (word));
+	return word;
+}
+
 /**
  * __ffs - find first set bit in word
  * @word: The word to search
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __ffs(unsigned long word)
+#define __ffs(word)				\
+	(__builtin_constant_p(word) ?		\
+	 (unsigned long)__builtin_ctzl(word) :	\
+	 variable__ffs(word))
+
+static __always_inline unsigned long variable_ffz(unsigned long word)
 {
 	asm("rep; bsf %1,%0"
 		: "=r" (word)
-		: "rm" (word));
+		: "r" (~word));
 	return word;
 }
 
@@ -267,13 +280,10 @@ static __always_inline unsigned long __ffs(unsigned long word)
  *
  * Undefined if no zero exists, so code should check against ~0UL first.
  */
-static __always_inline unsigned long ffz(unsigned long word)
-{
-	asm("rep; bsf %1,%0"
-		: "=r" (word)
-		: "r" (~word));
-	return word;
-}
+#define ffz(word)				\
+	(__builtin_constant_p(word) ?		\
+	 (unsigned long)__builtin_ctzl(~word) :	\
+	 variable_ffz(word))
 
 /*
  * __fls: find last set bit in word
-- 
cgit v1.2.3


From 5258b80e60da6d8908ae2846b234ed8d9d9d4a19 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Tue, 6 Sep 2022 09:11:23 +0200
Subject: x86/dumpstack: Don't mention RIP in "Code: "

Commit

  238c91115cd0 ("x86/dumpstack: Fix misleading instruction pointer error message")

changed the "Code:" line in bug reports when RIP is an invalid pointer.
In particular, the report currently says (for example):

  BUG: kernel NULL pointer dereference, address: 0000000000000000
  ...
  RIP: 0010:0x0
  Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.

That

  Unable to access opcode bytes at RIP 0xffffffffffffffd6.

is quite confusing as RIP value is 0, not -42. That -42 comes from
"regs->ip - PROLOGUE_SIZE", because Code is dumped with some prologue
(and epilogue).

So do not mention "RIP" on this line in this context.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/b772c39f-c5ae-8f17-fe6e-6a2bc4d1f83b@kernel.org
---
 arch/x86/kernel/dumpstack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index afae4dd77495..b3dba35f466e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -128,7 +128,7 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
 		/* No access to the user space stack of other tasks. Ignore. */
 		break;
 	default:
-		printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
+		printk("%sCode: Unable to access opcode bytes at 0x%lx.\n",
 		       loglvl, prologue);
 		break;
 	}
-- 
cgit v1.2.3


From c31b38cb948ee7d3317139f005fa1f90de4a06b7 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Mon, 19 Sep 2022 22:48:11 +0800
Subject: bpf: Check whether or not node is NULL before free it in free_bulk

llnode could be NULL if there are new allocations after the checking of
c-free_cnt > c->high_watermark in bpf_mem_refill() and before the
calling of __llist_del_first() in free_bulk (e.g. a PREEMPT_RT kernel
or allocation in NMI context). And it will incur oops as shown below:

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor write access in kernel mode
 #PF: error_code(0x0002) - not-present page
 PGD 0 P4D 0
 Oops: 0002 [#1] PREEMPT_RT SMP
 CPU: 39 PID: 373 Comm: irq_work/39 Tainted: G        W          6.0.0-rc6-rt9+ #1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996)
 RIP: 0010:bpf_mem_refill+0x66/0x130
 ......
 Call Trace:
  <TASK>
  irq_work_single+0x24/0x60
  irq_work_run_list+0x24/0x30
  run_irq_workd+0x18/0x20
  smpboot_thread_fn+0x13f/0x2c0
  kthread+0x121/0x140
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x1f/0x30
  </TASK>

Simply fixing it by checking whether or not llnode is NULL in free_bulk().

Fixes: 8d5a8011b35d ("bpf: Batch call_rcu callbacks instead of SLAB_TYPESAFE_BY_RCU.")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20220919144811.3570825-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/memalloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 20621f5407d8..5f83be1d2018 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -277,7 +277,8 @@ static void free_bulk(struct bpf_mem_cache *c)
 		local_dec(&c->active);
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			local_irq_restore(flags);
-		enque_to_free(c, llnode);
+		if (llnode)
+			enque_to_free(c, llnode);
 	} while (cnt > (c->high_watermark + c->low_watermark) / 2);
 
 	/* and drain free_llist_extra */
-- 
cgit v1.2.3


From c969bb8dbaf2f3628927eae73e7c579a74cf1b6e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Tue, 20 Sep 2022 19:12:52 +0200
Subject: selinux: use "grep -E" instead of "egrep"

The latest version of grep claims that egrep is now obsolete so the build
now contains warnings that look like:
	egrep: warning: egrep is obsolescent; using grep -E
fix this by using "grep -E" instead.

Cc: Paul Moore <paul@paul-moore.com>
Cc: Stephen Smalley <stephen.smalley.work@gmail.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: selinux@vger.kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
[PM: tweak to remove vdso reference, cleanup subj line]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 scripts/selinux/install_policy.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/selinux/install_policy.sh b/scripts/selinux/install_policy.sh
index 2dccf141241d..20af56ce245c 100755
--- a/scripts/selinux/install_policy.sh
+++ b/scripts/selinux/install_policy.sh
@@ -78,7 +78,7 @@ cd /etc/selinux/dummy/contexts/files
 $SF -F file_contexts /
 
 mounts=`cat /proc/$$/mounts | \
-	egrep "ext[234]|jfs|xfs|reiserfs|jffs2|gfs2|btrfs|f2fs|ocfs2" | \
+	grep -E "ext[234]|jfs|xfs|reiserfs|jffs2|gfs2|btrfs|f2fs|ocfs2" | \
 	awk '{ print $2 '}`
 $SF -F file_contexts $mounts
 
-- 
cgit v1.2.3


From 2fe2fb4ce60be9005d7bfdd5665be03b8efb5b13 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Tue, 20 Sep 2022 14:11:11 -0400
Subject: selinux: remove runtime disable message in the install_policy.sh
 script

We are in the process of deprecating the runtime disable mechanism,
let's not reference it in the scripts.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 scripts/selinux/install_policy.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/selinux/install_policy.sh b/scripts/selinux/install_policy.sh
index 20af56ce245c..24086793b0d8 100755
--- a/scripts/selinux/install_policy.sh
+++ b/scripts/selinux/install_policy.sh
@@ -31,8 +31,7 @@ fi
 if selinuxenabled; then
     echo "SELinux is already enabled"
     echo "This prevents safely relabeling all files."
-    echo "Boot with selinux=0 on the kernel command-line or"
-    echo "SELINUX=disabled in /etc/selinux/config."
+    echo "Boot with selinux=0 on the kernel command-line."
     exit 1
 fi
 
-- 
cgit v1.2.3


From 52bdae37c92ae10d47d54bd7cd39e0a17547ebfa Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Tue, 20 Sep 2022 08:15:22 -0600
Subject: bpf: Remove unused btf_struct_access stub

This stub was not being used anywhere.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/590e7bd6172ffe0f3d7b51cd40e8ded941aaf7e8.1663683114.git.dxu@dxuuu.xyz
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/netfilter/nf_conntrack_bpf.h | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index a61a93d1c6dc..9c07d2d59da5 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -3,8 +3,6 @@
 #ifndef _NF_CONNTRACK_BPF_H
 #define _NF_CONNTRACK_BPF_H
 
-#include <linux/bpf.h>
-#include <linux/btf.h>
 #include <linux/kconfig.h>
 #include <linux/mutex.h>
 
@@ -31,16 +29,6 @@ static inline void cleanup_nf_conntrack_bpf(void)
 {
 }
 
-static inline int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log,
-						 const struct btf *btf,
-						 const struct btf_type *t, int off,
-						 int size, enum bpf_access_type atype,
-						 u32 *next_btf_id,
-						 enum bpf_type_flag *flag)
-{
-	return -EACCES;
-}
-
 #endif
 
 #endif /* _NF_CONNTRACK_BPF_H */
-- 
cgit v1.2.3


From 5a090aa35038e3dad1ee334e3c509c39e7599bb4 Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Tue, 20 Sep 2022 08:15:23 -0600
Subject: bpf: Rename nfct_bsa to nfct_btf_struct_access

The former name was a little hard to guess.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/73adc72385c8b162391fbfb404f0b6d4c5cc55d7.1663683114.git.dxu@dxuuu.xyz
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/netfilter/nf_conntrack_bpf.h |  8 ++++----
 net/core/filter.c                        | 18 +++++++++---------
 net/netfilter/nf_conntrack_bpf.c         |  4 ++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index 9c07d2d59da5..1199d4f8e019 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -13,10 +13,10 @@ extern int register_nf_conntrack_bpf(void);
 extern void cleanup_nf_conntrack_bpf(void);
 
 extern struct mutex nf_conn_btf_access_lock;
-extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf,
-		       const struct btf_type *t, int off, int size,
-		       enum bpf_access_type atype, u32 *next_btf_id,
-		       enum bpf_type_flag *flag);
+extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
+				     const struct btf_type *t, int off, int size,
+				     enum bpf_access_type atype, u32 *next_btf_id,
+				     enum bpf_type_flag *flag);
 
 #else
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 4b2be211bcbe..2fd9449026aa 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8608,11 +8608,11 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 DEFINE_MUTEX(nf_conn_btf_access_lock);
 EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
 
-int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf,
-		const struct btf_type *t, int off, int size,
-		enum bpf_access_type atype, u32 *next_btf_id,
-		enum bpf_type_flag *flag);
-EXPORT_SYMBOL_GPL(nfct_bsa);
+int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
+			      const struct btf_type *t, int off, int size,
+			      enum bpf_access_type atype, u32 *next_btf_id,
+			      enum bpf_type_flag *flag);
+EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
 
 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 					const struct btf *btf,
@@ -8628,8 +8628,8 @@ static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
 					 flag);
 
 	mutex_lock(&nf_conn_btf_access_lock);
-	if (nfct_bsa)
-		ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag);
+	if (nfct_btf_struct_access)
+		ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
@@ -8708,8 +8708,8 @@ static int xdp_btf_struct_access(struct bpf_verifier_log *log,
 					 flag);
 
 	mutex_lock(&nf_conn_btf_access_lock);
-	if (nfct_bsa)
-		ret = nfct_bsa(log, btf, t, off, size, atype, next_btf_id, flag);
+	if (nfct_btf_struct_access)
+		ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag);
 	mutex_unlock(&nf_conn_btf_access_lock);
 
 	return ret;
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 77eb8e959f61..29c4efb3da5e 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -502,7 +502,7 @@ int register_nf_conntrack_bpf(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set);
 	if (!ret) {
 		mutex_lock(&nf_conn_btf_access_lock);
-		nfct_bsa = _nf_conntrack_btf_struct_access;
+		nfct_btf_struct_access = _nf_conntrack_btf_struct_access;
 		mutex_unlock(&nf_conn_btf_access_lock);
 	}
 
@@ -512,6 +512,6 @@ int register_nf_conntrack_bpf(void)
 void cleanup_nf_conntrack_bpf(void)
 {
 	mutex_lock(&nf_conn_btf_access_lock);
-	nfct_bsa = NULL;
+	nfct_btf_struct_access = NULL;
 	mutex_unlock(&nf_conn_btf_access_lock);
 }
-- 
cgit v1.2.3


From fdf214978a71b2749d26f6da2b1d51d9ac23831d Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Tue, 20 Sep 2022 08:15:24 -0600
Subject: bpf: Move nf_conn extern declarations to filter.h

We're seeing the following new warnings on netdev/build_32bit and
netdev/build_allmodconfig_warn CI jobs:

    ../net/core/filter.c:8608:1: warning: symbol
    'nf_conn_btf_access_lock' was not declared. Should it be static?
    ../net/core/filter.c:8611:5: warning: symbol 'nfct_bsa' was not
    declared. Should it be static?

Fix by ensuring extern declaration is present while compiling filter.o.

Fixes: 864b656f82cc ("bpf: Add support for writing to nf_conn:mark")
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Link: https://lore.kernel.org/r/2bd2e0283df36d8a4119605878edb1838d144174.1663683114.git.dxu@dxuuu.xyz
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/filter.h                   | 6 ++++++
 include/net/netfilter/nf_conntrack_bpf.h | 7 -------
 net/netfilter/nf_conntrack_bpf.c         | 1 +
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 75335432fcbc..98e28126c24b 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -567,6 +567,12 @@ struct sk_filter {
 
 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 
+extern struct mutex nf_conn_btf_access_lock;
+extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
+				     const struct btf_type *t, int off, int size,
+				     enum bpf_access_type atype, u32 *next_btf_id,
+				     enum bpf_type_flag *flag);
+
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
 					  unsigned int (*bpf_func)(const void *,
diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index 1199d4f8e019..c8b80add1142 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -4,7 +4,6 @@
 #define _NF_CONNTRACK_BPF_H
 
 #include <linux/kconfig.h>
-#include <linux/mutex.h>
 
 #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
     (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
@@ -12,12 +11,6 @@
 extern int register_nf_conntrack_bpf(void);
 extern void cleanup_nf_conntrack_bpf(void);
 
-extern struct mutex nf_conn_btf_access_lock;
-extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf,
-				     const struct btf_type *t, int off, int size,
-				     enum bpf_access_type atype, u32 *next_btf_id,
-				     enum bpf_type_flag *flag);
-
 #else
 
 static inline int register_nf_conntrack_bpf(void)
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 29c4efb3da5e..67df64283aef 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -9,6 +9,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
+#include <linux/filter.h>
 #include <linux/mutex.h>
 #include <linux/types.h>
 #include <linux/btf_ids.h>
-- 
cgit v1.2.3


From bb20da18ce936adda6b48aea79a8797c8eee479f Mon Sep 17 00:00:00 2001
From: Yihao Han <hanyihao@vivo.com>
Date: Mon, 19 Sep 2022 20:19:58 -0700
Subject: Bluetooth: MGMT: fix zalloc-simple.cocci warnings

Use zeroing allocator rather than allocator followed by memset with 0

Generated by: scripts/coccinelle/api/alloc/zalloc-simple.cocci

Signed-off-by: Yihao Han <hanyihao@vivo.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/mgmt.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index b9b64030a7b0..a92e7e485feb 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4397,12 +4397,10 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
 
 	/* Enough space for 7 features */
 	len = sizeof(*rp) + (sizeof(rp->features[0]) * 7);
-	rp = kmalloc(len, GFP_KERNEL);
+	rp = kzalloc(len, GFP_KERNEL);
 	if (!rp)
 		return -ENOMEM;
 
-	memset(rp, 0, len);
-
 #ifdef CONFIG_BT_FEATURE_DEBUG
 	if (!hdev) {
 		flags = bt_dbg_get() ? BIT(0) : 0;
-- 
cgit v1.2.3


From bc069da65eec7b5113b40432930152c9c1cd7f88 Mon Sep 17 00:00:00 2001
From: Rong Tao <rongtao@cestc.cn>
Date: Sun, 11 Sep 2022 21:03:30 +0800
Subject: samples/bpf: Replace blk_account_io_done() with
 __blk_account_io_done()

Since commit be6bfe36db17 ("block: inline hot paths of blk_account_io_*()")
blk_account_io_*() become inline functions.

Signed-off-by: Rong Tao <rtoax@foxmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/tencent_1CC476835C219FACD84B6715F0D785517E07@qq.com
---
 samples/bpf/task_fd_query_kern.c | 2 +-
 samples/bpf/task_fd_query_user.c | 2 +-
 samples/bpf/tracex3_kern.c       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
index c821294e1774..186ac0a79c0a 100644
--- a/samples/bpf/task_fd_query_kern.c
+++ b/samples/bpf/task_fd_query_kern.c
@@ -10,7 +10,7 @@ int bpf_prog1(struct pt_regs *ctx)
 	return 0;
 }
 
-SEC("kretprobe/blk_account_io_done")
+SEC("kretprobe/__blk_account_io_done")
 int bpf_prog2(struct pt_regs *ctx)
 {
 	return 0;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index 424718c0872c..a33d74bd3a4b 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -348,7 +348,7 @@ int main(int argc, char **argv)
 	/* test two functions in the corresponding *_kern.c file */
 	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request",
 					   BPF_FD_TYPE_KPROBE));
-	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_done",
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "__blk_account_io_done",
 					   BPF_FD_TYPE_KRETPROBE));
 
 	/* test nondebug fs kprobe */
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
index 710a4410b2fb..bde6591cb20c 100644
--- a/samples/bpf/tracex3_kern.c
+++ b/samples/bpf/tracex3_kern.c
@@ -49,7 +49,7 @@ struct {
 	__uint(max_entries, SLOTS);
 } lat_map SEC(".maps");
 
-SEC("kprobe/blk_account_io_done")
+SEC("kprobe/__blk_account_io_done")
 int bpf_prog2(struct pt_regs *ctx)
 {
 	long rq = PT_REGS_PARM1(ctx);
-- 
cgit v1.2.3


From 7620bffbf72cd66a5d18e444a143b5b5989efa87 Mon Sep 17 00:00:00 2001
From: Xin Liu <liuxin350@huawei.com>
Date: Sat, 17 Sep 2022 16:48:09 +0800
Subject: libbpf: Fix NULL pointer exception in API btf_dump__dump_type_data

We found that function btf_dump__dump_type_data can be called by the
user as an API, but in this function, the `opts` parameter may be used
as a null pointer.This causes `opts->indent_str` to trigger a NULL
pointer exception.

Fixes: 2ce8450ef5a3 ("libbpf: add bpf_object__open_{file, mem} w/ extensible opts")
Signed-off-by: Xin Liu <liuxin350@huawei.com>
Signed-off-by: Weibin Kong <kongweibin2@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220917084809.30770-1-liuxin350@huawei.com
---
 tools/lib/bpf/btf_dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c
index 627edb5bb6de..4221f73a74d0 100644
--- a/tools/lib/bpf/btf_dump.c
+++ b/tools/lib/bpf/btf_dump.c
@@ -2385,7 +2385,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id,
 	d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0);
 
 	/* default indent string is a tab */
-	if (!opts->indent_str)
+	if (!OPTS_GET(opts, indent_str, NULL))
 		d->typed_dump->indent_str[0] = '\t';
 	else
 		libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str,
-- 
cgit v1.2.3


From 91dc90fdb8b8199519a3aac9c46a433b02223c5b Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 21 Sep 2022 12:30:38 +0530
Subject: cpufreq: qcom-cpufreq-hw: Fix uninitialized throttled_freq warning

Commit 6240aaad75e1 was supposed to drop the reference count to the OPP,
instead it avoided more stuff if the OPP isn't found. This isn't
entirely correct. We already have a frequency value available, we just
couldn't align it with an OPP in case of IS_ERR(opp).

Lets continue with updating thermal pressure, etc, even if we aren't
able to find an OPP here.

This fixes warning generated by the 'smatch' tool.

Fixes: 6240aaad75e1 ("cpufreq: qcom-hw: fix the opp entries refcounting")
Cc: v5.18+ <stable@vger.kernel.org> # v5.18+
Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index d5ef3c66c762..bb32659820ce 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -316,14 +316,14 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
 	if (IS_ERR(opp)) {
 		dev_warn(dev, "Can't find the OPP for throttling: %pe!\n", opp);
 	} else {
-		throttled_freq = freq_hz / HZ_PER_KHZ;
-
-		/* Update thermal pressure (the boost frequencies are accepted) */
-		arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
-
 		dev_pm_opp_put(opp);
 	}
 
+	throttled_freq = freq_hz / HZ_PER_KHZ;
+
+	/* Update thermal pressure (the boost frequencies are accepted) */
+	arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
+
 	/*
 	 * In the unlikely case policy is unregistered do not enable
 	 * polling or h/w interrupt
-- 
cgit v1.2.3


From 676886010707fc656e2b407b2849883fbb63717e Mon Sep 17 00:00:00 2001
From: Sumit Gupta <sumitg@nvidia.com>
Date: Tue, 20 Sep 2022 16:36:46 +0530
Subject: cpufreq: tegra194: Add support for Tegra239

Adding support for Tegra239 SoC which has eight cores in
a single cluster. Also, moving num_clusters to SoC data
to avoid over allocating memory for four clusters always.

Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
Acked-by: Thierry Reding <treding@nvidia.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/tegra194-cpufreq.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c
index 7e143c06972e..cfc1b225f811 100644
--- a/drivers/cpufreq/tegra194-cpufreq.c
+++ b/drivers/cpufreq/tegra194-cpufreq.c
@@ -38,14 +38,6 @@
 /* cpufreq transisition latency */
 #define TEGRA_CPUFREQ_TRANSITION_LATENCY (300 * 1000) /* unit in nanoseconds */
 
-enum cluster {
-	CLUSTER0,
-	CLUSTER1,
-	CLUSTER2,
-	CLUSTER3,
-	MAX_CLUSTERS,
-};
-
 struct tegra_cpu_ctr {
 	u32 cpu;
 	u32 coreclk_cnt, last_coreclk_cnt;
@@ -67,12 +59,12 @@ struct tegra_cpufreq_ops {
 struct tegra_cpufreq_soc {
 	struct tegra_cpufreq_ops *ops;
 	int maxcpus_per_cluster;
+	unsigned int num_clusters;
 	phys_addr_t actmon_cntr_base;
 };
 
 struct tegra194_cpufreq_data {
 	void __iomem *regs;
-	size_t num_clusters;
 	struct cpufreq_frequency_table **tables;
 	const struct tegra_cpufreq_soc *soc;
 };
@@ -166,6 +158,14 @@ static const struct tegra_cpufreq_soc tegra234_cpufreq_soc = {
 	.ops = &tegra234_cpufreq_ops,
 	.actmon_cntr_base = 0x9000,
 	.maxcpus_per_cluster = 4,
+	.num_clusters = 3,
+};
+
+const struct tegra_cpufreq_soc tegra239_cpufreq_soc = {
+	.ops = &tegra234_cpufreq_ops,
+	.actmon_cntr_base = 0x4000,
+	.maxcpus_per_cluster = 8,
+	.num_clusters = 1,
 };
 
 static void tegra194_get_cpu_cluster_id(u32 cpu, u32 *cpuid, u32 *clusterid)
@@ -378,7 +378,7 @@ static int tegra194_cpufreq_init(struct cpufreq_policy *policy)
 
 	data->soc->ops->get_cpu_cluster_id(policy->cpu, NULL, &clusterid);
 
-	if (clusterid >= data->num_clusters || !data->tables[clusterid])
+	if (clusterid >= data->soc->num_clusters || !data->tables[clusterid])
 		return -EINVAL;
 
 	start_cpu = rounddown(policy->cpu, maxcpus_per_cluster);
@@ -429,6 +429,7 @@ static struct tegra_cpufreq_ops tegra194_cpufreq_ops = {
 static const struct tegra_cpufreq_soc tegra194_cpufreq_soc = {
 	.ops = &tegra194_cpufreq_ops,
 	.maxcpus_per_cluster = 2,
+	.num_clusters = 4,
 };
 
 static void tegra194_cpufreq_free_resources(void)
@@ -521,15 +522,14 @@ static int tegra194_cpufreq_probe(struct platform_device *pdev)
 
 	soc = of_device_get_match_data(&pdev->dev);
 
-	if (soc->ops && soc->maxcpus_per_cluster) {
+	if (soc->ops && soc->maxcpus_per_cluster && soc->num_clusters) {
 		data->soc = soc;
 	} else {
 		dev_err(&pdev->dev, "soc data missing\n");
 		return -EINVAL;
 	}
 
-	data->num_clusters = MAX_CLUSTERS;
-	data->tables = devm_kcalloc(&pdev->dev, data->num_clusters,
+	data->tables = devm_kcalloc(&pdev->dev, data->soc->num_clusters,
 				    sizeof(*data->tables), GFP_KERNEL);
 	if (!data->tables)
 		return -ENOMEM;
@@ -554,7 +554,7 @@ static int tegra194_cpufreq_probe(struct platform_device *pdev)
 		goto put_bpmp;
 	}
 
-	for (i = 0; i < data->num_clusters; i++) {
+	for (i = 0; i < data->soc->num_clusters; i++) {
 		data->tables[i] = init_freq_table(pdev, bpmp, i);
 		if (IS_ERR(data->tables[i])) {
 			err = PTR_ERR(data->tables[i]);
@@ -586,6 +586,7 @@ static int tegra194_cpufreq_remove(struct platform_device *pdev)
 static const struct of_device_id tegra194_cpufreq_of_match[] = {
 	{ .compatible = "nvidia,tegra194-ccplex", .data = &tegra194_cpufreq_soc },
 	{ .compatible = "nvidia,tegra234-ccplex-cluster", .data = &tegra234_cpufreq_soc },
+	{ .compatible = "nvidia,tegra239-ccplex-cluster", .data = &tegra239_cpufreq_soc },
 	{ /* sentinel */ }
 };
 
-- 
cgit v1.2.3


From 38e316398e4e6338b80223fb5f74415c0513718f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 19 Sep 2022 11:49:14 +0200
Subject: xattr: always us is_posix_acl_xattr() helper

The is_posix_acl_xattr() helper was added in 0c5fd887d2bb ("acl: move
idmapped mount fixup into vfs_{g,s}etxattr()") to remove the open-coded
checks for POSIX ACLs. We missed to update two locations. Switch them to
use the helper.

Cc: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
Reviewed-by: Seth Forshee (DigitalOcean) <sforshee@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xattr.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index 74fc8e021ebc..61107b6bbed2 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -583,9 +583,7 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
 static void setxattr_convert(struct user_namespace *mnt_userns,
 			     struct dentry *d, struct xattr_ctx *ctx)
 {
-	if (ctx->size &&
-		((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
-		(strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)))
+	if (ctx->size && is_posix_acl_xattr(ctx->kname->name))
 		posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
 }
 
@@ -701,8 +699,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
 
 	error = vfs_getxattr(mnt_userns, d, kname, ctx->kvalue, ctx->size);
 	if (error > 0) {
-		if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
-		    (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
+		if (is_posix_acl_xattr(kname))
 			posix_acl_fix_xattr_to_user(ctx->kvalue, error);
 		if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
 			error = -EFAULT;
-- 
cgit v1.2.3


From f92ff8f5dca2acfa475842c71212657b0703c3be Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 15 Sep 2022 13:10:40 +0200
Subject: x86/paravirt: Ensure proper alignment

The entries in the .parainstructions sections are 8 byte aligned and the
corresponding C struct paravirt_patch_site makes the array offset 16
bytes.

Though the pushed entries are only using 12 bytes, __parainstructions_end
is therefore 4 bytes short.

That works by chance because it's only used in a loop:

     for (p = start; p < end; p++)

But this falls flat when calculating the number of elements:

    n = end - start

That's obviously off by one.

Ensure that the gap is filled and the last entry is occupying 16 bytes.

  [ bp: Add the proper struct and section names. ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Juergen Gross <jgross@suse.com>
Link: https://lore.kernel.org/r/20220915111142.992398801@infradead.org
---
 arch/x86/include/asm/paravirt.h       | 1 +
 arch/x86/include/asm/paravirt_types.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 964442b99245..2a0b8dd4ec33 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -743,6 +743,7 @@ extern void default_banner(void);
 	 word 771b;				\
 	 .byte ptype;				\
 	 .byte 772b-771b;			\
+	 _ASM_ALIGN;				\
 	.popsection
 
 
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 89df6c6617f5..27b036a15f70 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,6 +294,7 @@ extern struct paravirt_patch_template pv_ops;
 	"  .byte " type "\n"				\
 	"  .byte 772b-771b\n"				\
 	"  .short " clobber "\n"			\
+	_ASM_ALIGN "\n"					\
 	".popsection\n"
 
 /* Generate patchable code, with the default asm parameters. */
-- 
cgit v1.2.3


From 3a74904ceff3ecdb9d6cc0844ed67df417968eb6 Mon Sep 17 00:00:00 2001
From: William Dean <williamsukatube@163.com>
Date: Sat, 17 Sep 2022 16:42:48 +0800
Subject: bpf: simplify code in btf_parse_hdr

It could directly return 'btf_check_sec_info' to simplify code.

Signed-off-by: William Dean <williamsukatube@163.com>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/r/20220917084248.3649-1-williamsukatube@163.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/bpf/btf.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index b3940c605aac..6ccd4f4d731e 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -4854,7 +4854,6 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 	u32 hdr_len, hdr_copy, btf_data_size;
 	const struct btf_header *hdr;
 	struct btf *btf;
-	int err;
 
 	btf = env->btf;
 	btf_data_size = btf->data_size;
@@ -4911,11 +4910,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
 		return -EINVAL;
 	}
 
-	err = btf_check_sec_info(env, btf_data_size);
-	if (err)
-		return err;
-
-	return 0;
+	return btf_check_sec_info(env, btf_data_size);
 }
 
 static int btf_check_type_tags(struct btf_verifier_env *env,
-- 
cgit v1.2.3


From 0e426a3ae030a9e891899370229e117158b35de6 Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Wed, 21 Sep 2022 10:46:02 +0000
Subject: bpf, cgroup: Reject prog_attach_flags array when effective query

Attach flags is only valid for attached progs of this layer cgroup,
but not for effective progs. For querying with EFFECTIVE flags,
exporting attach flags does not make sense. So when effective query,
we reject prog_attach_flags array and don't need to populate it.
Also we limit attach_flags to output 0 during effective query.

Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Link: https://lore.kernel.org/r/20220921104604.2340580-2-pulehui@huaweicloud.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/uapi/linux/bpf.h       |  7 +++++--
 kernel/bpf/cgroup.c            | 28 ++++++++++++++++++----------
 tools/include/uapi/linux/bpf.h |  7 +++++--
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59a217ca2dfd..4eff7fc7ae58 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1233,7 +1233,7 @@ enum {
 
 /* Query effective (directly attached + inherited from ancestor cgroups)
  * programs that will be executed for events within a cgroup.
- * attach_flags with this flag are returned only for directly attached programs.
+ * attach_flags with this flag are always returned 0.
  */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
@@ -1432,7 +1432,10 @@ union bpf_attr {
 		__u32		attach_flags;
 		__aligned_u64	prog_ids;
 		__u32		prog_cnt;
-		__aligned_u64	prog_attach_flags; /* output: per-program attach_flags */
+		/* output: per-program attach_flags.
+		 * not allowed to be set during effective query.
+		 */
+		__aligned_u64	prog_attach_flags;
 	} query;
 
 	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4a400cd63731..22888aaa68b6 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1020,6 +1020,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 			      union bpf_attr __user *uattr)
 {
 	__u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
+	bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE;
 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
 	enum bpf_attach_type type = attr->query.attach_type;
 	enum cgroup_bpf_attach_type from_atype, to_atype;
@@ -1029,8 +1030,12 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	int total_cnt = 0;
 	u32 flags;
 
+	if (effective_query && prog_attach_flags)
+		return -EINVAL;
+
 	if (type == BPF_LSM_CGROUP) {
-		if (attr->query.prog_cnt && prog_ids && !prog_attach_flags)
+		if (!effective_query && attr->query.prog_cnt &&
+		    prog_ids && !prog_attach_flags)
 			return -EINVAL;
 
 		from_atype = CGROUP_LSM_START;
@@ -1045,7 +1050,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	}
 
 	for (atype = from_atype; atype <= to_atype; atype++) {
-		if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		if (effective_query) {
 			effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
 							      lockdep_is_held(&cgroup_mutex));
 			total_cnt += bpf_prog_array_length(effective);
@@ -1054,6 +1059,8 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 		}
 	}
 
+	/* always output uattr->query.attach_flags as 0 during effective query */
+	flags = effective_query ? 0 : flags;
 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
 		return -EFAULT;
 	if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
@@ -1068,7 +1075,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	}
 
 	for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
-		if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		if (effective_query) {
 			effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
 							      lockdep_is_held(&cgroup_mutex));
 			cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
@@ -1090,15 +1097,16 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 				if (++i == cnt)
 					break;
 			}
-		}
 
-		if (prog_attach_flags) {
-			flags = cgrp->bpf.flags[atype];
+			if (prog_attach_flags) {
+				flags = cgrp->bpf.flags[atype];
 
-			for (i = 0; i < cnt; i++)
-				if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags)))
-					return -EFAULT;
-			prog_attach_flags += cnt;
+				for (i = 0; i < cnt; i++)
+					if (copy_to_user(prog_attach_flags + i,
+							 &flags, sizeof(flags)))
+						return -EFAULT;
+				prog_attach_flags += cnt;
+			}
 		}
 
 		prog_ids += cnt;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 59a217ca2dfd..4eff7fc7ae58 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1233,7 +1233,7 @@ enum {
 
 /* Query effective (directly attached + inherited from ancestor cgroups)
  * programs that will be executed for events within a cgroup.
- * attach_flags with this flag are returned only for directly attached programs.
+ * attach_flags with this flag are always returned 0.
  */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
@@ -1432,7 +1432,10 @@ union bpf_attr {
 		__u32		attach_flags;
 		__aligned_u64	prog_ids;
 		__u32		prog_cnt;
-		__aligned_u64	prog_attach_flags; /* output: per-program attach_flags */
+		/* output: per-program attach_flags.
+		 * not allowed to be set during effective query.
+		 */
+		__aligned_u64	prog_attach_flags;
 	} query;
 
 	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
-- 
cgit v1.2.3


From bdcee1b0b0834d031c76a12209840afe949b048a Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Wed, 21 Sep 2022 10:46:03 +0000
Subject: bpftool: Fix wrong cgroup attach flags being assigned to effective
 progs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When root-cgroup attach multi progs and sub-cgroup attach a override prog,
bpftool will display incorrectly for the attach flags of the sub-cgroup’s
effective progs:

$ bpftool cgroup tree /sys/fs/cgroup effective
CgroupPath
ID       AttachType      AttachFlags     Name
/sys/fs/cgroup
6        cgroup_sysctl   multi           sysctl_tcp_mem
13       cgroup_sysctl   multi           sysctl_tcp_mem
/sys/fs/cgroup/cg1
20       cgroup_sysctl   override        sysctl_tcp_mem
6        cgroup_sysctl   override        sysctl_tcp_mem <- wrong
13       cgroup_sysctl   override        sysctl_tcp_mem <- wrong
/sys/fs/cgroup/cg1/cg2
20       cgroup_sysctl                   sysctl_tcp_mem
6        cgroup_sysctl                   sysctl_tcp_mem
13       cgroup_sysctl                   sysctl_tcp_mem

Attach flags is only valid for attached progs of this layer cgroup,
but not for effective progs. For querying with EFFECTIVE flags,
exporting attach flags does not make sense. So let's remove the
AttachFlags field and the associated logic. After this patch, the
above effective cgroup tree will show as bellow:

$ bpftool cgroup tree /sys/fs/cgroup effective
CgroupPath
ID       AttachType      Name
/sys/fs/cgroup
6        cgroup_sysctl   sysctl_tcp_mem
13       cgroup_sysctl   sysctl_tcp_mem
/sys/fs/cgroup/cg1
20       cgroup_sysctl   sysctl_tcp_mem
6        cgroup_sysctl   sysctl_tcp_mem
13       cgroup_sysctl   sysctl_tcp_mem
/sys/fs/cgroup/cg1/cg2
20       cgroup_sysctl   sysctl_tcp_mem
6        cgroup_sysctl   sysctl_tcp_mem
13       cgroup_sysctl   sysctl_tcp_mem

Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP")
Fixes: a98bf57391a2 ("tools: bpftool: add support for reporting the effective cgroup progs")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Link: https://lore.kernel.org/r/20220921104604.2340580-3-pulehui@huaweicloud.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/bpf/bpftool/cgroup.c | 54 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/cgroup.c b/tools/bpf/bpftool/cgroup.c
index cced668fb2a3..b46a998d8f8d 100644
--- a/tools/bpf/bpftool/cgroup.c
+++ b/tools/bpf/bpftool/cgroup.c
@@ -136,8 +136,8 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
 			jsonw_string_field(json_wtr, "attach_type", attach_type_str);
 		else
 			jsonw_uint_field(json_wtr, "attach_type", attach_type);
-		jsonw_string_field(json_wtr, "attach_flags",
-				   attach_flags_str);
+		if (!(query_flags & BPF_F_QUERY_EFFECTIVE))
+			jsonw_string_field(json_wtr, "attach_flags", attach_flags_str);
 		jsonw_string_field(json_wtr, "name", prog_name);
 		if (attach_btf_name)
 			jsonw_string_field(json_wtr, "attach_btf_name", attach_btf_name);
@@ -150,7 +150,10 @@ static int show_bpf_prog(int id, enum bpf_attach_type attach_type,
 			printf("%-15s", attach_type_str);
 		else
 			printf("type %-10u", attach_type);
-		printf(" %-15s %-15s", attach_flags_str, prog_name);
+		if (query_flags & BPF_F_QUERY_EFFECTIVE)
+			printf(" %-15s", prog_name);
+		else
+			printf(" %-15s %-15s", attach_flags_str, prog_name);
 		if (attach_btf_name)
 			printf(" %-15s", attach_btf_name);
 		else if (info.attach_btf_id)
@@ -195,6 +198,32 @@ static int cgroup_has_attached_progs(int cgroup_fd)
 
 	return no_prog ? 0 : 1;
 }
+
+static int show_effective_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
+				    int level)
+{
+	LIBBPF_OPTS(bpf_prog_query_opts, p);
+	__u32 prog_ids[1024] = {0};
+	__u32 iter;
+	int ret;
+
+	p.query_flags = query_flags;
+	p.prog_cnt = ARRAY_SIZE(prog_ids);
+	p.prog_ids = prog_ids;
+
+	ret = bpf_prog_query_opts(cgroup_fd, type, &p);
+	if (ret)
+		return ret;
+
+	if (p.prog_cnt == 0)
+		return 0;
+
+	for (iter = 0; iter < p.prog_cnt; iter++)
+		show_bpf_prog(prog_ids[iter], type, NULL, level);
+
+	return 0;
+}
+
 static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
 				   int level)
 {
@@ -245,6 +274,14 @@ static int show_attached_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
 	return 0;
 }
 
+static int show_bpf_progs(int cgroup_fd, enum bpf_attach_type type,
+			  int level)
+{
+	return query_flags & BPF_F_QUERY_EFFECTIVE ?
+	       show_effective_bpf_progs(cgroup_fd, type, level) :
+	       show_attached_bpf_progs(cgroup_fd, type, level);
+}
+
 static int do_show(int argc, char **argv)
 {
 	enum bpf_attach_type type;
@@ -292,6 +329,8 @@ static int do_show(int argc, char **argv)
 
 	if (json_output)
 		jsonw_start_array(json_wtr);
+	else if (query_flags & BPF_F_QUERY_EFFECTIVE)
+		printf("%-8s %-15s %-15s\n", "ID", "AttachType", "Name");
 	else
 		printf("%-8s %-15s %-15s %-15s\n", "ID", "AttachType",
 		       "AttachFlags", "Name");
@@ -304,7 +343,7 @@ static int do_show(int argc, char **argv)
 		 * If we were able to get the show for at least one
 		 * attach type, let's return 0.
 		 */
-		if (show_attached_bpf_progs(cgroup_fd, type, 0) == 0)
+		if (show_bpf_progs(cgroup_fd, type, 0) == 0)
 			ret = 0;
 	}
 
@@ -362,7 +401,7 @@ static int do_show_tree_fn(const char *fpath, const struct stat *sb,
 
 	btf_vmlinux = libbpf_find_kernel_btf();
 	for (type = 0; type < __MAX_BPF_ATTACH_TYPE; type++)
-		show_attached_bpf_progs(cgroup_fd, type, ftw->level);
+		show_bpf_progs(cgroup_fd, type, ftw->level);
 
 	if (errno == EINVAL)
 		/* Last attach type does not support query.
@@ -436,6 +475,11 @@ static int do_show_tree(int argc, char **argv)
 
 	if (json_output)
 		jsonw_start_array(json_wtr);
+	else if (query_flags & BPF_F_QUERY_EFFECTIVE)
+		printf("%s\n"
+		       "%-8s %-15s %-15s\n",
+		       "CgroupPath",
+		       "ID", "AttachType", "Name");
 	else
 		printf("%s\n"
 		       "%-8s %-15s %-15s %-15s\n",
-- 
cgit v1.2.3


From d2aa993b7d9de6deeb1df6c9a6b9b6193c337cc6 Mon Sep 17 00:00:00 2001
From: Pu Lehui <pulehui@huawei.com>
Date: Wed, 21 Sep 2022 10:46:04 +0000
Subject: selftests/bpf: Adapt cgroup effective query uapi change

The attach flags is meaningless for effective query and
its value will always be set as 0 during effective query.
Root cg's effective progs is always its attached progs,
so we use non-effective query to get its progs count and
attach flags. And we don't need the remain attach flags
check.

Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP")
Signed-off-by: Pu Lehui <pulehui@huawei.com>
Link: https://lore.kernel.org/r/20220921104604.2340580-4-pulehui@huaweicloud.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/prog_tests/cgroup_link.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
index 9e6e6aad347c..15093a69510e 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
@@ -71,10 +71,9 @@ void serial_test_cgroup_link(void)
 
 	ping_and_check(cg_nr, 0);
 
-	/* query the number of effective progs and attach flags in root cg */
+	/* query the number of attached progs and attach flags in root cg */
 	err = bpf_prog_query(cgs[0].fd, BPF_CGROUP_INET_EGRESS,
-			     BPF_F_QUERY_EFFECTIVE, &attach_flags, NULL,
-			     &prog_cnt);
+			     0, &attach_flags, NULL, &prog_cnt);
 	CHECK_FAIL(err);
 	CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
 	if (CHECK(prog_cnt != 1, "effect_cnt", "exp %d, got %d\n", 1, prog_cnt))
@@ -85,17 +84,15 @@ void serial_test_cgroup_link(void)
 			     BPF_F_QUERY_EFFECTIVE, NULL, NULL,
 			     &prog_cnt);
 	CHECK_FAIL(err);
-	CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
 	if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n",
 		  cg_nr, prog_cnt))
 		goto cleanup;
 
 	/* query the effective prog IDs in last cg */
 	err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS,
-			     BPF_F_QUERY_EFFECTIVE, &attach_flags,
-			     prog_ids, &prog_cnt);
+			     BPF_F_QUERY_EFFECTIVE, NULL, prog_ids,
+			     &prog_cnt);
 	CHECK_FAIL(err);
-	CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
 	if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n",
 		  cg_nr, prog_cnt))
 		goto cleanup;
-- 
cgit v1.2.3


From 2d93540014387d1c73b9ccc4d7895320df66d01b Mon Sep 17 00:00:00 2001
From: Chao Qin <chao.qin@intel.com>
Date: Tue, 20 Sep 2022 14:08:26 +0800
Subject: powercap: intel_rapl: fix UBSAN shift-out-of-bounds issue

When value < time_unit, the parameter of ilog2() will be zero and
the return value is -1. u64(-1) is too large for shift exponent
and then will trigger shift-out-of-bounds:

shift exponent 18446744073709551615 is too large for 32-bit type 'int'
Call Trace:
 rapl_compute_time_window_core
 rapl_write_data_raw
 set_time_window
 store_constraint_time_window_us

Signed-off-by: Chao Qin <chao.qin@intel.com>
Acked-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 6812c1ca3100..4688d0647b22 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -994,6 +994,9 @@ static u64 rapl_compute_time_window_core(struct rapl_package *rp, u64 value,
 		y = value & 0x1f;
 		value = (1 << y) * (4 + f) * rp->time_unit / 4;
 	} else {
+		if (value < rp->time_unit)
+			return 0;
+
 		do_div(value, rp->time_unit);
 		y = ilog2(value);
 		f = div64_u64(4 * (value - (1 << y)), 1 << y);
-- 
cgit v1.2.3


From 68b99e94a4a2db6ba9b31fe0485e057b9354a640 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 20 Sep 2022 04:06:57 -0700
Subject: thermal: intel_powerclamp: Use get_cpu() instead of
 smp_processor_id() to avoid crash

When CPU 0 is offline and intel_powerclamp is used to inject
idle, it generates kernel BUG:

BUG: using smp_processor_id() in preemptible [00000000] code: bash/15687
caller is debug_smp_processor_id+0x17/0x20
CPU: 4 PID: 15687 Comm: bash Not tainted 5.19.0-rc7+ #57
Call Trace:
<TASK>
dump_stack_lvl+0x49/0x63
dump_stack+0x10/0x16
check_preemption_disabled+0xdd/0xe0
debug_smp_processor_id+0x17/0x20
powerclamp_set_cur_state+0x7f/0xf9 [intel_powerclamp]
...
...

Here CPU 0 is the control CPU by default and changed to the current CPU,
if CPU 0 offlined. This check has to be performed under cpus_read_lock(),
hence the above warning.

Use get_cpu() instead of smp_processor_id() to avoid this BUG.

Suggested-by: Chen Yu <yu.c.chen@intel.com>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel/intel_powerclamp.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index c841ab37e7c6..46cd799af148 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -532,8 +532,10 @@ static int start_power_clamp(void)
 
 	/* prefer BSP */
 	control_cpu = 0;
-	if (!cpu_online(control_cpu))
-		control_cpu = smp_processor_id();
+	if (!cpu_online(control_cpu)) {
+		control_cpu = get_cpu();
+		put_cpu();
+	}
 
 	clamping = true;
 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
-- 
cgit v1.2.3


From c4e927da893b2d9ebe71ac206ed021df363771e4 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 20 Sep 2022 04:08:02 -0700
Subject: thermal: intel_powerclamp: Remove accounting for IRQ wakes

There is a static variable "idle_wakeup_counter", which accounts for
number of wake ups because of IRQs and take actions to compensate idle
injection. This is now read and reset to 0, but never incremented.
So all the usage of this counter for idle injection has no use.

Also another static variable "reduce_irq", which depends on
"idle_wakeup_counter", so remove usage of "reduce_irq" also.

Commit feb6cd6a0f9f ("thermal/intel_powerclamp: stop sched tick in
forced idle") replaced the local use of "mwait_idle_with_hints" with
play_idle(). This removed possibility of updating "idle_wakeup_counter"
without change in play_idle(). This change was made in Linux 4.10.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel/intel_powerclamp.c | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c
index 46cd799af148..2a5570b9799a 100644
--- a/drivers/thermal/intel/intel_powerclamp.c
+++ b/drivers/thermal/intel/intel_powerclamp.c
@@ -62,8 +62,7 @@ static struct dentry *debug_dir;
 static unsigned int set_target_ratio;
 static unsigned int current_ratio;
 static bool should_skip;
-static bool reduce_irq;
-static atomic_t idle_wakeup_counter;
+
 static unsigned int control_cpu; /* The cpu assigned to collect stat and update
 				  * control parameters. default to BSP but BSP
 				  * can be offlined.
@@ -285,9 +284,6 @@ static unsigned int get_compensation(int ratio)
 			cal_data[ratio + 1].steady_comp) / 3;
 	}
 
-	/* REVISIT: simple penalty of double idle injection */
-	if (reduce_irq)
-		comp = ratio;
 	/* do not exceed limit */
 	if (comp + ratio >= MAX_TARGET_RATIO)
 		comp = MAX_TARGET_RATIO - ratio - 1;
@@ -301,13 +297,9 @@ static void adjust_compensation(int target_ratio, unsigned int win)
 	struct powerclamp_calibration_data *d = &cal_data[target_ratio];
 
 	/*
-	 * adjust compensations if confidence level has not been reached or
-	 * there are too many wakeups during the last idle injection period, we
-	 * cannot trust the data for compensation.
+	 * adjust compensations if confidence level has not been reached.
 	 */
-	if (d->confidence >= CONFIDENCE_OK ||
-		atomic_read(&idle_wakeup_counter) >
-		win * num_online_cpus())
+	if (d->confidence >= CONFIDENCE_OK)
 		return;
 
 	delta = set_target_ratio - current_ratio;
@@ -347,14 +339,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 	tsc_last = tsc_now;
 
 	adjust_compensation(target_ratio, win);
-	/*
-	 * too many external interrupts, set flag such
-	 * that we can take measure later.
-	 */
-	reduce_irq = atomic_read(&idle_wakeup_counter) >=
-		2 * win * num_online_cpus();
 
-	atomic_set(&idle_wakeup_counter, 0);
 	/* if we are above target+guard, skip */
 	return set_target_ratio + guard <= current_ratio;
 }
-- 
cgit v1.2.3


From 65c0c2367e9e695b13d0d39307ebc8818bcdbd8d Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 20 Sep 2022 20:58:48 +0800
Subject: intel_idle: Add AlderLake-N support

Similar to the other other AlderLake platforms, the C1 and C1E states on
ADL-N are mutually exclusive. Only one of them can be enabled at a time.

C1E is preferred on ADL-N for better energy efficiency.

C6S is also supported on this platform. Its latency is far bigger than
C6, but really close to C8 (PC8), thus it is not exposed as a separate
state.

Suggested-by: Baieswara Reddy Sagili <baieswara.reddy.sagili@intel.com>
Suggested-by: Vinay Kumar <vinay.kumar@intel.com>
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/idle/intel_idle.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 114bd6631578..cfeb24d40d37 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -928,6 +928,51 @@ static struct cpuidle_state adl_l_cstates[] __initdata = {
 		.enter = NULL }
 };
 
+static struct cpuidle_state adl_n_cstates[] __initdata = {
+	{
+		.name = "C1",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE,
+		.exit_latency = 1,
+		.target_residency = 1,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C1E",
+		.desc = "MWAIT 0x01",
+		.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+		.exit_latency = 2,
+		.target_residency = 4,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C6",
+		.desc = "MWAIT 0x20",
+		.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 195,
+		.target_residency = 585,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C8",
+		.desc = "MWAIT 0x40",
+		.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 260,
+		.target_residency = 1040,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.name = "C10",
+		.desc = "MWAIT 0x60",
+		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 660,
+		.target_residency = 1980,
+		.enter = &intel_idle,
+		.enter_s2idle = intel_idle_s2idle, },
+	{
+		.enter = NULL }
+};
+
 static struct cpuidle_state spr_cstates[] __initdata = {
 	{
 		.name = "C1",
@@ -1309,6 +1354,10 @@ static const struct idle_cpu idle_cpu_adl_l __initconst = {
 	.state_table = adl_l_cstates,
 };
 
+static const struct idle_cpu idle_cpu_adl_n __initconst = {
+	.state_table = adl_n_cstates,
+};
+
 static const struct idle_cpu idle_cpu_spr __initconst = {
 	.state_table = spr_cstates,
 	.disable_promotion_to_c1e = true,
@@ -1379,6 +1428,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,		&idle_cpu_icx),
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&idle_cpu_adl),
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&idle_cpu_adl_l),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N,		&idle_cpu_adl_n),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&idle_cpu_spr),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,	&idle_cpu_knl),
 	X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,	&idle_cpu_knl),
@@ -1816,6 +1866,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
 		break;
 	case INTEL_FAM6_ALDERLAKE:
 	case INTEL_FAM6_ALDERLAKE_L:
+	case INTEL_FAM6_ALDERLAKE_N:
 		adl_idle_state_table_update();
 		break;
 	}
-- 
cgit v1.2.3


From 8addbfc7b308d591f8a5f2f6bb24d08d9d79dfbb Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 21 Sep 2022 16:35:50 +0200
Subject: bpf: Gate dynptr API behind CAP_BPF

This has been enabled for unprivileged programs for only one kernel
release, hence the expected annoyances due to this move are low. Users
using ringbuf can stick to non-dynptr APIs. The actual use cases dynptr
is meant to serve may not make sense in unprivileged BPF programs.

Hence, gate these helpers behind CAP_BPF and limit use to privileged
BPF programs.

Fixes: 263ae152e962 ("bpf: Add bpf_dynptr_from_mem for local dynptrs")
Fixes: bc34dee65a65 ("bpf: Dynptr support for ring buffers")
Fixes: 13bbbfbea759 ("bpf: Add bpf_dynptr_read and bpf_dynptr_write")
Fixes: 34d4ef5775f7 ("bpf: Add dynptr data slices")
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220921143550.30247-1-memxor@gmail.com
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1f961f9982d2..3814b0fd3a2c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1627,26 +1627,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_ringbuf_discard_proto;
 	case BPF_FUNC_ringbuf_query:
 		return &bpf_ringbuf_query_proto;
-	case BPF_FUNC_ringbuf_reserve_dynptr:
-		return &bpf_ringbuf_reserve_dynptr_proto;
-	case BPF_FUNC_ringbuf_submit_dynptr:
-		return &bpf_ringbuf_submit_dynptr_proto;
-	case BPF_FUNC_ringbuf_discard_dynptr:
-		return &bpf_ringbuf_discard_dynptr_proto;
 	case BPF_FUNC_for_each_map_elem:
 		return &bpf_for_each_map_elem_proto;
 	case BPF_FUNC_loop:
 		return &bpf_loop_proto;
 	case BPF_FUNC_strncmp:
 		return &bpf_strncmp_proto;
-	case BPF_FUNC_dynptr_from_mem:
-		return &bpf_dynptr_from_mem_proto;
-	case BPF_FUNC_dynptr_read:
-		return &bpf_dynptr_read_proto;
-	case BPF_FUNC_dynptr_write:
-		return &bpf_dynptr_write_proto;
-	case BPF_FUNC_dynptr_data:
-		return &bpf_dynptr_data_proto;
 	default:
 		break;
 	}
@@ -1675,6 +1661,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_timer_cancel_proto;
 	case BPF_FUNC_kptr_xchg:
 		return &bpf_kptr_xchg_proto;
+	case BPF_FUNC_ringbuf_reserve_dynptr:
+		return &bpf_ringbuf_reserve_dynptr_proto;
+	case BPF_FUNC_ringbuf_submit_dynptr:
+		return &bpf_ringbuf_submit_dynptr_proto;
+	case BPF_FUNC_ringbuf_discard_dynptr:
+		return &bpf_ringbuf_discard_dynptr_proto;
+	case BPF_FUNC_dynptr_from_mem:
+		return &bpf_dynptr_from_mem_proto;
+	case BPF_FUNC_dynptr_read:
+		return &bpf_dynptr_read_proto;
+	case BPF_FUNC_dynptr_write:
+		return &bpf_dynptr_write_proto;
+	case BPF_FUNC_dynptr_data:
+		return &bpf_dynptr_data_proto;
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 448a496f760664d3e2e79466aa1787e6abc922b5 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 19 Sep 2022 10:56:59 -0700
Subject: Bluetooth: hci_sysfs: Fix attempting to call device_add multiple
 times

device_add shall not be called multiple times as stated in its
documentation:

 'Do not call this routine or device_register() more than once for
 any device structure'

Syzkaller reports a bug as follows [1]:
------------[ cut here ]------------
kernel BUG at lib/list_debug.c:33!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
[...]
Call Trace:
 <TASK>
 __list_add include/linux/list.h:69 [inline]
 list_add_tail include/linux/list.h:102 [inline]
 kobj_kset_join lib/kobject.c:164 [inline]
 kobject_add_internal+0x18f/0x8f0 lib/kobject.c:214
 kobject_add_varg lib/kobject.c:358 [inline]
 kobject_add+0x150/0x1c0 lib/kobject.c:410
 device_add+0x368/0x1e90 drivers/base/core.c:3452
 hci_conn_add_sysfs+0x9b/0x1b0 net/bluetooth/hci_sysfs.c:53
 hci_le_cis_estabilished_evt+0x57c/0xae0 net/bluetooth/hci_event.c:6799
 hci_le_meta_evt+0x2b8/0x510 net/bluetooth/hci_event.c:7110
 hci_event_func net/bluetooth/hci_event.c:7440 [inline]
 hci_event_packet+0x63d/0xfd0 net/bluetooth/hci_event.c:7495
 hci_rx_work+0xae7/0x1230 net/bluetooth/hci_core.c:4007
 process_one_work+0x991/0x1610 kernel/workqueue.c:2289
 worker_thread+0x665/0x1080 kernel/workqueue.c:2436
 kthread+0x2e4/0x3a0 kernel/kthread.c:376
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306
 </TASK>

Link: https://syzkaller.appspot.com/bug?id=da3246e2d33afdb92d66bc166a0934c5b146404a
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Tested-by: Hawkins Jiawei <yin31149@gmail.com>
---
 net/bluetooth/hci_sysfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 4e3e0451b08c..08542dfc2dc5 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -48,6 +48,9 @@ void hci_conn_add_sysfs(struct hci_conn *conn)
 
 	BT_DBG("conn %p", conn);
 
+	if (device_is_registered(&conn->dev))
+		return;
+
 	dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);
 
 	if (device_add(&conn->dev) < 0) {
-- 
cgit v1.2.3


From 7096daba731eea262e0f7bf03453ceddcad89f70 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 19 Sep 2022 10:57:00 -0700
Subject: Bluetooth: hci_debugfs: Fix not checking conn->debugfs

hci_debugfs_create_conn shall check if conn->debugfs has already been
created and don't attempt to overwrite it.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 902b40a90b91..3f401ec5bb0c 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -1245,7 +1245,7 @@ void hci_debugfs_create_conn(struct hci_conn *conn)
 	struct hci_dev *hdev = conn->hdev;
 	char name[6];
 
-	if (IS_ERR_OR_NULL(hdev->debugfs))
+	if (IS_ERR_OR_NULL(hdev->debugfs) || conn->debugfs)
 		return;
 
 	snprintf(name, sizeof(name), "%u", conn->handle);
-- 
cgit v1.2.3


From ed680f925aea76ac666f34d9923cb40558f4e97b Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 19 Sep 2022 11:10:17 -0700
Subject: Bluetooth: hci_event: Make sure ISO events don't affect non-ISO
 connections

ISO events (CIS/BIS) shall only be relevant for connection with link
type of ISO_LINK, otherwise the controller is probably buggy or it is
the result of fuzzer tools such as syzkaller.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_event.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index ed3e5b251af1..faca701bce2a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6791,6 +6791,13 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data,
 		goto unlock;
 	}
 
+	if (conn->type != ISO_LINK) {
+		bt_dev_err(hdev,
+			   "Invalid connection link type handle 0x%4.4x",
+			   handle);
+		goto unlock;
+	}
+
 	if (conn->role == HCI_ROLE_SLAVE) {
 		__le32 interval;
 
@@ -6911,6 +6918,13 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
 	if (!conn)
 		goto unlock;
 
+	if (conn->type != ISO_LINK) {
+		bt_dev_err(hdev,
+			   "Invalid connection link type handle 0x%2.2x",
+			   ev->handle);
+		goto unlock;
+	}
+
 	if (ev->num_bis)
 		conn->handle = __le16_to_cpu(ev->bis_handle[0]);
 
-- 
cgit v1.2.3


From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 19 Sep 2022 19:00:57 -0500
Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type

We want to support a ringbuf map type where samples are published from
user-space, to be consumed by BPF programs. BPF currently supports a
kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF
map type.  We'll need to define a new map type for user-space -> kernel,
as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply
to a user-space producer ring buffer, and we'll want to add one or
more helper functions that would not apply for a kernel-producer
ring buffer.

This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type
definition. The map type is useless in its current form, as there is no
way to access or use it for anything until we one or more BPF helpers. A
follow-on patch will therefore add a new helper function that allows BPF
programs to run callbacks on samples that are published to the ring
buffer.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com
---
 include/linux/bpf_types.h                       |  1 +
 include/uapi/linux/bpf.h                        |  1 +
 kernel/bpf/ringbuf.c                            | 62 ++++++++++++++++++++++---
 kernel/bpf/verifier.c                           |  3 ++
 tools/bpf/bpftool/Documentation/bpftool-map.rst |  2 +-
 tools/bpf/bpftool/map.c                         |  2 +-
 tools/include/uapi/linux/bpf.h                  |  1 +
 tools/lib/bpf/libbpf.c                          |  1 +
 8 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 2b9112b80171..2c6a4f2562a7 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
 #endif
 BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops)
 
 BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3df78c56c1bf..e18c85324db6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -928,6 +928,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
 	BPF_MAP_TYPE_BLOOM_FILTER,
+	BPF_MAP_TYPE_USER_RINGBUF,
 };
 
 /* Note that tracing related programs such as
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index b483aea35f41..754e915748fb 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -38,10 +38,27 @@ struct bpf_ringbuf {
 	struct page **pages;
 	int nr_pages;
 	spinlock_t spinlock ____cacheline_aligned_in_smp;
-	/* Consumer and producer counters are put into separate pages to allow
-	 * mapping consumer page as r/w, but restrict producer page to r/o.
-	 * This protects producer position from being modified by user-space
-	 * application and ruining in-kernel position tracking.
+	/* Consumer and producer counters are put into separate pages to
+	 * allow each position to be mapped with different permissions.
+	 * This prevents a user-space application from modifying the
+	 * position and ruining in-kernel tracking. The permissions of the
+	 * pages depend on who is producing samples: user-space or the
+	 * kernel.
+	 *
+	 * Kernel-producer
+	 * ---------------
+	 * The producer position and data pages are mapped as r/o in
+	 * userspace. For this approach, bits in the header of samples are
+	 * used to signal to user-space, and to other producers, whether a
+	 * sample is currently being written.
+	 *
+	 * User-space producer
+	 * -------------------
+	 * Only the page containing the consumer position is mapped r/o in
+	 * user-space. User-space producers also use bits of the header to
+	 * communicate to the kernel, but the kernel must carefully check and
+	 * validate each sample to ensure that they're correctly formatted, and
+	 * fully contained within the ring buffer.
 	 */
 	unsigned long consumer_pos __aligned(PAGE_SIZE);
 	unsigned long producer_pos __aligned(PAGE_SIZE);
@@ -224,7 +241,7 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
 	return -ENOTSUPP;
 }
 
-static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
 {
 	struct bpf_ringbuf_map *rb_map;
 
@@ -242,6 +259,26 @@ static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
 				   vma->vm_pgoff + RINGBUF_PGOFF);
 }
 
+static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
+{
+	struct bpf_ringbuf_map *rb_map;
+
+	rb_map = container_of(map, struct bpf_ringbuf_map, map);
+
+	if (vma->vm_flags & VM_WRITE) {
+		if (vma->vm_pgoff == 0)
+			/* Disallow writable mappings to the consumer pointer,
+			 * and allow writable mappings to both the producer
+			 * position, and the ring buffer data itself.
+			 */
+			return -EPERM;
+	} else {
+		vma->vm_flags &= ~VM_MAYWRITE;
+	}
+	/* remap_vmalloc_range() checks size and offset constraints */
+	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
 static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
 {
 	unsigned long cons_pos, prod_pos;
@@ -269,7 +306,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = ringbuf_map_alloc,
 	.map_free = ringbuf_map_free,
-	.map_mmap = ringbuf_map_mmap,
+	.map_mmap = ringbuf_map_mmap_kern,
 	.map_poll = ringbuf_map_poll,
 	.map_lookup_elem = ringbuf_map_lookup_elem,
 	.map_update_elem = ringbuf_map_update_elem,
@@ -278,6 +315,19 @@ const struct bpf_map_ops ringbuf_map_ops = {
 	.map_btf_id = &ringbuf_map_btf_ids[0],
 };
 
+BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
+const struct bpf_map_ops user_ringbuf_map_ops = {
+	.map_meta_equal = bpf_map_meta_equal,
+	.map_alloc = ringbuf_map_alloc,
+	.map_free = ringbuf_map_free,
+	.map_mmap = ringbuf_map_mmap_user,
+	.map_lookup_elem = ringbuf_map_lookup_elem,
+	.map_update_elem = ringbuf_map_update_elem,
+	.map_delete_elem = ringbuf_map_delete_elem,
+	.map_get_next_key = ringbuf_map_get_next_key,
+	.map_btf_id = &user_ringbuf_map_btf_ids[0],
+};
+
 /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
  * calculate offset from record metadata to ring buffer in pages, rounded
  * down. This page offset is stored as part of record metadata and allows to
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8c6fbcd0afaf..83710b60e708 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6240,6 +6240,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		    func_id != BPF_FUNC_ringbuf_discard_dynptr)
 			goto error;
 		break;
+	case BPF_MAP_TYPE_USER_RINGBUF:
+		goto error;
 	case BPF_MAP_TYPE_STACK_TRACE:
 		if (func_id != BPF_FUNC_get_stackid)
 			goto error;
@@ -12635,6 +12637,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 		case BPF_MAP_TYPE_HASH_OF_MAPS:
 		case BPF_MAP_TYPE_RINGBUF:
+		case BPF_MAP_TYPE_USER_RINGBUF:
 		case BPF_MAP_TYPE_INODE_STORAGE:
 		case BPF_MAP_TYPE_SK_STORAGE:
 		case BPF_MAP_TYPE_TASK_STORAGE:
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 7c188a598444..7f3b67a8b48f 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -55,7 +55,7 @@ MAP COMMANDS
 |		| **devmap** | **devmap_hash** | **sockmap** | **cpumap** | **xskmap** | **sockhash**
 |		| **cgroup_storage** | **reuseport_sockarray** | **percpu_cgroup_storage**
 |		| **queue** | **stack** | **sk_storage** | **struct_ops** | **ringbuf** | **inode_storage**
-|		| **task_storage** | **bloom_filter** }
+|		| **task_storage** | **bloom_filter** | **user_ringbuf** }
 
 DESCRIPTION
 ===========
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 38b6bc9c26c3..9a6ca9f31133 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -1459,7 +1459,7 @@ static int do_help(int argc, char **argv)
 		"                 devmap | devmap_hash | sockmap | cpumap | xskmap | sockhash |\n"
 		"                 cgroup_storage | reuseport_sockarray | percpu_cgroup_storage |\n"
 		"                 queue | stack | sk_storage | struct_ops | ringbuf | inode_storage |\n"
-		"                 task_storage | bloom_filter }\n"
+		"                 task_storage | bloom_filter | user_ringbuf }\n"
 		"       " HELP_SPEC_OPTIONS " |\n"
 		"                    {-f|--bpffs} | {-n|--nomount} }\n"
 		"",
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3df78c56c1bf..e18c85324db6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -928,6 +928,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_INODE_STORAGE,
 	BPF_MAP_TYPE_TASK_STORAGE,
 	BPF_MAP_TYPE_BLOOM_FILTER,
+	BPF_MAP_TYPE_USER_RINGBUF,
 };
 
 /* Note that tracing related programs such as
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2ca30ccc774c..d480da05b6de 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -163,6 +163,7 @@ static const char * const map_type_name[] = {
 	[BPF_MAP_TYPE_INODE_STORAGE]		= "inode_storage",
 	[BPF_MAP_TYPE_TASK_STORAGE]		= "task_storage",
 	[BPF_MAP_TYPE_BLOOM_FILTER]		= "bloom_filter",
+	[BPF_MAP_TYPE_USER_RINGBUF]             = "user_ringbuf",
 };
 
 static const char * const prog_type_name[] = {
-- 
cgit v1.2.3


From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 19 Sep 2022 19:00:58 -0500
Subject: bpf: Add bpf_user_ringbuf_drain() helper

In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which
will allow user-space applications to publish messages to a ring buffer
that is consumed by a BPF program in kernel-space. In order for this
map-type to be useful, it will require a BPF helper function that BPF
programs can invoke to drain samples from the ring buffer, and invoke
callbacks on those samples. This change adds that capability via a new BPF
helper function:

bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx,
                       u64 flags)

BPF programs may invoke this function to run callback_fn() on a series of
samples in the ring buffer. callback_fn() has the following signature:

long callback_fn(struct bpf_dynptr *dynptr, void *context);

Samples are provided to the callback in the form of struct bpf_dynptr *'s,
which the program can read using BPF helper functions for querying
struct bpf_dynptr's.

In order to support bpf_ringbuf_drain(), a new PTR_TO_DYNPTR register
type is added to the verifier to reflect a dynptr that was allocated by
a helper function and passed to a BPF program. Unlike PTR_TO_STACK
dynptrs which are allocated on the stack by a BPF program, PTR_TO_DYNPTR
dynptrs need not use reference tracking, as the BPF helper is trusted to
properly free the dynptr before returning. The verifier currently only
supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL.

Note that while the corresponding user-space libbpf logic will be added
in a subsequent patch, this patch does contain an implementation of the
.map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This
.map_poll() callback guarantees that an epoll-waiting user-space
producer will receive at least one event notification whenever at least
one sample is drained in an invocation of bpf_user_ringbuf_drain(),
provided that the function is not invoked with the BPF_RB_NO_WAKEUP
flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup
notification is sent even if no sample was drained.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com
---
 include/linux/bpf.h            |  11 ++-
 include/uapi/linux/bpf.h       |  38 +++++++++
 kernel/bpf/helpers.c           |   2 +
 kernel/bpf/ringbuf.c           | 181 +++++++++++++++++++++++++++++++++++++++--
 kernel/bpf/verifier.c          |  61 +++++++++++++-
 tools/include/uapi/linux/bpf.h |  38 +++++++++
 6 files changed, 320 insertions(+), 11 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e0dbe0c0a17e..33e543b86e1a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -451,7 +451,7 @@ enum bpf_type_flag {
 	/* DYNPTR points to memory local to the bpf program. */
 	DYNPTR_TYPE_LOCAL	= BIT(8 + BPF_BASE_TYPE_BITS),
 
-	/* DYNPTR points to a ringbuf record. */
+	/* DYNPTR points to a kernel-produced ringbuf record. */
 	DYNPTR_TYPE_RINGBUF	= BIT(9 + BPF_BASE_TYPE_BITS),
 
 	/* Size is known at compile time. */
@@ -656,6 +656,7 @@ enum bpf_reg_type {
 	PTR_TO_MEM,		 /* reg points to valid memory region */
 	PTR_TO_BUF,		 /* reg points to a read/write buffer */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
+	PTR_TO_DYNPTR,		 /* reg points to a dynptr */
 	__BPF_REG_TYPE_MAX,
 
 	/* Extended reg_types. */
@@ -1394,6 +1395,11 @@ struct bpf_array {
 #define BPF_MAP_CAN_READ	BIT(0)
 #define BPF_MAP_CAN_WRITE	BIT(1)
 
+/* Maximum number of user-producer ring buffer samples that can be drained in
+ * a call to bpf_user_ringbuf_drain().
+ */
+#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024)
+
 static inline u32 bpf_map_flags_to_cap(struct bpf_map *map)
 {
 	u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
@@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto;
 extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
 extern const struct bpf_func_proto bpf_set_retval_proto;
 extern const struct bpf_func_proto bpf_get_retval_proto;
+extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
   enum bpf_func_id func_id, const struct bpf_prog *prog);
@@ -2639,7 +2646,7 @@ enum bpf_dynptr_type {
 	BPF_DYNPTR_TYPE_INVALID,
 	/* Points to memory that is local to the bpf program */
 	BPF_DYNPTR_TYPE_LOCAL,
-	/* Underlying data is a ringbuf record */
+	/* Underlying data is a kernel-produced ringbuf record */
 	BPF_DYNPTR_TYPE_RINGBUF,
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e18c85324db6..ead35f39f185 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5388,6 +5388,43 @@ union bpf_attr {
  *	Return
  *		Current *ktime*.
  *
+ * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags)
+ *	Description
+ *		Drain samples from the specified user ring buffer, and invoke
+ *		the provided callback for each such sample:
+ *
+ *		long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ *
+ *		If **callback_fn** returns 0, the helper will continue to try
+ *		and drain the next sample, up to a maximum of
+ *		BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1,
+ *		the helper will skip the rest of the samples and return. Other
+ *		return values are not used now, and will be rejected by the
+ *		verifier.
+ *	Return
+ *		The number of drained samples if no error was encountered while
+ *		draining samples, or 0 if no samples were present in the ring
+ *		buffer. If a user-space producer was epoll-waiting on this map,
+ *		and at least one sample was drained, they will receive an event
+ *		notification notifying them of available space in the ring
+ *		buffer. If the BPF_RB_NO_WAKEUP flag is passed to this
+ *		function, no wakeup notification will be sent. If the
+ *		BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will
+ *		be sent even if no sample was drained.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EBUSY** if the ring buffer is contended, and another calling
+ *		context was concurrently draining the ring buffer.
+ *
+ *		**-EINVAL** if user-space is not properly tracking the ring
+ *		buffer due to the producer position not being aligned to 8
+ *		bytes, a sample not being aligned to 8 bytes, or the producer
+ *		position not matching the advertised length of a sample.
+ *
+ *		**-E2BIG** if user-space has tried to publish a sample which is
+ *		larger than the size of the ring buffer, or which cannot fit
+ *		within a struct bpf_dynptr.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5599,6 +5636,7 @@ union bpf_attr {
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
 	FN(ktime_get_tai_ns),		\
+	FN(user_ringbuf_drain),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 41aeaf3862ec..cb5564c77482 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1659,6 +1659,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_for_each_map_elem_proto;
 	case BPF_FUNC_loop:
 		return &bpf_loop_proto;
+	case BPF_FUNC_user_ringbuf_drain:
+		return &bpf_user_ringbuf_drain_proto;
 	default:
 		break;
 	}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 754e915748fb..9e832acf4692 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -38,6 +38,22 @@ struct bpf_ringbuf {
 	struct page **pages;
 	int nr_pages;
 	spinlock_t spinlock ____cacheline_aligned_in_smp;
+	/* For user-space producer ring buffers, an atomic_t busy bit is used
+	 * to synchronize access to the ring buffers in the kernel, rather than
+	 * the spinlock that is used for kernel-producer ring buffers. This is
+	 * done because the ring buffer must hold a lock across a BPF program's
+	 * callback:
+	 *
+	 *    __bpf_user_ringbuf_peek() // lock acquired
+	 * -> program callback_fn()
+	 * -> __bpf_user_ringbuf_sample_release() // lock released
+	 *
+	 * It is unsafe and incorrect to hold an IRQ spinlock across what could
+	 * be a long execution window, so we instead simply disallow concurrent
+	 * access to the ring buffer by kernel consumers, and return -EBUSY from
+	 * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
+	 */
+	atomic_t busy ____cacheline_aligned_in_smp;
 	/* Consumer and producer counters are put into separate pages to
 	 * allow each position to be mapped with different permissions.
 	 * This prevents a user-space application from modifying the
@@ -153,6 +169,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
 		return NULL;
 
 	spin_lock_init(&rb->spinlock);
+	atomic_set(&rb->busy, 0);
 	init_waitqueue_head(&rb->waitq);
 	init_irq_work(&rb->work, bpf_ringbuf_notify);
 
@@ -288,8 +305,13 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
 	return prod_pos - cons_pos;
 }
 
-static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
-				 struct poll_table_struct *pts)
+static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
+{
+	return rb->mask + 1;
+}
+
+static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
+				      struct poll_table_struct *pts)
 {
 	struct bpf_ringbuf_map *rb_map;
 
@@ -301,13 +323,26 @@ static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
 	return 0;
 }
 
+static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
+				      struct poll_table_struct *pts)
+{
+	struct bpf_ringbuf_map *rb_map;
+
+	rb_map = container_of(map, struct bpf_ringbuf_map, map);
+	poll_wait(filp, &rb_map->rb->waitq, pts);
+
+	if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
+		return EPOLLOUT | EPOLLWRNORM;
+	return 0;
+}
+
 BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
 const struct bpf_map_ops ringbuf_map_ops = {
 	.map_meta_equal = bpf_map_meta_equal,
 	.map_alloc = ringbuf_map_alloc,
 	.map_free = ringbuf_map_free,
 	.map_mmap = ringbuf_map_mmap_kern,
-	.map_poll = ringbuf_map_poll,
+	.map_poll = ringbuf_map_poll_kern,
 	.map_lookup_elem = ringbuf_map_lookup_elem,
 	.map_update_elem = ringbuf_map_update_elem,
 	.map_delete_elem = ringbuf_map_delete_elem,
@@ -321,6 +356,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = {
 	.map_alloc = ringbuf_map_alloc,
 	.map_free = ringbuf_map_free,
 	.map_mmap = ringbuf_map_mmap_user,
+	.map_poll = ringbuf_map_poll_user,
 	.map_lookup_elem = ringbuf_map_lookup_elem,
 	.map_update_elem = ringbuf_map_update_elem,
 	.map_delete_elem = ringbuf_map_delete_elem,
@@ -362,7 +398,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
 		return NULL;
 
 	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
-	if (len > rb->mask + 1)
+	if (len > ringbuf_total_data_sz(rb))
 		return NULL;
 
 	cons_pos = smp_load_acquire(&rb->consumer_pos);
@@ -509,7 +545,7 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
 	case BPF_RB_AVAIL_DATA:
 		return ringbuf_avail_data_sz(rb);
 	case BPF_RB_RING_SIZE:
-		return rb->mask + 1;
+		return ringbuf_total_data_sz(rb);
 	case BPF_RB_CONS_POS:
 		return smp_load_acquire(&rb->consumer_pos);
 	case BPF_RB_PROD_POS:
@@ -603,3 +639,138 @@ const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
 	.arg1_type	= ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
 	.arg2_type	= ARG_ANYTHING,
 };
+
+static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
+{
+	int err;
+	u32 hdr_len, sample_len, total_len, flags, *hdr;
+	u64 cons_pos, prod_pos;
+
+	/* Synchronizes with smp_store_release() in user-space producer. */
+	prod_pos = smp_load_acquire(&rb->producer_pos);
+	if (prod_pos % 8)
+		return -EINVAL;
+
+	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
+	cons_pos = smp_load_acquire(&rb->consumer_pos);
+	if (cons_pos >= prod_pos)
+		return -ENODATA;
+
+	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
+	/* Synchronizes with smp_store_release() in user-space producer. */
+	hdr_len = smp_load_acquire(hdr);
+	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
+	sample_len = hdr_len & ~flags;
+	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);
+
+	/* The sample must fit within the region advertised by the producer position. */
+	if (total_len > prod_pos - cons_pos)
+		return -EINVAL;
+
+	/* The sample must fit within the data region of the ring buffer. */
+	if (total_len > ringbuf_total_data_sz(rb))
+		return -E2BIG;
+
+	/* The sample must fit into a struct bpf_dynptr. */
+	err = bpf_dynptr_check_size(sample_len);
+	if (err)
+		return -E2BIG;
+
+	if (flags & BPF_RINGBUF_DISCARD_BIT) {
+		/* If the discard bit is set, the sample should be skipped.
+		 *
+		 * Update the consumer pos, and return -EAGAIN so the caller
+		 * knows to skip this sample and try to read the next one.
+		 */
+		smp_store_release(&rb->consumer_pos, cons_pos + total_len);
+		return -EAGAIN;
+	}
+
+	if (flags & BPF_RINGBUF_BUSY_BIT)
+		return -ENODATA;
+
+	*sample = (void *)((uintptr_t)rb->data +
+			   (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
+	*size = sample_len;
+	return 0;
+}
+
+static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
+{
+	u64 consumer_pos;
+	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+
+	/* Using smp_load_acquire() is unnecessary here, as the busy-bit
+	 * prevents another task from writing to consumer_pos after it was read
+	 * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
+	 */
+	consumer_pos = rb->consumer_pos;
+	 /* Synchronizes with smp_load_acquire() in user-space producer. */
+	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
+}
+
+BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
+	   void *, callback_fn, void *, callback_ctx, u64, flags)
+{
+	struct bpf_ringbuf *rb;
+	long samples, discarded_samples = 0, ret = 0;
+	bpf_callback_t callback = (bpf_callback_t)callback_fn;
+	u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
+	int busy = 0;
+
+	if (unlikely(flags & ~wakeup_flags))
+		return -EINVAL;
+
+	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+	/* If another consumer is already consuming a sample, wait for them to finish. */
+	if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
+		return -EBUSY;
+
+	for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
+		int err;
+		u32 size;
+		void *sample;
+		struct bpf_dynptr_kern dynptr;
+
+		err = __bpf_user_ringbuf_peek(rb, &sample, &size);
+		if (err) {
+			if (err == -ENODATA) {
+				break;
+			} else if (err == -EAGAIN) {
+				discarded_samples++;
+				continue;
+			} else {
+				ret = err;
+				goto schedule_work_return;
+			}
+		}
+
+		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
+		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
+		__bpf_user_ringbuf_sample_release(rb, size, flags);
+	}
+	ret = samples - discarded_samples;
+
+schedule_work_return:
+	/* Prevent the clearing of the busy-bit from being reordered before the
+	 * storing of any rb consumer or producer positions.
+	 */
+	smp_mb__before_atomic();
+	atomic_set(&rb->busy, 0);
+
+	if (flags & BPF_RB_FORCE_WAKEUP)
+		irq_work_queue(&rb->work);
+	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
+		irq_work_queue(&rb->work);
+	return ret;
+}
+
+const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
+	.func		= bpf_user_ringbuf_drain,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_PTR_TO_FUNC,
+	.arg3_type	= ARG_PTR_TO_STACK_OR_NULL,
+	.arg4_type	= ARG_ANYTHING,
+};
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 83710b60e708..c76fa45a5906 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -563,6 +563,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
 		[PTR_TO_BUF]		= "buf",
 		[PTR_TO_FUNC]		= "func",
 		[PTR_TO_MAP_KEY]	= "map_key",
+		[PTR_TO_DYNPTR]		= "dynptr_ptr",
 	};
 
 	if (type & PTR_MAYBE_NULL) {
@@ -5688,6 +5689,12 @@ static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK }
 static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
 static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
+static const struct bpf_reg_types dynptr_types = {
+	.types = {
+		PTR_TO_STACK,
+		PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL,
+	}
+};
 
 static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_MAP_KEY]		= &map_key_value_types,
@@ -5714,7 +5721,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
 	[ARG_PTR_TO_CONST_STR]		= &const_str_ptr_types,
 	[ARG_PTR_TO_TIMER]		= &timer_types,
 	[ARG_PTR_TO_KPTR]		= &kptr_types,
-	[ARG_PTR_TO_DYNPTR]		= &stack_ptr_types,
+	[ARG_PTR_TO_DYNPTR]		= &dynptr_types,
 };
 
 static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@ -6066,6 +6073,13 @@ skip_type_check:
 		err = check_mem_size_reg(env, reg, regno, true, meta);
 		break;
 	case ARG_PTR_TO_DYNPTR:
+		/* We only need to check for initialized / uninitialized helper
+		 * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the
+		 * assumption is that if it is, that a helper function
+		 * initialized the dynptr on behalf of the BPF program.
+		 */
+		if (base_type(reg->type) == PTR_TO_DYNPTR)
+			break;
 		if (arg_type & MEM_UNINIT) {
 			if (!is_dynptr_reg_valid_uninit(env, reg)) {
 				verbose(env, "Dynptr has to be an uninitialized dynptr\n");
@@ -6241,7 +6255,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_USER_RINGBUF:
-		goto error;
+		if (func_id != BPF_FUNC_user_ringbuf_drain)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
 		if (func_id != BPF_FUNC_get_stackid)
 			goto error;
@@ -6361,6 +6377,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (map->map_type != BPF_MAP_TYPE_RINGBUF)
 			goto error;
 		break;
+	case BPF_FUNC_user_ringbuf_drain:
+		if (map->map_type != BPF_MAP_TYPE_USER_RINGBUF)
+			goto error;
+		break;
 	case BPF_FUNC_get_stackid:
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
@@ -6887,6 +6907,29 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env,
 	return 0;
 }
 
+static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
+					   struct bpf_func_state *caller,
+					   struct bpf_func_state *callee,
+					   int insn_idx)
+{
+	/* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
+	 *			  callback_ctx, u64 flags);
+	 * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx);
+	 */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
+	callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL;
+	__mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+	callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+	/* unused */
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+	__mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+	callee->in_callback_fn = true;
+	return 0;
+}
+
 static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 {
 	struct bpf_verifier_state *state = env->cur_state;
@@ -7346,12 +7389,18 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 	case BPF_FUNC_dynptr_data:
 		for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
 			if (arg_type_is_dynptr(fn->arg_type[i])) {
+				struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
+
 				if (meta.ref_obj_id) {
 					verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
 					return -EFAULT;
 				}
-				/* Find the id of the dynptr we're tracking the reference of */
-				meta.ref_obj_id = stack_slot_get_id(env, &regs[BPF_REG_1 + i]);
+
+				if (base_type(reg->type) != PTR_TO_DYNPTR)
+					/* Find the id of the dynptr we're
+					 * tracking the reference of
+					 */
+					meta.ref_obj_id = stack_slot_get_id(env, reg);
 				break;
 			}
 		}
@@ -7360,6 +7409,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			return -EFAULT;
 		}
 		break;
+	case BPF_FUNC_user_ringbuf_drain:
+		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+					set_user_ringbuf_callback_state);
+		break;
 	}
 
 	if (err)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e18c85324db6..ead35f39f185 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5388,6 +5388,43 @@ union bpf_attr {
  *	Return
  *		Current *ktime*.
  *
+ * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags)
+ *	Description
+ *		Drain samples from the specified user ring buffer, and invoke
+ *		the provided callback for each such sample:
+ *
+ *		long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx);
+ *
+ *		If **callback_fn** returns 0, the helper will continue to try
+ *		and drain the next sample, up to a maximum of
+ *		BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1,
+ *		the helper will skip the rest of the samples and return. Other
+ *		return values are not used now, and will be rejected by the
+ *		verifier.
+ *	Return
+ *		The number of drained samples if no error was encountered while
+ *		draining samples, or 0 if no samples were present in the ring
+ *		buffer. If a user-space producer was epoll-waiting on this map,
+ *		and at least one sample was drained, they will receive an event
+ *		notification notifying them of available space in the ring
+ *		buffer. If the BPF_RB_NO_WAKEUP flag is passed to this
+ *		function, no wakeup notification will be sent. If the
+ *		BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will
+ *		be sent even if no sample was drained.
+ *
+ *		On failure, the returned value is one of the following:
+ *
+ *		**-EBUSY** if the ring buffer is contended, and another calling
+ *		context was concurrently draining the ring buffer.
+ *
+ *		**-EINVAL** if user-space is not properly tracking the ring
+ *		buffer due to the producer position not being aligned to 8
+ *		bytes, a sample not being aligned to 8 bytes, or the producer
+ *		position not matching the advertised length of a sample.
+ *
+ *		**-E2BIG** if user-space has tried to publish a sample which is
+ *		larger than the size of the ring buffer, or which cannot fit
+ *		within a struct bpf_dynptr.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5599,6 +5636,7 @@ union bpf_attr {
 	FN(tcp_raw_check_syncookie_ipv4),	\
 	FN(tcp_raw_check_syncookie_ipv6),	\
 	FN(ktime_get_tai_ns),		\
+	FN(user_ringbuf_drain),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
cgit v1.2.3


From b66ccae01f1ddce47fe2c7f393a3a5c5ab3d7f06 Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 19 Sep 2022 19:00:59 -0500
Subject: bpf: Add libbpf logic for user-space ring buffer

Now that all of the logic is in place in the kernel to support user-space
produced ring buffers, we can add the user-space logic to libbpf. This
patch therefore adds the following public symbols to libbpf:

struct user_ring_buffer *
user_ring_buffer__new(int map_fd,
		      const struct user_ring_buffer_opts *opts);
void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size);
void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
                                         __u32 size, int timeout_ms);
void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample);
void user_ring_buffer__discard(struct user_ring_buffer *rb,
void user_ring_buffer__free(struct user_ring_buffer *rb);

A user-space producer must first create a struct user_ring_buffer * object
with user_ring_buffer__new(), and can then reserve samples in the
ring buffer using one of the following two symbols:

void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size);
void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
                                         __u32 size, int timeout_ms);

With user_ring_buffer__reserve(), a pointer to a 'size' region of the ring
buffer will be returned if sufficient space is available in the buffer.
user_ring_buffer__reserve_blocking() provides similar semantics, but will
block for up to 'timeout_ms' in epoll_wait if there is insufficient space
in the buffer. This function has the guarantee from the kernel that it will
receive at least one event-notification per invocation to
bpf_ringbuf_drain(), provided that at least one sample is drained, and the
BPF program did not pass the BPF_RB_NO_WAKEUP flag to bpf_ringbuf_drain().

Once a sample is reserved, it must either be committed to the ring buffer
with user_ring_buffer__submit(), or discarded with
user_ring_buffer__discard().

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220920000100.477320-4-void@manifault.com
---
 tools/lib/bpf/libbpf.c         |  10 +-
 tools/lib/bpf/libbpf.h         | 107 ++++++++++++++++
 tools/lib/bpf/libbpf.map       |  10 ++
 tools/lib/bpf/libbpf_probes.c  |   1 +
 tools/lib/bpf/libbpf_version.h |   2 +-
 tools/lib/bpf/ringbuf.c        | 271 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 398 insertions(+), 3 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index d480da05b6de..67bc18506150 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2373,6 +2373,12 @@ static size_t adjust_ringbuf_sz(size_t sz)
 	return sz;
 }
 
+static bool map_is_ringbuf(const struct bpf_map *map)
+{
+	return map->def.type == BPF_MAP_TYPE_RINGBUF ||
+	       map->def.type == BPF_MAP_TYPE_USER_RINGBUF;
+}
+
 static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def)
 {
 	map->def.type = def->map_type;
@@ -2387,7 +2393,7 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def
 	map->btf_value_type_id = def->value_type_id;
 
 	/* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */
-	if (map->def.type == BPF_MAP_TYPE_RINGBUF)
+	if (map_is_ringbuf(map))
 		map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries);
 
 	if (def->parts & MAP_DEF_MAP_TYPE)
@@ -4370,7 +4376,7 @@ int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries)
 	map->def.max_entries = max_entries;
 
 	/* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */
-	if (map->def.type == BPF_MAP_TYPE_RINGBUF)
+	if (map_is_ringbuf(map))
 		map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries);
 
 	return 0;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 88a1ac34b12a..e2d8c17f2e85 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -1011,6 +1011,7 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook,
 
 /* Ring buffer APIs */
 struct ring_buffer;
+struct user_ring_buffer;
 
 typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size);
 
@@ -1030,6 +1031,112 @@ LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms);
 LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb);
 LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb);
 
+struct user_ring_buffer_opts {
+	size_t sz; /* size of this struct, for forward/backward compatibility */
+};
+
+#define user_ring_buffer_opts__last_field sz
+
+/* @brief **user_ring_buffer__new()** creates a new instance of a user ring
+ * buffer.
+ *
+ * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map.
+ * @param opts Options for how the ring buffer should be created.
+ * @return A user ring buffer on success; NULL and errno being set on a
+ * failure.
+ */
+LIBBPF_API struct user_ring_buffer *
+user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts);
+
+/* @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the
+ * user ring buffer.
+ * @param rb A pointer to a user ring buffer.
+ * @param size The size of the sample, in bytes.
+ * @return A pointer to an 8-byte aligned reserved region of the user ring
+ * buffer; NULL, and errno being set if a sample could not be reserved.
+ *
+ * This function is *not* thread safe, and callers must synchronize accessing
+ * this function if there are multiple producers.  If a size is requested that
+ * is larger than the size of the entire ring buffer, errno will be set to
+ * E2BIG and NULL is returned. If the ring buffer could accommodate the size,
+ * but currently does not have enough space, errno is set to ENOSPC and NULL is
+ * returned.
+ *
+ * After initializing the sample, callers must invoke
+ * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise,
+ * the sample must be freed with **user_ring_buffer__discard()**.
+ */
+LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size);
+
+/* @brief **user_ring_buffer__reserve_blocking()** reserves a record in the
+ * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes
+ * available.
+ * @param rb The user ring buffer.
+ * @param size The size of the sample, in bytes.
+ * @param timeout_ms The amount of time, in milliseconds, for which the caller
+ * should block when waiting for a sample. -1 causes the caller to block
+ * indefinitely.
+ * @return A pointer to an 8-byte aligned reserved region of the user ring
+ * buffer; NULL, and errno being set if a sample could not be reserved.
+ *
+ * This function is *not* thread safe, and callers must synchronize
+ * accessing this function if there are multiple producers
+ *
+ * If **timeout_ms** is -1, the function will block indefinitely until a sample
+ * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno
+ * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking
+ * will occur and the function will return immediately after attempting to
+ * reserve a sample.
+ *
+ * If **size** is larger than the size of the entire ring buffer, errno is set
+ * to E2BIG and NULL is returned. If the ring buffer could accommodate
+ * **size**, but currently does not have enough space, the caller will block
+ * until at most **timeout_ms** has elapsed. If insufficient space is available
+ * at that time, errno is set to ENOSPC, and NULL is returned.
+ *
+ * The kernel guarantees that it will wake up this thread to check if
+ * sufficient space is available in the ring buffer at least once per
+ * invocation of the **bpf_ringbuf_drain()** helper function, provided that at
+ * least one sample is consumed, and the BPF program did not invoke the
+ * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the
+ * kernel does not guarantee this. If the helper function is invoked with
+ * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is
+ * consumed.
+ *
+ * When a sample of size **size** is found within **timeout_ms**, a pointer to
+ * the sample is returned. After initializing the sample, callers must invoke
+ * **user_ring_buffer__submit()** to post the sample to the ring buffer.
+ * Otherwise, the sample must be freed with **user_ring_buffer__discard()**.
+ */
+LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb,
+						    __u32 size,
+						    int timeout_ms);
+
+/* @brief **user_ring_buffer__submit()** submits a previously reserved sample
+ * into the ring buffer.
+ * @param rb The user ring buffer.
+ * @param sample A reserved sample.
+ *
+ * It is not necessary to synchronize amongst multiple producers when invoking
+ * this function.
+ */
+LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample);
+
+/* @brief **user_ring_buffer__discard()** discards a previously reserved sample.
+ * @param rb The user ring buffer.
+ * @param sample A reserved sample.
+ *
+ * It is not necessary to synchronize amongst multiple producers when invoking
+ * this function.
+ */
+LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample);
+
+/* @brief **user_ring_buffer__free()** frees a ring buffer that was previously
+ * created with **user_ring_buffer__new()**.
+ * @param rb The user ring buffer being freed.
+ */
+LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb);
+
 /* Perf buffer APIs */
 struct perf_buffer;
 
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 2b928dc21af0..c1d6aa7c82b6 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -368,3 +368,13 @@ LIBBPF_1.0.0 {
 		libbpf_bpf_prog_type_str;
 		perf_buffer__buffer;
 };
+
+LIBBPF_1.1.0 {
+	global:
+		user_ring_buffer__discard;
+		user_ring_buffer__free;
+		user_ring_buffer__new;
+		user_ring_buffer__reserve;
+		user_ring_buffer__reserve_blocking;
+		user_ring_buffer__submit;
+} LIBBPF_1.0.0;
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 6d495656f554..f3a8e8e74eb8 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -231,6 +231,7 @@ static int probe_map_create(enum bpf_map_type map_type)
 			return btf_fd;
 		break;
 	case BPF_MAP_TYPE_RINGBUF:
+	case BPF_MAP_TYPE_USER_RINGBUF:
 		key_size = 0;
 		value_size = 0;
 		max_entries = 4096;
diff --git a/tools/lib/bpf/libbpf_version.h b/tools/lib/bpf/libbpf_version.h
index 2fb2f4290080..e944f5bce728 100644
--- a/tools/lib/bpf/libbpf_version.h
+++ b/tools/lib/bpf/libbpf_version.h
@@ -4,6 +4,6 @@
 #define __LIBBPF_VERSION_H
 
 #define LIBBPF_MAJOR_VERSION 1
-#define LIBBPF_MINOR_VERSION 0
+#define LIBBPF_MINOR_VERSION 1
 
 #endif /* __LIBBPF_VERSION_H */
diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 8bc117bcc7bc..d285171d4b69 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -16,6 +16,7 @@
 #include <asm/barrier.h>
 #include <sys/mman.h>
 #include <sys/epoll.h>
+#include <time.h>
 
 #include "libbpf.h"
 #include "libbpf_internal.h"
@@ -39,6 +40,23 @@ struct ring_buffer {
 	int ring_cnt;
 };
 
+struct user_ring_buffer {
+	struct epoll_event event;
+	unsigned long *consumer_pos;
+	unsigned long *producer_pos;
+	void *data;
+	unsigned long mask;
+	size_t page_size;
+	int map_fd;
+	int epoll_fd;
+};
+
+/* 8-byte ring buffer header structure */
+struct ringbuf_hdr {
+	__u32 len;
+	__u32 pad;
+};
+
 static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r)
 {
 	if (r->consumer_pos) {
@@ -300,3 +318,256 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb)
 {
 	return rb->epoll_fd;
 }
+
+static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb)
+{
+	if (rb->consumer_pos) {
+		munmap(rb->consumer_pos, rb->page_size);
+		rb->consumer_pos = NULL;
+	}
+	if (rb->producer_pos) {
+		munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1));
+		rb->producer_pos = NULL;
+	}
+}
+
+void user_ring_buffer__free(struct user_ring_buffer *rb)
+{
+	if (!rb)
+		return;
+
+	user_ringbuf_unmap_ring(rb);
+
+	if (rb->epoll_fd >= 0)
+		close(rb->epoll_fd);
+
+	free(rb);
+}
+
+static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd)
+{
+	struct bpf_map_info info;
+	__u32 len = sizeof(info);
+	void *tmp;
+	struct epoll_event *rb_epoll;
+	int err;
+
+	memset(&info, 0, sizeof(info));
+
+	err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
+	if (err) {
+		err = -errno;
+		pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err);
+		return err;
+	}
+
+	if (info.type != BPF_MAP_TYPE_USER_RINGBUF) {
+		pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd);
+		return -EINVAL;
+	}
+
+	rb->map_fd = map_fd;
+	rb->mask = info.max_entries - 1;
+
+	/* Map read-only consumer page */
+	tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0);
+	if (tmp == MAP_FAILED) {
+		err = -errno;
+		pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
+			map_fd, err);
+		return err;
+	}
+	rb->consumer_pos = tmp;
+
+	/* Map read-write the producer page and data pages. We map the data
+	 * region as twice the total size of the ring buffer to allow the
+	 * simple reading and writing of samples that wrap around the end of
+	 * the buffer.  See the kernel implementation for details.
+	 */
+	tmp = mmap(NULL, rb->page_size + 2 * info.max_entries,
+		   PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, rb->page_size);
+	if (tmp == MAP_FAILED) {
+		err = -errno;
+		pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n",
+			map_fd, err);
+		return err;
+	}
+
+	rb->producer_pos = tmp;
+	rb->data = tmp + rb->page_size;
+
+	rb_epoll = &rb->event;
+	rb_epoll->events = EPOLLOUT;
+	if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) {
+		err = -errno;
+		pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err);
+		return err;
+	}
+
+	return 0;
+}
+
+struct user_ring_buffer *
+user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts)
+{
+	struct user_ring_buffer *rb;
+	int err;
+
+	if (!OPTS_VALID(opts, user_ring_buffer_opts))
+		return errno = EINVAL, NULL;
+
+	rb = calloc(1, sizeof(*rb));
+	if (!rb)
+		return errno = ENOMEM, NULL;
+
+	rb->page_size = getpagesize();
+
+	rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+	if (rb->epoll_fd < 0) {
+		err = -errno;
+		pr_warn("user ringbuf: failed to create epoll instance: %d\n", err);
+		goto err_out;
+	}
+
+	err = user_ringbuf_map(rb, map_fd);
+	if (err)
+		goto err_out;
+
+	return rb;
+
+err_out:
+	user_ring_buffer__free(rb);
+	return errno = -err, NULL;
+}
+
+static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard)
+{
+	__u32 new_len;
+	struct ringbuf_hdr *hdr;
+	uintptr_t hdr_offset;
+
+	hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ;
+	hdr = rb->data + (hdr_offset & rb->mask);
+
+	new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT;
+	if (discard)
+		new_len |= BPF_RINGBUF_DISCARD_BIT;
+
+	/* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in
+	 * the kernel.
+	 */
+	__atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL);
+}
+
+void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample)
+{
+	user_ringbuf_commit(rb, sample, true);
+}
+
+void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample)
+{
+	user_ringbuf_commit(rb, sample, false);
+}
+
+void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size)
+{
+	__u32 avail_size, total_size, max_size;
+	/* 64-bit to avoid overflow in case of extreme application behavior */
+	__u64 cons_pos, prod_pos;
+	struct ringbuf_hdr *hdr;
+
+	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in
+	 * the kernel.
+	 */
+	cons_pos = smp_load_acquire(rb->consumer_pos);
+	/* Synchronizes with smp_store_release() in user_ringbuf_commit() */
+	prod_pos = smp_load_acquire(rb->producer_pos);
+
+	max_size = rb->mask + 1;
+	avail_size = max_size - (prod_pos - cons_pos);
+	/* Round up total size to a multiple of 8. */
+	total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8;
+
+	if (total_size > max_size)
+		return errno = E2BIG, NULL;
+
+	if (avail_size < total_size)
+		return errno = ENOSPC, NULL;
+
+	hdr = rb->data + (prod_pos & rb->mask);
+	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
+	hdr->pad = 0;
+
+	/* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in
+	 * the kernel.
+	 */
+	smp_store_release(rb->producer_pos, prod_pos + total_size);
+
+	return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask);
+}
+
+static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end)
+{
+	__u64 start_ns, end_ns, ns_per_s = 1000000000;
+
+	start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec;
+	end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec;
+
+	return end_ns - start_ns;
+}
+
+void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms)
+{
+	void *sample;
+	int err, ms_remaining = timeout_ms;
+	struct timespec start;
+
+	if (timeout_ms < 0 && timeout_ms != -1)
+		return errno = EINVAL, NULL;
+
+	if (timeout_ms != -1) {
+		err = clock_gettime(CLOCK_MONOTONIC, &start);
+		if (err)
+			return NULL;
+	}
+
+	do {
+		int cnt, ms_elapsed;
+		struct timespec curr;
+		__u64 ns_per_ms = 1000000;
+
+		sample = user_ring_buffer__reserve(rb, size);
+		if (sample)
+			return sample;
+		else if (errno != ENOSPC)
+			return NULL;
+
+		/* The kernel guarantees at least one event notification
+		 * delivery whenever at least one sample is drained from the
+		 * ring buffer in an invocation to bpf_ringbuf_drain(). Other
+		 * additional events may be delivered at any time, but only one
+		 * event is guaranteed per bpf_ringbuf_drain() invocation,
+		 * provided that a sample is drained, and the BPF program did
+		 * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If
+		 * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a
+		 * wakeup event will be delivered even if no samples are
+		 * drained.
+		 */
+		cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining);
+		if (cnt < 0)
+			return NULL;
+
+		if (timeout_ms == -1)
+			continue;
+
+		err = clock_gettime(CLOCK_MONOTONIC, &curr);
+		if (err)
+			return NULL;
+
+		ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms;
+		ms_remaining = timeout_ms - ms_elapsed;
+	} while (ms_remaining > 0);
+
+	/* Try one more time to reserve a sample after the specified timeout has elapsed. */
+	return user_ring_buffer__reserve(rb, size);
+}
-- 
cgit v1.2.3


From e5a9df51c74671cfe15af1d50e5f508bd3efddab Mon Sep 17 00:00:00 2001
From: David Vernet <void@manifault.com>
Date: Mon, 19 Sep 2022 19:01:00 -0500
Subject: selftests/bpf: Add selftests validating the user ringbuf

This change includes selftests that validate the expected behavior and
APIs of the new BPF_MAP_TYPE_USER_RINGBUF map type.

Signed-off-by: David Vernet <void@manifault.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220920000100.477320-5-void@manifault.com
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |   1 +
 .../selftests/bpf/prog_tests/user_ringbuf.c        | 754 +++++++++++++++++++++
 .../selftests/bpf/progs/test_user_ringbuf.h        |  35 +
 .../selftests/bpf/progs/user_ringbuf_fail.c        | 177 +++++
 .../selftests/bpf/progs/user_ringbuf_success.c     | 218 ++++++
 5 files changed, 1185 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_user_ringbuf.h
 create mode 100644 tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
 create mode 100644 tools/testing/selftests/bpf/progs/user_ringbuf_success.c

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 168c5b287b5c..981c2be922f4 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -71,3 +71,4 @@ cb_refs                                  # expected error message unexpected err
 cgroup_hierarchical_stats                # JIT does not support calling kernel function                                (kfunc)
 htab_update                              # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
 tracing_struct                           # failed to auto-attach: -524                                                 (trampoline)
+user_ringbuf                             # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3                (?)
diff --git a/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
new file mode 100644
index 000000000000..02b18d018b36
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/user_ringbuf.c
@@ -0,0 +1,754 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#define _GNU_SOURCE
+#include <linux/compiler.h>
+#include <linux/ring_buffer.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/sysinfo.h>
+#include <test_progs.h>
+#include <uapi/linux/bpf.h>
+#include <unistd.h>
+
+#include "user_ringbuf_fail.skel.h"
+#include "user_ringbuf_success.skel.h"
+
+#include "../progs/test_user_ringbuf.h"
+
+static size_t log_buf_sz = 1 << 20; /* 1 MB */
+static char obj_log_buf[1048576];
+static const long c_sample_size = sizeof(struct sample) + BPF_RINGBUF_HDR_SZ;
+static const long c_ringbuf_size = 1 << 12; /* 1 small page */
+static const long c_max_entries = c_ringbuf_size / c_sample_size;
+
+static void drain_current_samples(void)
+{
+	syscall(__NR_getpgid);
+}
+
+static int write_samples(struct user_ring_buffer *ringbuf, uint32_t num_samples)
+{
+	int i, err = 0;
+
+	/* Write some number of samples to the ring buffer. */
+	for (i = 0; i < num_samples; i++) {
+		struct sample *entry;
+		int read;
+
+		entry = user_ring_buffer__reserve(ringbuf, sizeof(*entry));
+		if (!entry) {
+			err = -errno;
+			goto done;
+		}
+
+		entry->pid = getpid();
+		entry->seq = i;
+		entry->value = i * i;
+
+		read = snprintf(entry->comm, sizeof(entry->comm), "%u", i);
+		if (read <= 0) {
+			/* Assert on the error path to avoid spamming logs with
+			 * mostly success messages.
+			 */
+			ASSERT_GT(read, 0, "snprintf_comm");
+			err = read;
+			user_ring_buffer__discard(ringbuf, entry);
+			goto done;
+		}
+
+		user_ring_buffer__submit(ringbuf, entry);
+	}
+
+done:
+	drain_current_samples();
+
+	return err;
+}
+
+static struct user_ringbuf_success *open_load_ringbuf_skel(void)
+{
+	struct user_ringbuf_success *skel;
+	int err;
+
+	skel = user_ringbuf_success__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return NULL;
+
+	err = bpf_map__set_max_entries(skel->maps.user_ringbuf, c_ringbuf_size);
+	if (!ASSERT_OK(err, "set_max_entries"))
+		goto cleanup;
+
+	err = bpf_map__set_max_entries(skel->maps.kernel_ringbuf, c_ringbuf_size);
+	if (!ASSERT_OK(err, "set_max_entries"))
+		goto cleanup;
+
+	err = user_ringbuf_success__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto cleanup;
+
+	return skel;
+
+cleanup:
+	user_ringbuf_success__destroy(skel);
+	return NULL;
+}
+
+static void test_user_ringbuf_mappings(void)
+{
+	int err, rb_fd;
+	int page_size = getpagesize();
+	void *mmap_ptr;
+	struct user_ringbuf_success *skel;
+
+	skel = open_load_ringbuf_skel();
+	if (!skel)
+		return;
+
+	rb_fd = bpf_map__fd(skel->maps.user_ringbuf);
+	/* cons_pos can be mapped R/O, can't add +X with mprotect. */
+	mmap_ptr = mmap(NULL, page_size, PROT_READ, MAP_SHARED, rb_fd, 0);
+	ASSERT_OK_PTR(mmap_ptr, "ro_cons_pos");
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_WRITE), "write_cons_pos_protect");
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_cons_pos_protect");
+	ASSERT_ERR_PTR(mremap(mmap_ptr, 0, 4 * page_size, MREMAP_MAYMOVE), "wr_prod_pos");
+	err = -errno;
+	ASSERT_ERR(err, "wr_prod_pos_err");
+	ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_ro_cons");
+
+	/* prod_pos can be mapped RW, can't add +X with mprotect. */
+	mmap_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			rb_fd, page_size);
+	ASSERT_OK_PTR(mmap_ptr, "rw_prod_pos");
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_prod_pos_protect");
+	err = -errno;
+	ASSERT_ERR(err, "wr_prod_pos_err");
+	ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw_prod");
+
+	/* data pages can be mapped RW, can't add +X with mprotect. */
+	mmap_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd,
+			2 * page_size);
+	ASSERT_OK_PTR(mmap_ptr, "rw_data");
+	ASSERT_ERR(mprotect(mmap_ptr, page_size, PROT_EXEC), "exec_data_protect");
+	err = -errno;
+	ASSERT_ERR(err, "exec_data_err");
+	ASSERT_OK(munmap(mmap_ptr, page_size), "unmap_rw_data");
+
+	user_ringbuf_success__destroy(skel);
+}
+
+static int load_skel_create_ringbufs(struct user_ringbuf_success **skel_out,
+				     struct ring_buffer **kern_ringbuf_out,
+				     ring_buffer_sample_fn callback,
+				     struct user_ring_buffer **user_ringbuf_out)
+{
+	struct user_ringbuf_success *skel;
+	struct ring_buffer *kern_ringbuf = NULL;
+	struct user_ring_buffer *user_ringbuf = NULL;
+	int err = -ENOMEM, rb_fd;
+
+	skel = open_load_ringbuf_skel();
+	if (!skel)
+		return err;
+
+	/* only trigger BPF program for current process */
+	skel->bss->pid = getpid();
+
+	if (kern_ringbuf_out) {
+		rb_fd = bpf_map__fd(skel->maps.kernel_ringbuf);
+		kern_ringbuf = ring_buffer__new(rb_fd, callback, skel, NULL);
+		if (!ASSERT_OK_PTR(kern_ringbuf, "kern_ringbuf_create"))
+			goto cleanup;
+
+		*kern_ringbuf_out = kern_ringbuf;
+	}
+
+	if (user_ringbuf_out) {
+		rb_fd = bpf_map__fd(skel->maps.user_ringbuf);
+		user_ringbuf = user_ring_buffer__new(rb_fd, NULL);
+		if (!ASSERT_OK_PTR(user_ringbuf, "user_ringbuf_create"))
+			goto cleanup;
+
+		*user_ringbuf_out = user_ringbuf;
+		ASSERT_EQ(skel->bss->read, 0, "no_reads_after_load");
+	}
+
+	err = user_ringbuf_success__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	*skel_out = skel;
+	return 0;
+
+cleanup:
+	if (kern_ringbuf_out)
+		*kern_ringbuf_out = NULL;
+	if (user_ringbuf_out)
+		*user_ringbuf_out = NULL;
+	ring_buffer__free(kern_ringbuf);
+	user_ring_buffer__free(user_ringbuf);
+	user_ringbuf_success__destroy(skel);
+	return err;
+}
+
+static int load_skel_create_user_ringbuf(struct user_ringbuf_success **skel_out,
+					 struct user_ring_buffer **ringbuf_out)
+{
+	return load_skel_create_ringbufs(skel_out, NULL, NULL, ringbuf_out);
+}
+
+static void manually_write_test_invalid_sample(struct user_ringbuf_success *skel,
+					       __u32 size, __u64 producer_pos, int err)
+{
+	void *data_ptr;
+	__u64 *producer_pos_ptr;
+	int rb_fd, page_size = getpagesize();
+
+	rb_fd = bpf_map__fd(skel->maps.user_ringbuf);
+
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_before_bad_sample");
+
+	/* Map the producer_pos as RW. */
+	producer_pos_ptr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+				MAP_SHARED, rb_fd, page_size);
+	ASSERT_OK_PTR(producer_pos_ptr, "producer_pos_ptr");
+
+	/* Map the data pages as RW. */
+	data_ptr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, rb_fd, 2 * page_size);
+	ASSERT_OK_PTR(data_ptr, "rw_data");
+
+	memset(data_ptr, 0, BPF_RINGBUF_HDR_SZ);
+	*(__u32 *)data_ptr = size;
+
+	/* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in the kernel. */
+	smp_store_release(producer_pos_ptr, producer_pos + BPF_RINGBUF_HDR_SZ);
+
+	drain_current_samples();
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_after_bad_sample");
+	ASSERT_EQ(skel->bss->err, err, "err_after_bad_sample");
+
+	ASSERT_OK(munmap(producer_pos_ptr, page_size), "unmap_producer_pos");
+	ASSERT_OK(munmap(data_ptr, page_size), "unmap_data_ptr");
+}
+
+static void test_user_ringbuf_post_misaligned(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err;
+	__u32 size = (1 << 5) + 7;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (!ASSERT_OK(err, "misaligned_skel"))
+		return;
+
+	manually_write_test_invalid_sample(skel, size, size, -EINVAL);
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_post_producer_wrong_offset(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err;
+	__u32 size = (1 << 5);
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (!ASSERT_OK(err, "wrong_offset_skel"))
+		return;
+
+	manually_write_test_invalid_sample(skel, size, size - 8, -EINVAL);
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_post_larger_than_ringbuf_sz(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err;
+	__u32 size = c_ringbuf_size;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (!ASSERT_OK(err, "huge_sample_skel"))
+		return;
+
+	manually_write_test_invalid_sample(skel, size, size, -E2BIG);
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_basic(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (!ASSERT_OK(err, "ringbuf_basic_skel"))
+		return;
+
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before");
+
+	err = write_samples(ringbuf, 2);
+	if (!ASSERT_OK(err, "write_samples"))
+		goto cleanup;
+
+	ASSERT_EQ(skel->bss->read, 2, "num_samples_read_after");
+
+cleanup:
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_sample_full_ring_buffer(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err;
+	void *sample;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (!ASSERT_OK(err, "ringbuf_full_sample_skel"))
+		return;
+
+	sample = user_ring_buffer__reserve(ringbuf, c_ringbuf_size - BPF_RINGBUF_HDR_SZ);
+	if (!ASSERT_OK_PTR(sample, "full_sample"))
+		goto cleanup;
+
+	user_ring_buffer__submit(ringbuf, sample);
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before");
+	drain_current_samples();
+	ASSERT_EQ(skel->bss->read, 1, "num_samples_read_after");
+
+cleanup:
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_post_alignment_autoadjust(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	struct sample *sample;
+	int err;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (!ASSERT_OK(err, "ringbuf_align_autoadjust_skel"))
+		return;
+
+	/* libbpf should automatically round any sample up to an 8-byte alignment. */
+	sample = user_ring_buffer__reserve(ringbuf, sizeof(*sample) + 1);
+	ASSERT_OK_PTR(sample, "reserve_autoaligned");
+	user_ring_buffer__submit(ringbuf, sample);
+
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before");
+	drain_current_samples();
+	ASSERT_EQ(skel->bss->read, 1, "num_samples_read_after");
+
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_overfill(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (err)
+		return;
+
+	err = write_samples(ringbuf, c_max_entries * 5);
+	ASSERT_ERR(err, "write_samples");
+	ASSERT_EQ(skel->bss->read, c_max_entries, "max_entries");
+
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_discards_properly_ignored(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err, num_discarded = 0;
+	__u64 *token;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (err)
+		return;
+
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before");
+
+	while (1) {
+		/* Write samples until the buffer is full. */
+		token = user_ring_buffer__reserve(ringbuf, sizeof(*token));
+		if (!token)
+			break;
+
+		user_ring_buffer__discard(ringbuf, token);
+		num_discarded++;
+	}
+
+	if (!ASSERT_GE(num_discarded, 0, "num_discarded"))
+		goto cleanup;
+
+	/* Should not read any samples, as they are all discarded. */
+	ASSERT_EQ(skel->bss->read, 0, "num_pre_kick");
+	drain_current_samples();
+	ASSERT_EQ(skel->bss->read, 0, "num_post_kick");
+
+	/* Now that the ring buffer has been drained, we should be able to
+	 * reserve another token.
+	 */
+	token = user_ring_buffer__reserve(ringbuf, sizeof(*token));
+
+	if (!ASSERT_OK_PTR(token, "new_token"))
+		goto cleanup;
+
+	user_ring_buffer__discard(ringbuf, token);
+cleanup:
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void test_user_ringbuf_loop(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	uint32_t total_samples = 8192;
+	uint32_t remaining_samples = total_samples;
+	int err;
+
+	BUILD_BUG_ON(total_samples <= c_max_entries);
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (err)
+		return;
+
+	do  {
+		uint32_t curr_samples;
+
+		curr_samples = remaining_samples > c_max_entries
+			? c_max_entries : remaining_samples;
+		err = write_samples(ringbuf, curr_samples);
+		if (err != 0) {
+			/* Assert inside of if statement to avoid flooding logs
+			 * on the success path.
+			 */
+			ASSERT_OK(err, "write_samples");
+			goto cleanup;
+		}
+
+		remaining_samples -= curr_samples;
+		ASSERT_EQ(skel->bss->read, total_samples - remaining_samples,
+			  "current_batched_entries");
+	} while (remaining_samples > 0);
+	ASSERT_EQ(skel->bss->read, total_samples, "total_batched_entries");
+
+cleanup:
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static int send_test_message(struct user_ring_buffer *ringbuf,
+			     enum test_msg_op op, s64 operand_64,
+			     s32 operand_32)
+{
+	struct test_msg *msg;
+
+	msg = user_ring_buffer__reserve(ringbuf, sizeof(*msg));
+	if (!msg) {
+		/* Assert on the error path to avoid spamming logs with mostly
+		 * success messages.
+		 */
+		ASSERT_OK_PTR(msg, "reserve_msg");
+		return -ENOMEM;
+	}
+
+	msg->msg_op = op;
+
+	switch (op) {
+	case TEST_MSG_OP_INC64:
+	case TEST_MSG_OP_MUL64:
+		msg->operand_64 = operand_64;
+		break;
+	case TEST_MSG_OP_INC32:
+	case TEST_MSG_OP_MUL32:
+		msg->operand_32 = operand_32;
+		break;
+	default:
+		PRINT_FAIL("Invalid operand %d\n", op);
+		user_ring_buffer__discard(ringbuf, msg);
+		return -EINVAL;
+	}
+
+	user_ring_buffer__submit(ringbuf, msg);
+
+	return 0;
+}
+
+static void kick_kernel_read_messages(void)
+{
+	syscall(__NR_prctl);
+}
+
+static int handle_kernel_msg(void *ctx, void *data, size_t len)
+{
+	struct user_ringbuf_success *skel = ctx;
+	struct test_msg *msg = data;
+
+	switch (msg->msg_op) {
+	case TEST_MSG_OP_INC64:
+		skel->bss->user_mutated += msg->operand_64;
+		return 0;
+	case TEST_MSG_OP_INC32:
+		skel->bss->user_mutated += msg->operand_32;
+		return 0;
+	case TEST_MSG_OP_MUL64:
+		skel->bss->user_mutated *= msg->operand_64;
+		return 0;
+	case TEST_MSG_OP_MUL32:
+		skel->bss->user_mutated *= msg->operand_32;
+		return 0;
+	default:
+		fprintf(stderr, "Invalid operand %d\n", msg->msg_op);
+		return -EINVAL;
+	}
+}
+
+static void drain_kernel_messages_buffer(struct ring_buffer *kern_ringbuf,
+					 struct user_ringbuf_success *skel)
+{
+	int cnt;
+
+	cnt = ring_buffer__consume(kern_ringbuf);
+	ASSERT_EQ(cnt, 8, "consume_kern_ringbuf");
+	ASSERT_OK(skel->bss->err, "consume_kern_ringbuf_err");
+}
+
+static void test_user_ringbuf_msg_protocol(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *user_ringbuf;
+	struct ring_buffer *kern_ringbuf;
+	int err, i;
+	__u64 expected_kern = 0;
+
+	err = load_skel_create_ringbufs(&skel, &kern_ringbuf, handle_kernel_msg, &user_ringbuf);
+	if (!ASSERT_OK(err, "create_ringbufs"))
+		return;
+
+	for (i = 0; i < 64; i++) {
+		enum test_msg_op op = i % TEST_MSG_OP_NUM_OPS;
+		__u64 operand_64 = TEST_OP_64;
+		__u32 operand_32 = TEST_OP_32;
+
+		err = send_test_message(user_ringbuf, op, operand_64, operand_32);
+		if (err) {
+			/* Only assert on a failure to avoid spamming success logs. */
+			ASSERT_OK(err, "send_test_message");
+			goto cleanup;
+		}
+
+		switch (op) {
+		case TEST_MSG_OP_INC64:
+			expected_kern += operand_64;
+			break;
+		case TEST_MSG_OP_INC32:
+			expected_kern += operand_32;
+			break;
+		case TEST_MSG_OP_MUL64:
+			expected_kern *= operand_64;
+			break;
+		case TEST_MSG_OP_MUL32:
+			expected_kern *= operand_32;
+			break;
+		default:
+			PRINT_FAIL("Unexpected op %d\n", op);
+			goto cleanup;
+		}
+
+		if (i % 8 == 0) {
+			kick_kernel_read_messages();
+			ASSERT_EQ(skel->bss->kern_mutated, expected_kern, "expected_kern");
+			ASSERT_EQ(skel->bss->err, 0, "bpf_prog_err");
+			drain_kernel_messages_buffer(kern_ringbuf, skel);
+		}
+	}
+
+cleanup:
+	ring_buffer__free(kern_ringbuf);
+	user_ring_buffer__free(user_ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static void *kick_kernel_cb(void *arg)
+{
+	/* Kick the kernel, causing it to drain the ring buffer and then wake
+	 * up the test thread waiting on epoll.
+	 */
+	syscall(__NR_getrlimit);
+
+	return NULL;
+}
+
+static int spawn_kick_thread_for_poll(void)
+{
+	pthread_t thread;
+
+	return pthread_create(&thread, NULL, kick_kernel_cb, NULL);
+}
+
+static void test_user_ringbuf_blocking_reserve(void)
+{
+	struct user_ringbuf_success *skel;
+	struct user_ring_buffer *ringbuf;
+	int err, num_written = 0;
+	__u64 *token;
+
+	err = load_skel_create_user_ringbuf(&skel, &ringbuf);
+	if (err)
+		return;
+
+	ASSERT_EQ(skel->bss->read, 0, "num_samples_read_before");
+
+	while (1) {
+		/* Write samples until the buffer is full. */
+		token = user_ring_buffer__reserve(ringbuf, sizeof(*token));
+		if (!token)
+			break;
+
+		*token = 0xdeadbeef;
+
+		user_ring_buffer__submit(ringbuf, token);
+		num_written++;
+	}
+
+	if (!ASSERT_GE(num_written, 0, "num_written"))
+		goto cleanup;
+
+	/* Should not have read any samples until the kernel is kicked. */
+	ASSERT_EQ(skel->bss->read, 0, "num_pre_kick");
+
+	/* We correctly time out after 1 second, without a sample. */
+	token = user_ring_buffer__reserve_blocking(ringbuf, sizeof(*token), 1000);
+	if (!ASSERT_EQ(token, NULL, "pre_kick_timeout_token"))
+		goto cleanup;
+
+	err = spawn_kick_thread_for_poll();
+	if (!ASSERT_EQ(err, 0, "deferred_kick_thread\n"))
+		goto cleanup;
+
+	/* After spawning another thread that asychronously kicks the kernel to
+	 * drain the messages, we're able to block and successfully get a
+	 * sample once we receive an event notification.
+	 */
+	token = user_ring_buffer__reserve_blocking(ringbuf, sizeof(*token), 10000);
+
+	if (!ASSERT_OK_PTR(token, "block_token"))
+		goto cleanup;
+
+	ASSERT_GT(skel->bss->read, 0, "num_post_kill");
+	ASSERT_LE(skel->bss->read, num_written, "num_post_kill");
+	ASSERT_EQ(skel->bss->err, 0, "err_post_poll");
+	user_ring_buffer__discard(ringbuf, token);
+
+cleanup:
+	user_ring_buffer__free(ringbuf);
+	user_ringbuf_success__destroy(skel);
+}
+
+static struct {
+	const char *prog_name;
+	const char *expected_err_msg;
+} failure_tests[] = {
+	/* failure cases */
+	{"user_ringbuf_callback_bad_access1", "negative offset dynptr_ptr ptr"},
+	{"user_ringbuf_callback_bad_access2", "dereference of modified dynptr_ptr ptr"},
+	{"user_ringbuf_callback_write_forbidden", "invalid mem access 'dynptr_ptr'"},
+	{"user_ringbuf_callback_null_context_write", "invalid mem access 'scalar'"},
+	{"user_ringbuf_callback_null_context_read", "invalid mem access 'scalar'"},
+	{"user_ringbuf_callback_discard_dynptr", "arg 1 is an unacquired reference"},
+	{"user_ringbuf_callback_submit_dynptr", "arg 1 is an unacquired reference"},
+	{"user_ringbuf_callback_invalid_return", "At callback return the register R0 has value"},
+};
+
+#define SUCCESS_TEST(_func) { _func, #_func }
+
+static struct {
+	void (*test_callback)(void);
+	const char *test_name;
+} success_tests[] = {
+	SUCCESS_TEST(test_user_ringbuf_mappings),
+	SUCCESS_TEST(test_user_ringbuf_post_misaligned),
+	SUCCESS_TEST(test_user_ringbuf_post_producer_wrong_offset),
+	SUCCESS_TEST(test_user_ringbuf_post_larger_than_ringbuf_sz),
+	SUCCESS_TEST(test_user_ringbuf_basic),
+	SUCCESS_TEST(test_user_ringbuf_sample_full_ring_buffer),
+	SUCCESS_TEST(test_user_ringbuf_post_alignment_autoadjust),
+	SUCCESS_TEST(test_user_ringbuf_overfill),
+	SUCCESS_TEST(test_user_ringbuf_discards_properly_ignored),
+	SUCCESS_TEST(test_user_ringbuf_loop),
+	SUCCESS_TEST(test_user_ringbuf_msg_protocol),
+	SUCCESS_TEST(test_user_ringbuf_blocking_reserve),
+};
+
+static void verify_fail(const char *prog_name, const char *expected_err_msg)
+{
+	LIBBPF_OPTS(bpf_object_open_opts, opts);
+	struct bpf_program *prog;
+	struct user_ringbuf_fail *skel;
+	int err;
+
+	opts.kernel_log_buf = obj_log_buf;
+	opts.kernel_log_size = log_buf_sz;
+	opts.kernel_log_level = 1;
+
+	skel = user_ringbuf_fail__open_opts(&opts);
+	if (!ASSERT_OK_PTR(skel, "dynptr_fail__open_opts"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	bpf_program__set_autoload(prog, true);
+
+	bpf_map__set_max_entries(skel->maps.user_ringbuf, getpagesize());
+
+	err = user_ringbuf_fail__load(skel);
+	if (!ASSERT_ERR(err, "unexpected load success"))
+		goto cleanup;
+
+	if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
+		fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
+		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
+	}
+
+cleanup:
+	user_ringbuf_fail__destroy(skel);
+}
+
+void test_user_ringbuf(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(success_tests); i++) {
+		if (!test__start_subtest(success_tests[i].test_name))
+			continue;
+
+		success_tests[i].test_callback();
+	}
+
+	for (i = 0; i < ARRAY_SIZE(failure_tests); i++) {
+		if (!test__start_subtest(failure_tests[i].prog_name))
+			continue;
+
+		verify_fail(failure_tests[i].prog_name, failure_tests[i].expected_err_msg);
+	}
+}
diff --git a/tools/testing/selftests/bpf/progs/test_user_ringbuf.h b/tools/testing/selftests/bpf/progs/test_user_ringbuf.h
new file mode 100644
index 000000000000..1643b4d59ba7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_user_ringbuf.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#ifndef _TEST_USER_RINGBUF_H
+#define _TEST_USER_RINGBUF_H
+
+#define TEST_OP_64 4
+#define TEST_OP_32 2
+
+enum test_msg_op {
+	TEST_MSG_OP_INC64,
+	TEST_MSG_OP_INC32,
+	TEST_MSG_OP_MUL64,
+	TEST_MSG_OP_MUL32,
+
+	// Must come last.
+	TEST_MSG_OP_NUM_OPS,
+};
+
+struct test_msg {
+	enum test_msg_op msg_op;
+	union {
+		__s64 operand_64;
+		__s32 operand_32;
+	};
+};
+
+struct sample {
+	int pid;
+	int seq;
+	long value;
+	char comm[16];
+};
+
+#endif /* _TEST_USER_RINGBUF_H */
diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
new file mode 100644
index 000000000000..82aba4529aa9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/user_ringbuf_fail.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct sample {
+	int pid;
+	int seq;
+	long value;
+	char comm[16];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
+} user_ringbuf SEC(".maps");
+
+static long
+bad_access1(struct bpf_dynptr *dynptr, void *context)
+{
+	const struct sample *sample;
+
+	sample = bpf_dynptr_data(dynptr - 1, 0, sizeof(*sample));
+	bpf_printk("Was able to pass bad pointer %lx\n", (__u64)dynptr - 1);
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to read before the pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_bad_access1(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, bad_access1, NULL, 0);
+
+	return 0;
+}
+
+static long
+bad_access2(struct bpf_dynptr *dynptr, void *context)
+{
+	const struct sample *sample;
+
+	sample = bpf_dynptr_data(dynptr + 1, 0, sizeof(*sample));
+	bpf_printk("Was able to pass bad pointer %lx\n", (__u64)dynptr + 1);
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to read past the end of the pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_bad_access2(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, bad_access2, NULL, 0);
+
+	return 0;
+}
+
+static long
+write_forbidden(struct bpf_dynptr *dynptr, void *context)
+{
+	*((long *)dynptr) = 0;
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to write to that pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_write_forbidden(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, write_forbidden, NULL, 0);
+
+	return 0;
+}
+
+static long
+null_context_write(struct bpf_dynptr *dynptr, void *context)
+{
+	*((__u64 *)context) = 0;
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to write to that pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_null_context_write(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, null_context_write, NULL, 0);
+
+	return 0;
+}
+
+static long
+null_context_read(struct bpf_dynptr *dynptr, void *context)
+{
+	__u64 id = *((__u64 *)context);
+
+	bpf_printk("Read id %lu\n", id);
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to write to that pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_null_context_read(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, null_context_read, NULL, 0);
+
+	return 0;
+}
+
+static long
+try_discard_dynptr(struct bpf_dynptr *dynptr, void *context)
+{
+	bpf_ringbuf_discard_dynptr(dynptr, 0);
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to read past the end of the pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_discard_dynptr(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, try_discard_dynptr, NULL, 0);
+
+	return 0;
+}
+
+static long
+try_submit_dynptr(struct bpf_dynptr *dynptr, void *context)
+{
+	bpf_ringbuf_submit_dynptr(dynptr, 0);
+
+	return 0;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to read past the end of the pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_submit_dynptr(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, try_submit_dynptr, NULL, 0);
+
+	return 0;
+}
+
+static long
+invalid_drain_callback_return(struct bpf_dynptr *dynptr, void *context)
+{
+	return 2;
+}
+
+/* A callback that accesses a dynptr in a bpf_user_ringbuf_drain callback should
+ * not be able to write to that pointer.
+ */
+SEC("?raw_tp/sys_nanosleep")
+int user_ringbuf_callback_invalid_return(void *ctx)
+{
+	bpf_user_ringbuf_drain(&user_ringbuf, invalid_drain_callback_return, NULL, 0);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/user_ringbuf_success.c b/tools/testing/selftests/bpf/progs/user_ringbuf_success.c
new file mode 100644
index 000000000000..099c23d9aa21
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/user_ringbuf_success.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_misc.h"
+#include "test_user_ringbuf.h"
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
+} user_ringbuf SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+} kernel_ringbuf SEC(".maps");
+
+/* inputs */
+int pid, err, val;
+
+int read = 0;
+
+/* Counter used for end-to-end protocol test */
+__u64 kern_mutated = 0;
+__u64 user_mutated = 0;
+__u64 expected_user_mutated = 0;
+
+static int
+is_test_process(void)
+{
+	int cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+	return cur_pid == pid;
+}
+
+static long
+record_sample(struct bpf_dynptr *dynptr, void *context)
+{
+	const struct sample *sample = NULL;
+	struct sample stack_sample;
+	int status;
+	static int num_calls;
+
+	if (num_calls++ % 2 == 0) {
+		status = bpf_dynptr_read(&stack_sample, sizeof(stack_sample), dynptr, 0, 0);
+		if (status) {
+			bpf_printk("bpf_dynptr_read() failed: %d\n", status);
+			err = 1;
+			return 0;
+		}
+	} else {
+		sample = bpf_dynptr_data(dynptr, 0, sizeof(*sample));
+		if (!sample) {
+			bpf_printk("Unexpectedly failed to get sample\n");
+			err = 2;
+			return 0;
+		}
+		stack_sample = *sample;
+	}
+
+	__sync_fetch_and_add(&read, 1);
+	return 0;
+}
+
+static void
+handle_sample_msg(const struct test_msg *msg)
+{
+	switch (msg->msg_op) {
+	case TEST_MSG_OP_INC64:
+		kern_mutated += msg->operand_64;
+		break;
+	case TEST_MSG_OP_INC32:
+		kern_mutated += msg->operand_32;
+		break;
+	case TEST_MSG_OP_MUL64:
+		kern_mutated *= msg->operand_64;
+		break;
+	case TEST_MSG_OP_MUL32:
+		kern_mutated *= msg->operand_32;
+		break;
+	default:
+		bpf_printk("Unrecognized op %d\n", msg->msg_op);
+		err = 2;
+	}
+}
+
+static long
+read_protocol_msg(struct bpf_dynptr *dynptr, void *context)
+{
+	const struct test_msg *msg = NULL;
+
+	msg = bpf_dynptr_data(dynptr, 0, sizeof(*msg));
+	if (!msg) {
+		err = 1;
+		bpf_printk("Unexpectedly failed to get msg\n");
+		return 0;
+	}
+
+	handle_sample_msg(msg);
+
+	return 0;
+}
+
+static int publish_next_kern_msg(__u32 index, void *context)
+{
+	struct test_msg *msg = NULL;
+	int operand_64 = TEST_OP_64;
+	int operand_32 = TEST_OP_32;
+
+	msg = bpf_ringbuf_reserve(&kernel_ringbuf, sizeof(*msg), 0);
+	if (!msg) {
+		err = 4;
+		return 1;
+	}
+
+	switch (index % TEST_MSG_OP_NUM_OPS) {
+	case TEST_MSG_OP_INC64:
+		msg->operand_64 = operand_64;
+		msg->msg_op = TEST_MSG_OP_INC64;
+		expected_user_mutated += operand_64;
+		break;
+	case TEST_MSG_OP_INC32:
+		msg->operand_32 = operand_32;
+		msg->msg_op = TEST_MSG_OP_INC32;
+		expected_user_mutated += operand_32;
+		break;
+	case TEST_MSG_OP_MUL64:
+		msg->operand_64 = operand_64;
+		msg->msg_op = TEST_MSG_OP_MUL64;
+		expected_user_mutated *= operand_64;
+		break;
+	case TEST_MSG_OP_MUL32:
+		msg->operand_32 = operand_32;
+		msg->msg_op = TEST_MSG_OP_MUL32;
+		expected_user_mutated *= operand_32;
+		break;
+	default:
+		bpf_ringbuf_discard(msg, 0);
+		err = 5;
+		return 1;
+	}
+
+	bpf_ringbuf_submit(msg, 0);
+
+	return 0;
+}
+
+static void
+publish_kern_messages(void)
+{
+	if (expected_user_mutated != user_mutated) {
+		bpf_printk("%lu != %lu\n", expected_user_mutated, user_mutated);
+		err = 3;
+		return;
+	}
+
+	bpf_loop(8, publish_next_kern_msg, NULL, 0);
+}
+
+SEC("fentry/" SYS_PREFIX "sys_prctl")
+int test_user_ringbuf_protocol(void *ctx)
+{
+	long status = 0;
+	struct sample *sample = NULL;
+	struct bpf_dynptr ptr;
+
+	if (!is_test_process())
+		return 0;
+
+	status = bpf_user_ringbuf_drain(&user_ringbuf, read_protocol_msg, NULL, 0);
+	if (status < 0) {
+		bpf_printk("Drain returned: %ld\n", status);
+		err = 1;
+		return 0;
+	}
+
+	publish_kern_messages();
+
+	return 0;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_getpgid")
+int test_user_ringbuf(void *ctx)
+{
+	int status = 0;
+	struct sample *sample = NULL;
+	struct bpf_dynptr ptr;
+
+	if (!is_test_process())
+		return 0;
+
+	err = bpf_user_ringbuf_drain(&user_ringbuf, record_sample, NULL, 0);
+
+	return 0;
+}
+
+static long
+do_nothing_cb(struct bpf_dynptr *dynptr, void *context)
+{
+	__sync_fetch_and_add(&read, 1);
+	return 0;
+}
+
+SEC("fentry/" SYS_PREFIX "sys_getrlimit")
+int test_user_ringbuf_epoll(void *ctx)
+{
+	long num_samples;
+
+	if (!is_test_process())
+		return 0;
+
+	num_samples = bpf_user_ringbuf_drain(&user_ringbuf, do_nothing_cb, NULL, 0);
+	if (num_samples <= 0)
+		err = 1;
+
+	return 0;
+}
-- 
cgit v1.2.3


From 9f2f5d7830ddfeeca147595f473e14eadbeb3db1 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 9 Sep 2022 19:52:14 -0700
Subject: libbpf: Improve BPF_PROG2 macro code quality and description

Commit 34586d29f8df ("libbpf: Add new BPF_PROG2 macro") added BPF_PROG2
macro for trampoline based programs with struct arguments. Andrii
made a few suggestions to improve code quality and description.
This patch implemented these suggestions including better internal
macro name, consistent usage pattern for __builtin_choose_expr(),
simpler macro definition for always-inline func arguments and
better macro description.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Stanislav Fomichev <sdf@google.com>
Link: https://lore.kernel.org/bpf/20220910025214.1536510-1-yhs@fb.com
---
 tools/lib/bpf/bpf_tracing.h | 154 ++++++++++++++++++++++++++------------------
 1 file changed, 91 insertions(+), 63 deletions(-)

diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h
index 8d4bdd18cb3d..2972dc25ff72 100644
--- a/tools/lib/bpf/bpf_tracing.h
+++ b/tools/lib/bpf/bpf_tracing.h
@@ -438,84 +438,112 @@ typeof(name(0)) name(unsigned long long *ctx)				    \
 static __always_inline typeof(name(0))					    \
 ____##name(unsigned long long *ctx, ##args)
 
-#ifndef ____bpf_nth
-#define ____bpf_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N
+#ifndef ___bpf_nth2
+#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13,	\
+		    _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N
 #endif
-#ifndef ____bpf_narg
-#define ____bpf_narg(...) ____bpf_nth(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0)
+#ifndef ___bpf_narg2
+#define ___bpf_narg2(...)	\
+	___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7,	\
+		    6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0)
 #endif
 
-#define BPF_REG_CNT(t) \
-	(__builtin_choose_expr(sizeof(t) == 1 || sizeof(t) == 2 || sizeof(t) == 4 || sizeof(t) == 8, 1,	\
-	 __builtin_choose_expr(sizeof(t) == 16, 2,							\
-			       (void)0)))
-
-#define ____bpf_reg_cnt0()			(0)
-#define ____bpf_reg_cnt1(t, x)			(____bpf_reg_cnt0() + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt2(t, x, args...)		(____bpf_reg_cnt1(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt3(t, x, args...)		(____bpf_reg_cnt2(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt4(t, x, args...)		(____bpf_reg_cnt3(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt5(t, x, args...)		(____bpf_reg_cnt4(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt6(t, x, args...)		(____bpf_reg_cnt5(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt7(t, x, args...)		(____bpf_reg_cnt6(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt8(t, x, args...)		(____bpf_reg_cnt7(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt9(t, x, args...)		(____bpf_reg_cnt8(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt10(t, x, args...)	(____bpf_reg_cnt9(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt11(t, x, args...)	(____bpf_reg_cnt10(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt12(t, x, args...)	(____bpf_reg_cnt11(args) + BPF_REG_CNT(t))
-#define ____bpf_reg_cnt(args...)	 ___bpf_apply(____bpf_reg_cnt, ____bpf_narg(args))(args)
-
-#define ____bpf_union_arg(t, x, n) \
-	__builtin_choose_expr(sizeof(t) == 1, ({ union { struct { __u8 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]}}; ___tmp.x; }), \
-	__builtin_choose_expr(sizeof(t) == 2, ({ union { struct { __u16 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \
-	__builtin_choose_expr(sizeof(t) == 4, ({ union { struct { __u32 x; } ___z; t x; } ___tmp = { .___z = {ctx[n]} }; ___tmp.x; }), \
-	__builtin_choose_expr(sizeof(t) == 8, ({ union { struct { __u64 x; } ___z; t x; } ___tmp = {.___z = {ctx[n]} }; ___tmp.x; }), \
-	__builtin_choose_expr(sizeof(t) == 16, ({ union { struct { __u64 x, y; } ___z; t x; } ___tmp = {.___z = {ctx[n], ctx[n + 1]} }; ___tmp.x; }), \
+#define ___bpf_treg_cnt(t) \
+	__builtin_choose_expr(sizeof(t) == 1, 1,	\
+	__builtin_choose_expr(sizeof(t) == 2, 1,	\
+	__builtin_choose_expr(sizeof(t) == 4, 1,	\
+	__builtin_choose_expr(sizeof(t) == 8, 1,	\
+	__builtin_choose_expr(sizeof(t) == 16, 2,	\
 			      (void)0)))))
 
-#define ____bpf_ctx_arg0(n, args...)
-#define ____bpf_ctx_arg1(n, t, x)		, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt1(t, x))
-#define ____bpf_ctx_arg2(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt2(t, x, args)) ____bpf_ctx_arg1(n, args)
-#define ____bpf_ctx_arg3(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt3(t, x, args)) ____bpf_ctx_arg2(n, args)
-#define ____bpf_ctx_arg4(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt4(t, x, args)) ____bpf_ctx_arg3(n, args)
-#define ____bpf_ctx_arg5(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt5(t, x, args)) ____bpf_ctx_arg4(n, args)
-#define ____bpf_ctx_arg6(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt6(t, x, args)) ____bpf_ctx_arg5(n, args)
-#define ____bpf_ctx_arg7(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt7(t, x, args)) ____bpf_ctx_arg6(n, args)
-#define ____bpf_ctx_arg8(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt8(t, x, args)) ____bpf_ctx_arg7(n, args)
-#define ____bpf_ctx_arg9(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt9(t, x, args)) ____bpf_ctx_arg8(n, args)
-#define ____bpf_ctx_arg10(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt10(t, x, args)) ____bpf_ctx_arg9(n, args)
-#define ____bpf_ctx_arg11(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt11(t, x, args)) ____bpf_ctx_arg10(n, args)
-#define ____bpf_ctx_arg12(n, t, x, args...)	, ____bpf_union_arg(t, x, n - ____bpf_reg_cnt12(t, x, args)) ____bpf_ctx_arg11(n, args)
-#define ____bpf_ctx_arg(n, args...)	___bpf_apply(____bpf_ctx_arg, ____bpf_narg(args))(n, args)
-
-#define ____bpf_ctx_decl0()
-#define ____bpf_ctx_decl1(t, x)			, t x
-#define ____bpf_ctx_decl2(t, x, args...)	, t x ____bpf_ctx_decl1(args)
-#define ____bpf_ctx_decl3(t, x, args...)	, t x ____bpf_ctx_decl2(args)
-#define ____bpf_ctx_decl4(t, x, args...)	, t x ____bpf_ctx_decl3(args)
-#define ____bpf_ctx_decl5(t, x, args...)	, t x ____bpf_ctx_decl4(args)
-#define ____bpf_ctx_decl6(t, x, args...)	, t x ____bpf_ctx_decl5(args)
-#define ____bpf_ctx_decl7(t, x, args...)	, t x ____bpf_ctx_decl6(args)
-#define ____bpf_ctx_decl8(t, x, args...)	, t x ____bpf_ctx_decl7(args)
-#define ____bpf_ctx_decl9(t, x, args...)	, t x ____bpf_ctx_decl8(args)
-#define ____bpf_ctx_decl10(t, x, args...)	, t x ____bpf_ctx_decl9(args)
-#define ____bpf_ctx_decl11(t, x, args...)	, t x ____bpf_ctx_decl10(args)
-#define ____bpf_ctx_decl12(t, x, args...)	, t x ____bpf_ctx_decl11(args)
-#define ____bpf_ctx_decl(args...)	___bpf_apply(____bpf_ctx_decl, ____bpf_narg(args))(args)
+#define ___bpf_reg_cnt0()		(0)
+#define ___bpf_reg_cnt1(t, x)		(___bpf_reg_cnt0() + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt2(t, x, args...)	(___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt3(t, x, args...)	(___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt4(t, x, args...)	(___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt5(t, x, args...)	(___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt6(t, x, args...)	(___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt7(t, x, args...)	(___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt8(t, x, args...)	(___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt9(t, x, args...)	(___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt10(t, x, args...)	(___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt11(t, x, args...)	(___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt12(t, x, args...)	(___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t))
+#define ___bpf_reg_cnt(args...)	 ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args)
+
+#define ___bpf_union_arg(t, x, n) \
+	__builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \
+	__builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \
+	__builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \
+	__builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \
+	__builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \
+			      (void)0)))))
+
+#define ___bpf_ctx_arg0(n, args...)
+#define ___bpf_ctx_arg1(n, t, x)		, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x))
+#define ___bpf_ctx_arg2(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args)
+#define ___bpf_ctx_arg3(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args)
+#define ___bpf_ctx_arg4(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args)
+#define ___bpf_ctx_arg5(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args)
+#define ___bpf_ctx_arg6(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args)
+#define ___bpf_ctx_arg7(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args)
+#define ___bpf_ctx_arg8(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args)
+#define ___bpf_ctx_arg9(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args)
+#define ___bpf_ctx_arg10(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args)
+#define ___bpf_ctx_arg11(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args)
+#define ___bpf_ctx_arg12(n, t, x, args...)	, ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args)
+#define ___bpf_ctx_arg(args...)	___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args)
+
+#define ___bpf_ctx_decl0()
+#define ___bpf_ctx_decl1(t, x)			, t x
+#define ___bpf_ctx_decl2(t, x, args...)		, t x ___bpf_ctx_decl1(args)
+#define ___bpf_ctx_decl3(t, x, args...)		, t x ___bpf_ctx_decl2(args)
+#define ___bpf_ctx_decl4(t, x, args...)		, t x ___bpf_ctx_decl3(args)
+#define ___bpf_ctx_decl5(t, x, args...)		, t x ___bpf_ctx_decl4(args)
+#define ___bpf_ctx_decl6(t, x, args...)		, t x ___bpf_ctx_decl5(args)
+#define ___bpf_ctx_decl7(t, x, args...)		, t x ___bpf_ctx_decl6(args)
+#define ___bpf_ctx_decl8(t, x, args...)		, t x ___bpf_ctx_decl7(args)
+#define ___bpf_ctx_decl9(t, x, args...)		, t x ___bpf_ctx_decl8(args)
+#define ___bpf_ctx_decl10(t, x, args...)	, t x ___bpf_ctx_decl9(args)
+#define ___bpf_ctx_decl11(t, x, args...)	, t x ___bpf_ctx_decl10(args)
+#define ___bpf_ctx_decl12(t, x, args...)	, t x ___bpf_ctx_decl11(args)
+#define ___bpf_ctx_decl(args...)	___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args)
 
 /*
- * BPF_PROG2 can handle struct arguments.
+ * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct
+ * arguments. Since each struct argument might take one or two u64 values
+ * in the trampoline stack, argument type size is needed to place proper number
+ * of u64 values for each argument. Therefore, BPF_PROG2 has different
+ * syntax from BPF_PROG. For example, for the following BPF_PROG syntax:
+ *
+ *   int BPF_PROG(test2, int a, int b) { ... }
+ *
+ * the corresponding BPF_PROG2 syntax is:
+ *
+ *   int BPF_PROG2(test2, int, a, int, b) { ... }
+ *
+ * where type and the corresponding argument name are separated by comma.
+ *
+ * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger
+ * than 8 bytes:
+ *
+ *   int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b,
+ *		   int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret)
+ *   {
+ *        // access a, b, c, d, e, and ret directly
+ *        ...
+ *   }
  */
 #define BPF_PROG2(name, args...)						\
 name(unsigned long long *ctx);							\
 static __always_inline typeof(name(0))						\
-____##name(unsigned long long *ctx ____bpf_ctx_decl(args));			\
+____##name(unsigned long long *ctx ___bpf_ctx_decl(args));			\
 typeof(name(0)) name(unsigned long long *ctx)					\
 {										\
-	return ____##name(ctx ____bpf_ctx_arg(____bpf_reg_cnt(args), args));	\
+	return ____##name(ctx ___bpf_ctx_arg(args));				\
 }										\
 static __always_inline typeof(name(0))						\
-____##name(unsigned long long *ctx ____bpf_ctx_decl(args))
+____##name(unsigned long long *ctx ___bpf_ctx_decl(args))
 
 struct pt_regs;
 
-- 
cgit v1.2.3


From 272d1f4cfa3c75d4828b62ef33ccb207da3b7350 Mon Sep 17 00:00:00 2001
From: Yauheni Kaliuta <ykaliuta@redhat.com>
Date: Thu, 8 Sep 2022 15:01:46 +0300
Subject: selftests: bpf: test_kmod.sh: Pass parameters to the module

It's possible to specify particular tests for test_bpf.ko with
module parameters. Make it possible to pass the module parameters,
example:

test_kmod.sh test_range=1,3

Since magnitude tests take long time it can be reasonable to skip
them.

Signed-off-by: Yauheni Kaliuta <ykaliuta@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220908120146.381218-1-ykaliuta@redhat.com
---
 tools/testing/selftests/bpf/test_kmod.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh
index 4f6444bcd53f..d4a4279c0181 100755
--- a/tools/testing/selftests/bpf/test_kmod.sh
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -1,6 +1,11 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 
+# Usage:
+# ./test_kmod.sh [module_param]...
+# Ex.: ./test_kmod.sh test_range=1,3
+# All the parameters are passed to the kernel module.
+
 # Kselftest framework requirement - SKIP code is 4.
 ksft_skip=4
 
@@ -26,15 +31,15 @@ test_run()
 	echo "[ JIT enabled:$1 hardened:$2 ]"
 	dmesg -C
 	if [ -f ${OUTPUT}/lib/test_bpf.ko ]; then
-		insmod ${OUTPUT}/lib/test_bpf.ko 2> /dev/null
+		insmod ${OUTPUT}/lib/test_bpf.ko "$@" 2> /dev/null
 		if [ $? -ne 0 ]; then
 			rc=1
 		fi
 	else
 		# Use modprobe dry run to check for missing test_bpf module
-		if ! /sbin/modprobe -q -n test_bpf; then
+		if ! /sbin/modprobe -q -n test_bpf "$@"; then
 			echo "test_bpf: [SKIP]"
-		elif /sbin/modprobe -q test_bpf; then
+		elif /sbin/modprobe -q test_bpf "$@"; then
 			echo "test_bpf: ok"
 		else
 			echo "test_bpf: [FAIL]"
-- 
cgit v1.2.3


From 01f2e36c959c813a532ae836db49b2ac9de46de4 Mon Sep 17 00:00:00 2001
From: Tao Chen <chentao.kernel@linux.alibaba.com>
Date: Tue, 13 Sep 2022 00:43:00 +0800
Subject: libbpf: Support raw BTF placed in the default search path

Currently, the default vmlinux files at '/boot/vmlinux-*',
'/lib/modules/*/vmlinux-*' etc. are parsed with 'btf__parse_elf()' to
extract BTF. It is possible that these files are actually raw BTF files
similar to /sys/kernel/btf/vmlinux. So parse these files with
'btf__parse' which tries both raw format and ELF format.

This might be useful in some scenarios where users put their custom BTF
into known locations and don't want to specify btf_custom_path option.

Signed-off-by: Tao Chen <chentao.kernel@linux.alibaba.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Link: https://lore.kernel.org/bpf/3f59fb5a345d2e4f10e16fe9e35fbc4c03ecaa3e.1662999860.git.chentao.kernel@linux.alibaba.com
---
 tools/lib/bpf/btf.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 361131518d63..b4d9a96c3c1b 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -4642,20 +4642,17 @@ static int btf_dedup_remap_types(struct btf_dedup *d)
  */
 struct btf *btf__load_vmlinux_btf(void)
 {
-	struct {
-		const char *path_fmt;
-		bool raw_btf;
-	} locations[] = {
+	const char *locations[] = {
 		/* try canonical vmlinux BTF through sysfs first */
-		{ "/sys/kernel/btf/vmlinux", true /* raw BTF */ },
-		/* fall back to trying to find vmlinux ELF on disk otherwise */
-		{ "/boot/vmlinux-%1$s" },
-		{ "/lib/modules/%1$s/vmlinux-%1$s" },
-		{ "/lib/modules/%1$s/build/vmlinux" },
-		{ "/usr/lib/modules/%1$s/kernel/vmlinux" },
-		{ "/usr/lib/debug/boot/vmlinux-%1$s" },
-		{ "/usr/lib/debug/boot/vmlinux-%1$s.debug" },
-		{ "/usr/lib/debug/lib/modules/%1$s/vmlinux" },
+		"/sys/kernel/btf/vmlinux",
+		/* fall back to trying to find vmlinux on disk otherwise */
+		"/boot/vmlinux-%1$s",
+		"/lib/modules/%1$s/vmlinux-%1$s",
+		"/lib/modules/%1$s/build/vmlinux",
+		"/usr/lib/modules/%1$s/kernel/vmlinux",
+		"/usr/lib/debug/boot/vmlinux-%1$s",
+		"/usr/lib/debug/boot/vmlinux-%1$s.debug",
+		"/usr/lib/debug/lib/modules/%1$s/vmlinux",
 	};
 	char path[PATH_MAX + 1];
 	struct utsname buf;
@@ -4665,15 +4662,12 @@ struct btf *btf__load_vmlinux_btf(void)
 	uname(&buf);
 
 	for (i = 0; i < ARRAY_SIZE(locations); i++) {
-		snprintf(path, PATH_MAX, locations[i].path_fmt, buf.release);
+		snprintf(path, PATH_MAX, locations[i], buf.release);
 
 		if (access(path, R_OK))
 			continue;
 
-		if (locations[i].raw_btf)
-			btf = btf__parse_raw(path);
-		else
-			btf = btf__parse_elf(path, NULL);
+		btf = btf__parse(path, NULL);
 		err = libbpf_get_error(btf);
 		pr_debug("loading kernel BTF '%s': %d\n", path, err);
 		if (err)
-- 
cgit v1.2.3


From d15bf1501c7533826a616478002c601fcc7671f3 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Tue, 20 Sep 2022 09:59:39 +0200
Subject: bpf: Allow kfuncs to be used in LSM programs

In preparation for the addition of new kfuncs, allow kfuncs defined in the
tracing subsystem to be used in LSM programs by mapping the LSM program
type to the TRACING hook.

Signed-off-by: KP Singh <kpsingh@kernel.org>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-2-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/btf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6ccd4f4d731e..dbcf020c4124 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7376,6 +7376,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_STRUCT_OPS:
 		return BTF_KFUNC_HOOK_STRUCT_OPS;
 	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_LSM:
 		return BTF_KFUNC_HOOK_TRACING;
 	case BPF_PROG_TYPE_SYSCALL:
 		return BTF_KFUNC_HOOK_SYSCALL;
-- 
cgit v1.2.3


From 00f146413ccb6c84308e559281449755c83f54c5 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:40 +0200
Subject: btf: Export bpf_dynptr definition

eBPF dynamic pointers is a new feature recently added to upstream. It binds
together a pointer to a memory area and its size. The internal kernel
structure bpf_dynptr_kern is not accessible by eBPF programs in user space.
They instead see bpf_dynptr, which is then translated to the internal
kernel structure by the eBPF verifier.

The problem is that it is not possible to include at the same time the uapi
include linux/bpf.h and the vmlinux BTF vmlinux.h, as they both contain the
definition of some structures/enums. The compiler complains saying that the
structures/enums are redefined.

As bpf_dynptr is defined in the uapi include linux/bpf.h, this makes it
impossible to include vmlinux.h. However, in some cases, e.g. when using
kfuncs, vmlinux.h has to be included. The only option until now was to
include vmlinux.h and add the definition of bpf_dynptr directly in the eBPF
program source code from linux/bpf.h.

Solve the problem by using the same approach as for bpf_timer (which also
follows the same scheme with the _kern suffix for the internal kernel
structure).

Add the following line in one of the dynamic pointer helpers,
bpf_dynptr_from_mem():

BTF_TYPE_EMIT(struct bpf_dynptr);

Cc: stable@vger.kernel.org
Cc: Joanne Koong <joannelkoong@gmail.com>
Fixes: 97e03f521050c ("bpf: Add verifier support for dynptrs")
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Yonghong Song <yhs@fb.com>
Tested-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/r/20220920075951.929132-3-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/helpers.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index cb5564c77482..6d69e30f42d8 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1446,6 +1446,8 @@ BPF_CALL_4(bpf_dynptr_from_mem, void *, data, u32, size, u64, flags, struct bpf_
 {
 	int err;
 
+	BTF_TYPE_EMIT(struct bpf_dynptr);
+
 	err = bpf_dynptr_check_size(size);
 	if (err)
 		goto error;
-- 
cgit v1.2.3


From e9e315b4a5de32d0482b92f482517095d5d844e4 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:41 +0200
Subject: bpf: Move dynptr type check to is_dynptr_type_expected()

Move dynptr type check to is_dynptr_type_expected() from
is_dynptr_reg_valid_init(), so that callers can better determine the cause
of a negative result (dynamic pointer not valid/initialized, dynamic
pointer of the wrong type). It will be useful for example for BTF, to
restrict which dynamic pointer types can be passed to kfuncs, as initially
only the local type will be supported.

Also, splitting makes the code more readable, since checking the dynamic
pointer type is not necessarily related to validity and initialization.

Split the validity/initialization and dynamic pointer type check also in
the verifier, and adjust the expected error message in the test (a test for
an unexpected dynptr type passed to a helper cannot be added due to missing
suitable helpers, but this case has been tested manually).

Cc: Joanne Koong <joannelkoong@gmail.com>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-4-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                           | 35 +++++++++++++++++++------
 tools/testing/selftests/bpf/prog_tests/dynptr.c |  2 +-
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c76fa45a5906..c08dde19eb67 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
 	return true;
 }
 
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
-				     enum bpf_arg_type arg_type)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
+				     struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
 	int spi = get_spi(reg->off);
@@ -799,11 +799,24 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_re
 			return false;
 	}
 
+	return true;
+}
+
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env,
+				    struct bpf_reg_state *reg,
+				    enum bpf_arg_type arg_type)
+{
+	struct bpf_func_state *state = func(env, reg);
+	enum bpf_dynptr_type dynptr_type;
+	int spi = get_spi(reg->off);
+
 	/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
 	if (arg_type == ARG_PTR_TO_DYNPTR)
 		return true;
 
-	return state->stack[spi].spilled_ptr.dynptr.type == arg_to_dynptr_type(arg_type);
+	dynptr_type = arg_to_dynptr_type(arg_type);
+
+	return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
 }
 
 /* The reg state of a pointer or a bounded scalar was saved when
@@ -6095,21 +6108,27 @@ skip_type_check:
 			}
 
 			meta->uninit_dynptr_regno = regno;
-		} else if (!is_dynptr_reg_valid_init(env, reg, arg_type)) {
+		} else if (!is_dynptr_reg_valid_init(env, reg)) {
+			verbose(env,
+				"Expected an initialized dynptr as arg #%d\n",
+				arg + 1);
+			return -EINVAL;
+		} else if (!is_dynptr_type_expected(env, reg, arg_type)) {
 			const char *err_extra = "";
 
 			switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
 			case DYNPTR_TYPE_LOCAL:
-				err_extra = "local ";
+				err_extra = "local";
 				break;
 			case DYNPTR_TYPE_RINGBUF:
-				err_extra = "ringbuf ";
+				err_extra = "ringbuf";
 				break;
 			default:
+				err_extra = "<unknown>";
 				break;
 			}
-
-			verbose(env, "Expected an initialized %sdynptr as arg #%d\n",
+			verbose(env,
+				"Expected a dynptr of type %s as arg #%d\n",
 				err_extra, arg + 1);
 			return -EINVAL;
 		}
diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index bcf80b9f7c27..8fc4e6c02bfd 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -30,7 +30,7 @@ static struct {
 	{"invalid_helper2", "Expected an initialized dynptr as arg #3"},
 	{"invalid_write1", "Expected an initialized dynptr as arg #1"},
 	{"invalid_write2", "Expected an initialized dynptr as arg #3"},
-	{"invalid_write3", "Expected an initialized ringbuf dynptr as arg #1"},
+	{"invalid_write3", "Expected an initialized dynptr as arg #1"},
 	{"invalid_write4", "arg 1 is an unacquired reference"},
 	{"invalid_read1", "invalid read from stack"},
 	{"invalid_read2", "cannot pass in dynptr at an offset"},
-- 
cgit v1.2.3


From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:42 +0200
Subject: btf: Allow dynamic pointer parameters in kfuncs

Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as
parameters in kfuncs. Also, ensure that dynamic pointers passed as argument
are valid and initialized, are a pointer to the stack, and of the type
local. More dynamic pointer types can be supported in the future.

To properly detect whether a parameter is of the desired type, introduce
the stringify_struct() macro to compare the returned structure name with
the desired name. In addition, protect against structure renames, by
halting the build with BUILD_BUG_ON(), so that developers have to revisit
the code.

To check if a dynamic pointer passed to the kfunc is valid and initialized,
and if its type is local, export the existing functions
is_dynptr_reg_valid_init() and is_dynptr_type_expected().

Cc: Joanne Koong <joannelkoong@gmail.com>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  5 +++++
 include/linux/btf.h          |  9 +++++++++
 kernel/bpf/btf.c             | 33 +++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c        | 10 +++++-----
 4 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e197f8fb27e2..9e1e6965f407 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state
 			     u32 regno);
 int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
 		   u32 regno, u32 mem_size);
+bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *reg);
+bool is_dynptr_type_expected(struct bpf_verifier_env *env,
+			     struct bpf_reg_state *reg,
+			     enum bpf_arg_type arg_type);
 
 /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */
 static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog,
diff --git a/include/linux/btf.h b/include/linux/btf.h
index 1fcc833a8690..f9aababc5d78 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -52,6 +52,15 @@
 #define KF_SLEEPABLE    (1 << 5) /* kfunc may sleep */
 #define KF_DESTRUCTIVE  (1 << 6) /* kfunc performs destructive actions */
 
+/*
+ * Return the name of the passed struct, if exists, or halt the build if for
+ * example the structure gets renamed. In this way, developers have to revisit
+ * the code using that structure name, and update it accordingly.
+ */
+#define stringify_struct(x)			\
+	({ BUILD_BUG_ON(sizeof(struct x) < 0);	\
+	   __stringify(x); })
+
 struct btf;
 struct btf_member;
 struct btf_type;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dbcf020c4124..13faede0f2b4 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6449,15 +6449,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 
 			if (is_kfunc) {
 				bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+				bool arg_dynptr = btf_type_is_struct(ref_t) &&
+						  !strcmp(ref_tname,
+							  stringify_struct(bpf_dynptr_kern));
 
 				/* Permit pointer to mem, but only when argument
 				 * type is pointer to scalar, or struct composed
 				 * (recursively) of scalars.
 				 * When arg_mem_size is true, the pointer can be
 				 * void *.
+				 * Also permit initialized local dynamic pointers.
 				 */
 				if (!btf_type_is_scalar(ref_t) &&
 				    !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+				    !arg_dynptr &&
 				    (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
 					bpf_log(log,
 						"arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
@@ -6465,6 +6470,34 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 					return -EINVAL;
 				}
 
+				if (arg_dynptr) {
+					if (reg->type != PTR_TO_STACK) {
+						bpf_log(log, "arg#%d pointer type %s %s not to stack\n",
+							i, btf_type_str(ref_t),
+							ref_tname);
+						return -EINVAL;
+					}
+
+					if (!is_dynptr_reg_valid_init(env, reg)) {
+						bpf_log(log,
+							"arg#%d pointer type %s %s must be valid and initialized\n",
+							i, btf_type_str(ref_t),
+							ref_tname);
+						return -EINVAL;
+					}
+
+					if (!is_dynptr_type_expected(env, reg,
+							ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) {
+						bpf_log(log,
+							"arg#%d pointer type %s %s points to unsupported dynamic pointer type\n",
+							i, btf_type_str(ref_t),
+							ref_tname);
+						return -EINVAL;
+					}
+
+					continue;
+				}
+
 				/* Check for mem, len pair */
 				if (arg_mem_size) {
 					if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c08dde19eb67..6f6d2d511c06 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -782,8 +782,8 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
 	return true;
 }
 
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
-				     struct bpf_reg_state *reg)
+bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *reg)
 {
 	struct bpf_func_state *state = func(env, reg);
 	int spi = get_spi(reg->off);
@@ -802,9 +802,9 @@ static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
 	return true;
 }
 
-static bool is_dynptr_type_expected(struct bpf_verifier_env *env,
-				    struct bpf_reg_state *reg,
-				    enum bpf_arg_type arg_type)
+bool is_dynptr_type_expected(struct bpf_verifier_env *env,
+			     struct bpf_reg_state *reg,
+			     enum bpf_arg_type arg_type)
 {
 	struct bpf_func_state *state = func(env, reg);
 	enum bpf_dynptr_type dynptr_type;
-- 
cgit v1.2.3


From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:43 +0200
Subject: bpf: Export bpf_dynptr_get_size()

Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic
pointers can obtain the real size of data carried by this data structure.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h  | 1 +
 kernel/bpf/helpers.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33e543b86e1a..6535fb1e21b9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data,
 		     enum bpf_dynptr_type type, u32 offset, u32 size);
 void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr);
 int bpf_dynptr_check_size(u32 size);
+u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr);
 
 #ifdef CONFIG_BPF_LSM
 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6d69e30f42d8..b069517a3da0 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1408,7 +1408,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
 	ptr->size |= type << DYNPTR_TYPE_SHIFT;
 }
 
-static u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_SIZE_MASK;
 }
-- 
cgit v1.2.3


From 90fd8f26edd47942203639bf3a5dde8fa1931a0e Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:44 +0200
Subject: KEYS: Move KEY_LOOKUP_ to include/linux/key.h and define
 KEY_LOOKUP_ALL

In preparation for the patch that introduces the bpf_lookup_user_key() eBPF
kfunc, move KEY_LOOKUP_ definitions to include/linux/key.h, to be able to
validate the kfunc parameters. Add them to enum key_lookup_flag, so that
all the current ones and the ones defined in the future are automatically
exported through BTF and available to eBPF programs.

Also, add KEY_LOOKUP_ALL to the enum, with the logical OR of currently
defined flags as value, to facilitate checking whether a variable contains
only those flags.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lore.kernel.org/r/20220920075951.929132-7-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/key.h      | 6 ++++++
 security/keys/internal.h | 2 --
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/key.h b/include/linux/key.h
index 7febc4881363..d27477faf00d 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -88,6 +88,12 @@ enum key_need_perm {
 	KEY_DEFER_PERM_CHECK,	/* Special: permission check is deferred */
 };
 
+enum key_lookup_flag {
+	KEY_LOOKUP_CREATE = 0x01,
+	KEY_LOOKUP_PARTIAL = 0x02,
+	KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL),
+};
+
 struct seq_file;
 struct user_struct;
 struct signal_struct;
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 9b9cf3b6fcbb..3c1e7122076b 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -165,8 +165,6 @@ extern struct key *request_key_and_link(struct key_type *type,
 
 extern bool lookup_user_key_possessed(const struct key *key,
 				      const struct key_match_data *match_data);
-#define KEY_LOOKUP_CREATE	0x01
-#define KEY_LOOKUP_PARTIAL	0x02
 
 extern long join_session_keyring(const char *name);
 extern void key_change_session_keyring(struct callback_head *twork);
-- 
cgit v1.2.3


From f3cf4134c5c6c47b9b5c7aa3cb2d67e107887a7b Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:45 +0200
Subject: bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs

Add the bpf_lookup_user_key(), bpf_lookup_system_key() and bpf_key_put()
kfuncs, to respectively search a key with a given key handle serial number
and flags, obtain a key from a pre-determined ID defined in
include/linux/verification.h, and cleanup.

Introduce system_keyring_id_check() to validate the keyring ID parameter of
bpf_lookup_system_key().

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20220920075951.929132-8-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |   8 +++
 include/linux/verification.h |   8 +++
 kernel/trace/bpf_trace.c     | 135 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 151 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6535fb1e21b9..a1435b019aca 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2664,4 +2664,12 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {}
 static inline void bpf_cgroup_atype_put(int cgroup_atype) {}
 #endif /* CONFIG_BPF_LSM */
 
+struct key;
+
+#ifdef CONFIG_KEYS
+struct bpf_key {
+	struct key *key;
+	bool has_ref;
+};
+#endif /* CONFIG_KEYS */
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/verification.h b/include/linux/verification.h
index a655923335ae..f34e50ebcf60 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -17,6 +17,14 @@
 #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL)
 #define VERIFY_USE_PLATFORM_KEYRING  ((struct key *)2UL)
 
+static inline int system_keyring_id_check(u64 id)
+{
+	if (id > (unsigned long)VERIFY_USE_PLATFORM_KEYRING)
+		return -EINVAL;
+
+	return 0;
+}
+
 /*
  * The use to which an asymmetric key is being put.
  */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 68e5cdd24cef..ab183dbaa8d1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,8 @@
 #include <linux/fprobe.h>
 #include <linux/bsearch.h>
 #include <linux/sort.h>
+#include <linux/key.h>
+#include <linux/verification.h>
 
 #include <net/bpf_sk_storage.h>
 
@@ -1181,6 +1183,139 @@ static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+#ifdef CONFIG_KEYS
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "kfuncs which will be used in BPF programs");
+
+/**
+ * bpf_lookup_user_key - lookup a key by its serial
+ * @serial: key handle serial number
+ * @flags: lookup-specific flags
+ *
+ * Search a key with a given *serial* and the provided *flags*.
+ * If found, increment the reference count of the key by one, and
+ * return it in the bpf_key structure.
+ *
+ * The bpf_key structure must be passed to bpf_key_put() when done
+ * with it, so that the key reference count is decremented and the
+ * bpf_key structure is freed.
+ *
+ * Permission checks are deferred to the time the key is used by
+ * one of the available key-specific kfuncs.
+ *
+ * Set *flags* with KEY_LOOKUP_CREATE, to attempt creating a requested
+ * special keyring (e.g. session keyring), if it doesn't yet exist.
+ * Set *flags* with KEY_LOOKUP_PARTIAL, to lookup a key without waiting
+ * for the key construction, and to retrieve uninstantiated keys (keys
+ * without data attached to them).
+ *
+ * Return: a bpf_key pointer with a valid key pointer if the key is found, a
+ *         NULL pointer otherwise.
+ */
+struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+{
+	key_ref_t key_ref;
+	struct bpf_key *bkey;
+
+	if (flags & ~KEY_LOOKUP_ALL)
+		return NULL;
+
+	/*
+	 * Permission check is deferred until the key is used, as the
+	 * intent of the caller is unknown here.
+	 */
+	key_ref = lookup_user_key(serial, flags, KEY_DEFER_PERM_CHECK);
+	if (IS_ERR(key_ref))
+		return NULL;
+
+	bkey = kmalloc(sizeof(*bkey), GFP_KERNEL);
+	if (!bkey) {
+		key_put(key_ref_to_ptr(key_ref));
+		return NULL;
+	}
+
+	bkey->key = key_ref_to_ptr(key_ref);
+	bkey->has_ref = true;
+
+	return bkey;
+}
+
+/**
+ * bpf_lookup_system_key - lookup a key by a system-defined ID
+ * @id: key ID
+ *
+ * Obtain a bpf_key structure with a key pointer set to the passed key ID.
+ * The key pointer is marked as invalid, to prevent bpf_key_put() from
+ * attempting to decrement the key reference count on that pointer. The key
+ * pointer set in such way is currently understood only by
+ * verify_pkcs7_signature().
+ *
+ * Set *id* to one of the values defined in include/linux/verification.h:
+ * 0 for the primary keyring (immutable keyring of system keys);
+ * VERIFY_USE_SECONDARY_KEYRING for both the primary and secondary keyring
+ * (where keys can be added only if they are vouched for by existing keys
+ * in those keyrings); VERIFY_USE_PLATFORM_KEYRING for the platform
+ * keyring (primarily used by the integrity subsystem to verify a kexec'ed
+ * kerned image and, possibly, the initramfs signature).
+ *
+ * Return: a bpf_key pointer with an invalid key pointer set from the
+ *         pre-determined ID on success, a NULL pointer otherwise
+ */
+struct bpf_key *bpf_lookup_system_key(u64 id)
+{
+	struct bpf_key *bkey;
+
+	if (system_keyring_id_check(id) < 0)
+		return NULL;
+
+	bkey = kmalloc(sizeof(*bkey), GFP_ATOMIC);
+	if (!bkey)
+		return NULL;
+
+	bkey->key = (struct key *)(unsigned long)id;
+	bkey->has_ref = false;
+
+	return bkey;
+}
+
+/**
+ * bpf_key_put - decrement key reference count if key is valid and free bpf_key
+ * @bkey: bpf_key structure
+ *
+ * Decrement the reference count of the key inside *bkey*, if the pointer
+ * is valid, and free *bkey*.
+ */
+void bpf_key_put(struct bpf_key *bkey)
+{
+	if (bkey->has_ref)
+		key_put(bkey->key);
+
+	kfree(bkey);
+}
+
+__diag_pop();
+
+BTF_SET8_START(key_sig_kfunc_set)
+BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+BTF_SET8_END(key_sig_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &key_sig_kfunc_set,
+};
+
+static int __init bpf_key_sig_kfuncs_init(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+					 &bpf_key_sig_kfunc_set);
+}
+
+late_initcall(bpf_key_sig_kfuncs_init);
+#endif /* CONFIG_KEYS */
+
 static const struct bpf_func_proto *
 bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
-- 
cgit v1.2.3


From 865b0566d8f1a0c3937e5eb4bd6ba4ef03e7e98c Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:46 +0200
Subject: bpf: Add bpf_verify_pkcs7_signature() kfunc

Add the bpf_verify_pkcs7_signature() kfunc, to give eBPF security modules
the ability to check the validity of a signature against supplied data, by
using user-provided or system-provided keys as trust anchor.

The new kfunc makes it possible to enforce mandatory policies, as eBPF
programs might be allowed to make security decisions only based on data
sources the system administrator approves.

The caller should provide the data to be verified and the signature as eBPF
dynamic pointers (to minimize the number of parameters) and a bpf_key
structure containing a reference to the keyring with keys trusted for
signature verification, obtained from bpf_lookup_user_key() or
bpf_lookup_system_key().

For bpf_key structures obtained from the former lookup function,
bpf_verify_pkcs7_signature() completes the permission check deferred by
that function by calling key_validate(). key_task_permission() is already
called by the PKCS#7 code.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: KP Singh <kpsingh@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20220920075951.929132-9-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ab183dbaa8d1..9df53c40cffd 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1294,12 +1294,57 @@ void bpf_key_put(struct bpf_key *bkey)
 	kfree(bkey);
 }
 
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+/**
+ * bpf_verify_pkcs7_signature - verify a PKCS#7 signature
+ * @data_ptr: data to verify
+ * @sig_ptr: signature of the data
+ * @trusted_keyring: keyring with keys trusted for signature verification
+ *
+ * Verify the PKCS#7 signature *sig_ptr* against the supplied *data_ptr*
+ * with keys in a keyring referenced by *trusted_keyring*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
+			       struct bpf_dynptr_kern *sig_ptr,
+			       struct bpf_key *trusted_keyring)
+{
+	int ret;
+
+	if (trusted_keyring->has_ref) {
+		/*
+		 * Do the permission check deferred in bpf_lookup_user_key().
+		 * See bpf_lookup_user_key() for more details.
+		 *
+		 * A call to key_task_permission() here would be redundant, as
+		 * it is already done by keyring_search() called by
+		 * find_asymmetric_key().
+		 */
+		ret = key_validate(trusted_keyring->key);
+		if (ret < 0)
+			return ret;
+	}
+
+	return verify_pkcs7_signature(data_ptr->data,
+				      bpf_dynptr_get_size(data_ptr),
+				      sig_ptr->data,
+				      bpf_dynptr_get_size(sig_ptr),
+				      trusted_keyring->key,
+				      VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
+				      NULL);
+}
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+
 __diag_pop();
 
 BTF_SET8_START(key_sig_kfunc_set)
 BTF_ID_FLAGS(func, bpf_lookup_user_key, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_lookup_system_key, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_key_put, KF_RELEASE)
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+BTF_ID_FLAGS(func, bpf_verify_pkcs7_signature, KF_SLEEPABLE)
+#endif
 BTF_SET8_END(key_sig_kfunc_set)
 
 static const struct btf_kfunc_id_set bpf_key_sig_kfunc_set = {
-- 
cgit v1.2.3


From 94fd7420faa0bc85341c0a9cbe5e5240ef4f123d Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:47 +0200
Subject: selftests/bpf: Compile kernel with everything as built-in
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since the eBPF CI does not support kernel modules, change the kernel config
to compile everything as built-in.

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Daniel Müller <deso@posteo.net>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-10-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/config        | 26 +++++++++++++-------------
 tools/testing/selftests/bpf/config.x86_64 |  2 +-
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 3fc46f9cfb22..0fdd11e6b742 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -7,9 +7,9 @@ CONFIG_BPF_LSM=y
 CONFIG_BPF_STREAM_PARSER=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_CGROUP_BPF=y
-CONFIG_CRYPTO_HMAC=m
-CONFIG_CRYPTO_SHA256=m
-CONFIG_CRYPTO_USER_API_HASH=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_SHA256=y
+CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_DYNAMIC_FTRACE=y
 CONFIG_FPROBE=y
 CONFIG_FTRACE_SYSCALLS=y
@@ -24,30 +24,30 @@ CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_RAW=y
 CONFIG_IP_NF_TARGET_SYNPROXY=y
 CONFIG_IPV6=y
-CONFIG_IPV6_FOU=m
-CONFIG_IPV6_FOU_TUNNEL=m
+CONFIG_IPV6_FOU=y
+CONFIG_IPV6_FOU_TUNNEL=y
 CONFIG_IPV6_GRE=y
 CONFIG_IPV6_SEG6_BPF=y
-CONFIG_IPV6_SIT=m
+CONFIG_IPV6_SIT=y
 CONFIG_IPV6_TUNNEL=y
 CONFIG_LIRC=y
 CONFIG_LWTUNNEL=y
 CONFIG_MPLS=y
-CONFIG_MPLS_IPTUNNEL=m
-CONFIG_MPLS_ROUTING=m
+CONFIG_MPLS_IPTUNNEL=y
+CONFIG_MPLS_ROUTING=y
 CONFIG_MPTCP=y
 CONFIG_NET_CLS_ACT=y
 CONFIG_NET_CLS_BPF=y
-CONFIG_NET_CLS_FLOWER=m
-CONFIG_NET_FOU=m
+CONFIG_NET_CLS_FLOWER=y
+CONFIG_NET_FOU=y
 CONFIG_NET_FOU_IP_TUNNELS=y
 CONFIG_NET_IPGRE=y
 CONFIG_NET_IPGRE_DEMUX=y
 CONFIG_NET_IPIP=y
-CONFIG_NET_MPLS_GSO=m
+CONFIG_NET_MPLS_GSO=y
 CONFIG_NET_SCH_INGRESS=y
 CONFIG_NET_SCHED=y
-CONFIG_NETDEVSIM=m
+CONFIG_NETDEVSIM=y
 CONFIG_NETFILTER=y
 CONFIG_NETFILTER_SYNPROXY=y
 CONFIG_NETFILTER_XT_CONNMARK=y
@@ -60,7 +60,7 @@ CONFIG_NF_DEFRAG_IPV6=y
 CONFIG_RC_CORE=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
-CONFIG_TEST_BPF=m
+CONFIG_TEST_BPF=y
 CONFIG_USERFAULTFD=y
 CONFIG_VXLAN=y
 CONFIG_XDP_SOCKETS=y
diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
index f0859a1d37ab..ce70c9509204 100644
--- a/tools/testing/selftests/bpf/config.x86_64
+++ b/tools/testing/selftests/bpf/config.x86_64
@@ -47,7 +47,7 @@ CONFIG_CPU_IDLE_GOV_LADDER=y
 CONFIG_CPUSETS=y
 CONFIG_CRC_T10DIF=y
 CONFIG_CRYPTO_BLAKE2B=y
-CONFIG_CRYPTO_DEV_VIRTIO=m
+CONFIG_CRYPTO_DEV_VIRTIO=y
 CONFIG_CRYPTO_SEQIV=y
 CONFIG_CRYPTO_XXHASH=y
 CONFIG_DCB=y
-- 
cgit v1.2.3


From 7c036ed9e0065e852fb1886d9ea97ceb35680e3f Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:48 +0200
Subject: selftests/bpf: Add verifier tests for bpf_lookup_*_key() and
 bpf_key_put()

Add verifier tests for bpf_lookup_*_key() and bpf_key_put(), to ensure that
acquired key references stored in the bpf_key structure are released, that
a non-NULL bpf_key pointer is passed to bpf_key_put(), and that key
references are not leaked.

Also, slightly modify test_verifier.c, to find the BTF ID of the attach
point for the LSM program type (currently, it is done only for TRACING).

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-11-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/config                 |   1 +
 tools/testing/selftests/bpf/test_verifier.c        |   3 +-
 .../testing/selftests/bpf/verifier/ref_tracking.c  | 139 +++++++++++++++++++++
 3 files changed, 142 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 0fdd11e6b742..add5a5a919b4 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -30,6 +30,7 @@ CONFIG_IPV6_GRE=y
 CONFIG_IPV6_SEG6_BPF=y
 CONFIG_IPV6_SIT=y
 CONFIG_IPV6_TUNNEL=y
+CONFIG_KEYS=y
 CONFIG_LIRC=y
 CONFIG_LWTUNNEL=y
 CONFIG_MPLS=y
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index f9d553fbf68a..2dbcbf363c18 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -1498,7 +1498,8 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 		opts.log_level = DEFAULT_LIBBPF_LOG_LEVEL;
 	opts.prog_flags = pflags;
 
-	if (prog_type == BPF_PROG_TYPE_TRACING && test->kfunc) {
+	if ((prog_type == BPF_PROG_TYPE_TRACING ||
+	     prog_type == BPF_PROG_TYPE_LSM) && test->kfunc) {
 		int attach_btf_id;
 
 		attach_btf_id = libbpf_find_vmlinux_btf_id(test->kfunc,
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
index 57a83d763ec1..f18ce867271f 100644
--- a/tools/testing/selftests/bpf/verifier/ref_tracking.c
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -84,6 +84,145 @@
 	.errstr = "Unreleased reference",
 	.result = REJECT,
 },
+{
+	"reference tracking: acquire/release user key reference",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, -3),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.fixup_kfunc_btf_id = {
+		{ "bpf_lookup_user_key", 2 },
+		{ "bpf_key_put", 5 },
+	},
+	.result = ACCEPT,
+},
+{
+	"reference tracking: acquire/release system key reference",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, 1),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.fixup_kfunc_btf_id = {
+		{ "bpf_lookup_system_key", 1 },
+		{ "bpf_key_put", 4 },
+	},
+	.result = ACCEPT,
+},
+{
+	"reference tracking: release user key reference without check",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, -3),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar",
+	.fixup_kfunc_btf_id = {
+		{ "bpf_lookup_user_key", 2 },
+		{ "bpf_key_put", 4 },
+	},
+	.result = REJECT,
+},
+{
+	"reference tracking: release system key reference without check",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, 1),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar",
+	.fixup_kfunc_btf_id = {
+		{ "bpf_lookup_system_key", 1 },
+		{ "bpf_key_put", 3 },
+	},
+	.result = REJECT,
+},
+{
+	"reference tracking: release with NULL key pointer",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_MOV64_IMM(BPF_REG_0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.errstr = "arg#0 pointer type STRUCT bpf_key must point to scalar, or struct with scalar",
+	.fixup_kfunc_btf_id = {
+		{ "bpf_key_put", 1 },
+	},
+	.result = REJECT,
+},
+{
+	"reference tracking: leak potential reference to user key",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, -3),
+	BPF_MOV64_IMM(BPF_REG_2, 0),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.errstr = "Unreleased reference",
+	.fixup_kfunc_btf_id = {
+		{ "bpf_lookup_user_key", 2 },
+	},
+	.result = REJECT,
+},
+{
+	"reference tracking: leak potential reference to system key",
+	.insns = {
+	BPF_MOV64_IMM(BPF_REG_1, 1),
+	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
+	BPF_EXIT_INSN(),
+	},
+	.prog_type = BPF_PROG_TYPE_LSM,
+	.kfunc = "bpf",
+	.expected_attach_type = BPF_LSM_MAC,
+	.flags = BPF_F_SLEEPABLE,
+	.errstr = "Unreleased reference",
+	.fixup_kfunc_btf_id = {
+		{ "bpf_lookup_system_key", 1 },
+	},
+	.result = REJECT,
+},
 {
 	"reference tracking: release reference without check",
 	.insns = {
-- 
cgit v1.2.3


From ecce368d6e6d76168be5d8d34b411c69ec367859 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:49 +0200
Subject: selftests/bpf: Add additional tests for bpf_lookup_*_key()

Add a test to ensure that bpf_lookup_user_key() creates a referenced
special keyring when the KEY_LOOKUP_CREATE flag is passed to this function.

Ensure that the kfunc rejects invalid flags.

Ensure that a keyring can be obtained from bpf_lookup_system_key() when one
of the pre-determined keyring IDs is provided.

The test is currently blacklisted for s390x (JIT does not support calling
kernel function).

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Link: https://lore.kernel.org/r/20220920075951.929132-12-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |   1 +
 .../testing/selftests/bpf/prog_tests/lookup_key.c  | 112 +++++++++++++++++++++
 .../testing/selftests/bpf/progs/test_lookup_key.c  |  46 +++++++++
 3 files changed, 159 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/lookup_key.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_lookup_key.c

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index 981c2be922f4..a6ac5dce7856 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -72,3 +72,4 @@ cgroup_hierarchical_stats                # JIT does not support calling kernel f
 htab_update                              # failed to attach: ERROR: strerror_r(-524)=22                                (trampoline)
 tracing_struct                           # failed to auto-attach: -524                                                 (trampoline)
 user_ringbuf                             # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3                (?)
+lookup_key                               # JIT does not support calling kernel function                                (kfunc)
diff --git a/tools/testing/selftests/bpf/prog_tests/lookup_key.c b/tools/testing/selftests/bpf/prog_tests/lookup_key.c
new file mode 100644
index 000000000000..68025e88f352
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/lookup_key.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH
+ *
+ * Author: Roberto Sassu <roberto.sassu@huawei.com>
+ */
+
+#include <linux/keyctl.h>
+#include <test_progs.h>
+
+#include "test_lookup_key.skel.h"
+
+#define KEY_LOOKUP_CREATE	0x01
+#define KEY_LOOKUP_PARTIAL	0x02
+
+static bool kfunc_not_supported;
+
+static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt,
+			   va_list args)
+{
+	char *func;
+
+	if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n"))
+		return 0;
+
+	func = va_arg(args, char *);
+
+	if (strcmp(func, "bpf_lookup_user_key") && strcmp(func, "bpf_key_put") &&
+	    strcmp(func, "bpf_lookup_system_key"))
+		return 0;
+
+	kfunc_not_supported = true;
+	return 0;
+}
+
+void test_lookup_key(void)
+{
+	libbpf_print_fn_t old_print_cb;
+	struct test_lookup_key *skel;
+	__u32 next_id;
+	int ret;
+
+	skel = test_lookup_key__open();
+	if (!ASSERT_OK_PTR(skel, "test_lookup_key__open"))
+		return;
+
+	old_print_cb = libbpf_set_print(libbpf_print_cb);
+	ret = test_lookup_key__load(skel);
+	libbpf_set_print(old_print_cb);
+
+	if (ret < 0 && kfunc_not_supported) {
+		printf("%s:SKIP:bpf_lookup_*_key(), bpf_key_put() kfuncs not supported\n",
+		       __func__);
+		test__skip();
+		goto close_prog;
+	}
+
+	if (!ASSERT_OK(ret, "test_lookup_key__load"))
+		goto close_prog;
+
+	ret = test_lookup_key__attach(skel);
+	if (!ASSERT_OK(ret, "test_lookup_key__attach"))
+		goto close_prog;
+
+	skel->bss->monitored_pid = getpid();
+	skel->bss->key_serial = KEY_SPEC_THREAD_KEYRING;
+
+	/* The thread-specific keyring does not exist, this test fails. */
+	skel->bss->flags = 0;
+
+	ret = bpf_prog_get_next_id(0, &next_id);
+	if (!ASSERT_LT(ret, 0, "bpf_prog_get_next_id"))
+		goto close_prog;
+
+	/* Force creation of the thread-specific keyring, this test succeeds. */
+	skel->bss->flags = KEY_LOOKUP_CREATE;
+
+	ret = bpf_prog_get_next_id(0, &next_id);
+	if (!ASSERT_OK(ret, "bpf_prog_get_next_id"))
+		goto close_prog;
+
+	/* Pass both lookup flags for parameter validation. */
+	skel->bss->flags = KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL;
+
+	ret = bpf_prog_get_next_id(0, &next_id);
+	if (!ASSERT_OK(ret, "bpf_prog_get_next_id"))
+		goto close_prog;
+
+	/* Pass invalid flags. */
+	skel->bss->flags = UINT64_MAX;
+
+	ret = bpf_prog_get_next_id(0, &next_id);
+	if (!ASSERT_LT(ret, 0, "bpf_prog_get_next_id"))
+		goto close_prog;
+
+	skel->bss->key_serial = 0;
+	skel->bss->key_id = 1;
+
+	ret = bpf_prog_get_next_id(0, &next_id);
+	if (!ASSERT_OK(ret, "bpf_prog_get_next_id"))
+		goto close_prog;
+
+	skel->bss->key_id = UINT32_MAX;
+
+	ret = bpf_prog_get_next_id(0, &next_id);
+	ASSERT_LT(ret, 0, "bpf_prog_get_next_id");
+
+close_prog:
+	skel->bss->monitored_pid = 0;
+	test_lookup_key__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_lookup_key.c b/tools/testing/selftests/bpf/progs/test_lookup_key.c
new file mode 100644
index 000000000000..c73776990ae3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lookup_key.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH
+ *
+ * Author: Roberto Sassu <roberto.sassu@huawei.com>
+ */
+
+#include "vmlinux.h"
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u32 monitored_pid;
+__u32 key_serial;
+__u32 key_id;
+__u64 flags;
+
+extern struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym;
+extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
+extern void bpf_key_put(struct bpf_key *key) __ksym;
+
+SEC("lsm.s/bpf")
+int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	struct bpf_key *bkey;
+	__u32 pid;
+
+	pid = bpf_get_current_pid_tgid() >> 32;
+	if (pid != monitored_pid)
+		return 0;
+
+	if (key_serial)
+		bkey = bpf_lookup_user_key(key_serial, flags);
+	else
+		bkey = bpf_lookup_system_key(key_id);
+
+	if (!bkey)
+		return -ENOENT;
+
+	bpf_key_put(bkey);
+
+	return 0;
+}
-- 
cgit v1.2.3


From fc97590668ae60b94ad8bc4d9e85958f10cb3567 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:50 +0200
Subject: selftests/bpf: Add test for bpf_verify_pkcs7_signature() kfunc

Perform several tests to ensure the correct implementation of the
bpf_verify_pkcs7_signature() kfunc.

Do the tests with data signed with a generated testing key (by using
sign-file from scripts/) and with the tcp_bic.ko kernel module if it is
found in the system. The test does not fail if tcp_bic.ko is not found.

First, perform an unsuccessful signature verification without data.

Second, perform a successful signature verification with the session
keyring and a new one created for testing.

Then, ensure that permission and validation checks are done properly on the
keyring provided to bpf_verify_pkcs7_signature(), despite those checks were
deferred at the time the keyring was retrieved with bpf_lookup_user_key().
The tests expect to encounter an error if the Search permission is removed
from the keyring, or the keyring is expired.

Finally, perform a successful and unsuccessful signature verification with
the keyrings with pre-determined IDs (the last test fails because the key
is not in the platform keyring).

The test is currently in the deny list for s390x (JIT does not support
calling kernel function).

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Link: https://lore.kernel.org/r/20220920075951.929132-13-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |   1 +
 tools/testing/selftests/bpf/Makefile               |  14 +-
 tools/testing/selftests/bpf/config                 |   5 +
 tools/testing/selftests/bpf/config.x86_64          |   5 -
 .../selftests/bpf/prog_tests/verify_pkcs7_sig.c    | 399 +++++++++++++++++++++
 .../selftests/bpf/progs/test_verify_pkcs7_sig.c    |  90 +++++
 tools/testing/selftests/bpf/verify_sig_setup.sh    | 104 ++++++
 7 files changed, 610 insertions(+), 8 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c
 create mode 100755 tools/testing/selftests/bpf/verify_sig_setup.sh

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index a6ac5dce7856..a992fd978c1e 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -73,3 +73,4 @@ htab_update                              # failed to attach: ERROR: strerror_r(-
 tracing_struct                           # failed to auto-attach: -524                                                 (trampoline)
 user_ringbuf                             # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3                (?)
 lookup_key                               # JIT does not support calling kernel function                                (kfunc)
+verify_pkcs7_sig                         # JIT does not support calling kernel function                                (kfunc)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 1a0296bd744a..5898d3828b82 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -14,6 +14,7 @@ BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
 APIDIR := $(TOOLSINCDIR)/uapi
 GENDIR := $(abspath ../../../../include/generated)
 GENHDR := $(GENDIR)/autoconf.h
+HOSTPKG_CONFIG := pkg-config
 
 ifneq ($(wildcard $(GENHDR)),)
   GENFLAGS := -DHAVE_GENHDR
@@ -75,7 +76,7 @@ TEST_PROGS := test_kmod.sh \
 	test_xsk.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
-	with_tunnels.sh ima_setup.sh \
+	with_tunnels.sh ima_setup.sh verify_sig_setup.sh \
 	test_xdp_vlan.sh test_bpftool.py
 
 # Compile but not part of 'make run_tests'
@@ -84,7 +85,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
 	xskxceiver xdp_redirect_multi xdp_synproxy veristat
 
-TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read
+TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file
 
 # Emit succinct information message describing current building step
 # $1 - generic step name (e.g., CC, LINK, etc);
@@ -189,6 +190,12 @@ $(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_r
 		     -fuse-ld=$(LLD) -Wl,-znoseparate-code		       \
 		     -Wl,-rpath=. -Wl,--build-id=sha1 -o $@
 
+$(OUTPUT)/sign-file: ../../../../scripts/sign-file.c
+	$(call msg,SIGN-FILE,,$@)
+	$(Q)$(CC) $(shell $(HOSTPKG_CONFIG)--cflags libcrypto 2> /dev/null) \
+		  $< -o $@ \
+		  $(shell $(HOSTPKG_CONFIG) --libs libcrypto 2> /dev/null || echo -lcrypto)
+
 $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch])
 	$(call msg,MOD,,$@)
 	$(Q)$(RM) bpf_testmod/bpf_testmod.ko # force re-compilation
@@ -516,7 +523,8 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c	\
 TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko	\
 		       $(OUTPUT)/liburandom_read.so			\
 		       $(OUTPUT)/xdp_synproxy				\
-		       ima_setup.sh					\
+		       $(OUTPUT)/sign-file				\
+		       ima_setup.sh verify_sig_setup.sh			\
 		       $(wildcard progs/btf_dump_test_case_*.c)
 TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE
 TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS) -DENABLE_ATOMICS_TESTS
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index add5a5a919b4..905a9be8d0a2 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -33,6 +33,11 @@ CONFIG_IPV6_TUNNEL=y
 CONFIG_KEYS=y
 CONFIG_LIRC=y
 CONFIG_LWTUNNEL=y
+CONFIG_MODULE_SIG=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULES=y
+CONFIG_MODVERSIONS=y
 CONFIG_MPLS=y
 CONFIG_MPLS_IPTUNNEL=y
 CONFIG_MPLS_ROUTING=y
diff --git a/tools/testing/selftests/bpf/config.x86_64 b/tools/testing/selftests/bpf/config.x86_64
index ce70c9509204..21ce5ea4304e 100644
--- a/tools/testing/selftests/bpf/config.x86_64
+++ b/tools/testing/selftests/bpf/config.x86_64
@@ -145,11 +145,6 @@ CONFIG_MCORE2=y
 CONFIG_MEMCG=y
 CONFIG_MEMORY_FAILURE=y
 CONFIG_MINIX_SUBPARTITION=y
-CONFIG_MODULE_SIG=y
-CONFIG_MODULE_SRCVERSION_ALL=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODULES=y
-CONFIG_MODVERSIONS=y
 CONFIG_NAMESPACES=y
 CONFIG_NET=y
 CONFIG_NET_9P=y
diff --git a/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c b/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c
new file mode 100644
index 000000000000..579d6ee83ce0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/verify_pkcs7_sig.c
@@ -0,0 +1,399 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH
+ *
+ * Author: Roberto Sassu <roberto.sassu@huawei.com>
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <endian.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <linux/keyctl.h>
+#include <test_progs.h>
+
+#include "test_verify_pkcs7_sig.skel.h"
+
+#define MAX_DATA_SIZE (1024 * 1024)
+#define MAX_SIG_SIZE 1024
+
+#define VERIFY_USE_SECONDARY_KEYRING (1UL)
+#define VERIFY_USE_PLATFORM_KEYRING  (2UL)
+
+/* In stripped ARM and x86-64 modules, ~ is surprisingly rare. */
+#define MODULE_SIG_STRING "~Module signature appended~\n"
+
+/*
+ * Module signature information block.
+ *
+ * The constituents of the signature section are, in order:
+ *
+ *	- Signer's name
+ *	- Key identifier
+ *	- Signature data
+ *	- Information block
+ */
+struct module_signature {
+	__u8	algo;		/* Public-key crypto algorithm [0] */
+	__u8	hash;		/* Digest algorithm [0] */
+	__u8	id_type;	/* Key identifier type [PKEY_ID_PKCS7] */
+	__u8	signer_len;	/* Length of signer's name [0] */
+	__u8	key_id_len;	/* Length of key identifier [0] */
+	__u8	__pad[3];
+	__be32	sig_len;	/* Length of signature data */
+};
+
+struct data {
+	__u8 data[MAX_DATA_SIZE];
+	__u32 data_len;
+	__u8 sig[MAX_SIG_SIZE];
+	__u32 sig_len;
+};
+
+static bool kfunc_not_supported;
+
+static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt,
+			   va_list args)
+{
+	if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n"))
+		return 0;
+
+	if (strcmp(va_arg(args, char *), "bpf_verify_pkcs7_signature"))
+		return 0;
+
+	kfunc_not_supported = true;
+	return 0;
+}
+
+static int _run_setup_process(const char *setup_dir, const char *cmd)
+{
+	int child_pid, child_status;
+
+	child_pid = fork();
+	if (child_pid == 0) {
+		execlp("./verify_sig_setup.sh", "./verify_sig_setup.sh", cmd,
+		       setup_dir, NULL);
+		exit(errno);
+
+	} else if (child_pid > 0) {
+		waitpid(child_pid, &child_status, 0);
+		return WEXITSTATUS(child_status);
+	}
+
+	return -EINVAL;
+}
+
+static int populate_data_item_str(const char *tmp_dir, struct data *data_item)
+{
+	struct stat st;
+	char data_template[] = "/tmp/dataXXXXXX";
+	char path[PATH_MAX];
+	int ret, fd, child_status, child_pid;
+
+	data_item->data_len = 4;
+	memcpy(data_item->data, "test", data_item->data_len);
+
+	fd = mkstemp(data_template);
+	if (fd == -1)
+		return -errno;
+
+	ret = write(fd, data_item->data, data_item->data_len);
+
+	close(fd);
+
+	if (ret != data_item->data_len) {
+		ret = -EIO;
+		goto out;
+	}
+
+	child_pid = fork();
+
+	if (child_pid == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (child_pid == 0) {
+		snprintf(path, sizeof(path), "%s/signing_key.pem", tmp_dir);
+
+		return execlp("./sign-file", "./sign-file", "-d", "sha256",
+			      path, path, data_template, NULL);
+	}
+
+	waitpid(child_pid, &child_status, 0);
+
+	ret = WEXITSTATUS(child_status);
+	if (ret)
+		goto out;
+
+	snprintf(path, sizeof(path), "%s.p7s", data_template);
+
+	ret = stat(path, &st);
+	if (ret == -1) {
+		ret = -errno;
+		goto out;
+	}
+
+	if (st.st_size > sizeof(data_item->sig)) {
+		ret = -EINVAL;
+		goto out_sig;
+	}
+
+	data_item->sig_len = st.st_size;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1) {
+		ret = -errno;
+		goto out_sig;
+	}
+
+	ret = read(fd, data_item->sig, data_item->sig_len);
+
+	close(fd);
+
+	if (ret != data_item->sig_len) {
+		ret = -EIO;
+		goto out_sig;
+	}
+
+	ret = 0;
+out_sig:
+	unlink(path);
+out:
+	unlink(data_template);
+	return ret;
+}
+
+static int populate_data_item_mod(struct data *data_item)
+{
+	char mod_path[PATH_MAX], *mod_path_ptr;
+	struct stat st;
+	void *mod;
+	FILE *fp;
+	struct module_signature ms;
+	int ret, fd, modlen, marker_len, sig_len;
+
+	data_item->data_len = 0;
+
+	if (stat("/lib/modules", &st) == -1)
+		return 0;
+
+	/* Requires CONFIG_TCP_CONG_BIC=m. */
+	fp = popen("find /lib/modules/$(uname -r) -name tcp_bic.ko", "r");
+	if (!fp)
+		return 0;
+
+	mod_path_ptr = fgets(mod_path, sizeof(mod_path), fp);
+	pclose(fp);
+
+	if (!mod_path_ptr)
+		return 0;
+
+	mod_path_ptr = strchr(mod_path, '\n');
+	if (!mod_path_ptr)
+		return 0;
+
+	*mod_path_ptr = '\0';
+
+	if (stat(mod_path, &st) == -1)
+		return 0;
+
+	modlen = st.st_size;
+	marker_len = sizeof(MODULE_SIG_STRING) - 1;
+
+	fd = open(mod_path, O_RDONLY);
+	if (fd == -1)
+		return -errno;
+
+	mod = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+
+	close(fd);
+
+	if (mod == MAP_FAILED)
+		return -errno;
+
+	if (strncmp(mod + modlen - marker_len, MODULE_SIG_STRING, marker_len)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	modlen -= marker_len;
+
+	memcpy(&ms, mod + (modlen - sizeof(ms)), sizeof(ms));
+
+	sig_len = __be32_to_cpu(ms.sig_len);
+	modlen -= sig_len + sizeof(ms);
+
+	if (modlen > sizeof(data_item->data)) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	memcpy(data_item->data, mod, modlen);
+	data_item->data_len = modlen;
+
+	if (sig_len > sizeof(data_item->sig)) {
+		ret = -E2BIG;
+		goto out;
+	}
+
+	memcpy(data_item->sig, mod + modlen, sig_len);
+	data_item->sig_len = sig_len;
+	ret = 0;
+out:
+	munmap(mod, st.st_size);
+	return ret;
+}
+
+void test_verify_pkcs7_sig(void)
+{
+	libbpf_print_fn_t old_print_cb;
+	char tmp_dir_template[] = "/tmp/verify_sigXXXXXX";
+	char *tmp_dir;
+	struct test_verify_pkcs7_sig *skel = NULL;
+	struct bpf_map *map;
+	struct data data;
+	int ret, zero = 0;
+
+	/* Trigger creation of session keyring. */
+	syscall(__NR_request_key, "keyring", "_uid.0", NULL,
+		KEY_SPEC_SESSION_KEYRING);
+
+	tmp_dir = mkdtemp(tmp_dir_template);
+	if (!ASSERT_OK_PTR(tmp_dir, "mkdtemp"))
+		return;
+
+	ret = _run_setup_process(tmp_dir, "setup");
+	if (!ASSERT_OK(ret, "_run_setup_process"))
+		goto close_prog;
+
+	skel = test_verify_pkcs7_sig__open();
+	if (!ASSERT_OK_PTR(skel, "test_verify_pkcs7_sig__open"))
+		goto close_prog;
+
+	old_print_cb = libbpf_set_print(libbpf_print_cb);
+	ret = test_verify_pkcs7_sig__load(skel);
+	libbpf_set_print(old_print_cb);
+
+	if (ret < 0 && kfunc_not_supported) {
+		printf(
+		  "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n",
+		  __func__);
+		test__skip();
+		goto close_prog;
+	}
+
+	if (!ASSERT_OK(ret, "test_verify_pkcs7_sig__load"))
+		goto close_prog;
+
+	ret = test_verify_pkcs7_sig__attach(skel);
+	if (!ASSERT_OK(ret, "test_verify_pkcs7_sig__attach"))
+		goto close_prog;
+
+	map = bpf_object__find_map_by_name(skel->obj, "data_input");
+	if (!ASSERT_OK_PTR(map, "data_input not found"))
+		goto close_prog;
+
+	skel->bss->monitored_pid = getpid();
+
+	/* Test without data and signature. */
+	skel->bss->user_keyring_serial = KEY_SPEC_SESSION_KEYRING;
+
+	ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY);
+	if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input"))
+		goto close_prog;
+
+	/* Test successful signature verification with session keyring. */
+	ret = populate_data_item_str(tmp_dir, &data);
+	if (!ASSERT_OK(ret, "populate_data_item_str"))
+		goto close_prog;
+
+	ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY);
+	if (!ASSERT_OK(ret, "bpf_map_update_elem data_input"))
+		goto close_prog;
+
+	/* Test successful signature verification with testing keyring. */
+	skel->bss->user_keyring_serial = syscall(__NR_request_key, "keyring",
+						 "ebpf_testing_keyring", NULL,
+						 KEY_SPEC_SESSION_KEYRING);
+
+	ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY);
+	if (!ASSERT_OK(ret, "bpf_map_update_elem data_input"))
+		goto close_prog;
+
+	/*
+	 * Ensure key_task_permission() is called and rejects the keyring
+	 * (no Search permission).
+	 */
+	syscall(__NR_keyctl, KEYCTL_SETPERM, skel->bss->user_keyring_serial,
+		0x37373737);
+
+	ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY);
+	if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input"))
+		goto close_prog;
+
+	syscall(__NR_keyctl, KEYCTL_SETPERM, skel->bss->user_keyring_serial,
+		0x3f3f3f3f);
+
+	/*
+	 * Ensure key_validate() is called and rejects the keyring (key expired)
+	 */
+	syscall(__NR_keyctl, KEYCTL_SET_TIMEOUT,
+		skel->bss->user_keyring_serial, 1);
+	sleep(1);
+
+	ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY);
+	if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input"))
+		goto close_prog;
+
+	skel->bss->user_keyring_serial = KEY_SPEC_SESSION_KEYRING;
+
+	/* Test with corrupted data (signature verification should fail). */
+	data.data[0] = 'a';
+	ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data, BPF_ANY);
+	if (!ASSERT_LT(ret, 0, "bpf_map_update_elem data_input"))
+		goto close_prog;
+
+	ret = populate_data_item_mod(&data);
+	if (!ASSERT_OK(ret, "populate_data_item_mod"))
+		goto close_prog;
+
+	/* Test signature verification with system keyrings. */
+	if (data.data_len) {
+		skel->bss->user_keyring_serial = 0;
+		skel->bss->system_keyring_id = 0;
+
+		ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data,
+					  BPF_ANY);
+		if (!ASSERT_OK(ret, "bpf_map_update_elem data_input"))
+			goto close_prog;
+
+		skel->bss->system_keyring_id = VERIFY_USE_SECONDARY_KEYRING;
+
+		ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data,
+					  BPF_ANY);
+		if (!ASSERT_OK(ret, "bpf_map_update_elem data_input"))
+			goto close_prog;
+
+		skel->bss->system_keyring_id = VERIFY_USE_PLATFORM_KEYRING;
+
+		ret = bpf_map_update_elem(bpf_map__fd(map), &zero, &data,
+					  BPF_ANY);
+		ASSERT_LT(ret, 0, "bpf_map_update_elem data_input");
+	}
+
+close_prog:
+	_run_setup_process(tmp_dir, "cleanup");
+
+	if (!skel)
+		return;
+
+	skel->bss->monitored_pid = 0;
+	test_verify_pkcs7_sig__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c
new file mode 100644
index 000000000000..ce419304ff1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH
+ *
+ * Author: Roberto Sassu <roberto.sassu@huawei.com>
+ */
+
+#include "vmlinux.h"
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_DATA_SIZE (1024 * 1024)
+#define MAX_SIG_SIZE 1024
+
+extern struct bpf_key *bpf_lookup_user_key(__u32 serial, __u64 flags) __ksym;
+extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
+extern void bpf_key_put(struct bpf_key *key) __ksym;
+extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
+				      struct bpf_dynptr *sig_ptr,
+				      struct bpf_key *trusted_keyring) __ksym;
+
+__u32 monitored_pid;
+__u32 user_keyring_serial;
+__u64 system_keyring_id;
+
+struct data {
+	__u8 data[MAX_DATA_SIZE];
+	__u32 data_len;
+	__u8 sig[MAX_SIG_SIZE];
+	__u32 sig_len;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, struct data);
+} data_input SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+SEC("lsm.s/bpf")
+int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	struct bpf_dynptr data_ptr, sig_ptr;
+	struct data *data_val;
+	struct bpf_key *trusted_keyring;
+	__u32 pid;
+	__u64 value;
+	int ret, zero = 0;
+
+	pid = bpf_get_current_pid_tgid() >> 32;
+	if (pid != monitored_pid)
+		return 0;
+
+	data_val = bpf_map_lookup_elem(&data_input, &zero);
+	if (!data_val)
+		return 0;
+
+	bpf_probe_read(&value, sizeof(value), &attr->value);
+
+	bpf_copy_from_user(data_val, sizeof(struct data),
+			   (void *)(unsigned long)value);
+
+	if (data_val->data_len > sizeof(data_val->data))
+		return -EINVAL;
+
+	bpf_dynptr_from_mem(data_val->data, data_val->data_len, 0, &data_ptr);
+
+	if (data_val->sig_len > sizeof(data_val->sig))
+		return -EINVAL;
+
+	bpf_dynptr_from_mem(data_val->sig, data_val->sig_len, 0, &sig_ptr);
+
+	if (user_keyring_serial)
+		trusted_keyring = bpf_lookup_user_key(user_keyring_serial, 0);
+	else
+		trusted_keyring = bpf_lookup_system_key(system_keyring_id);
+
+	if (!trusted_keyring)
+		return -ENOENT;
+
+	ret = bpf_verify_pkcs7_signature(&data_ptr, &sig_ptr, trusted_keyring);
+
+	bpf_key_put(trusted_keyring);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/verify_sig_setup.sh b/tools/testing/selftests/bpf/verify_sig_setup.sh
new file mode 100755
index 000000000000..ba08922b4a27
--- /dev/null
+++ b/tools/testing/selftests/bpf/verify_sig_setup.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+set -u
+set -o pipefail
+
+VERBOSE="${SELFTESTS_VERBOSE:=0}"
+LOG_FILE="$(mktemp /tmp/verify_sig_setup.log.XXXXXX)"
+
+x509_genkey_content="\
+[ req ]
+default_bits = 2048
+distinguished_name = req_distinguished_name
+prompt = no
+string_mask = utf8only
+x509_extensions = myexts
+
+[ req_distinguished_name ]
+CN = eBPF Signature Verification Testing Key
+
+[ myexts ]
+basicConstraints=critical,CA:FALSE
+keyUsage=digitalSignature
+subjectKeyIdentifier=hash
+authorityKeyIdentifier=keyid
+"
+
+usage()
+{
+	echo "Usage: $0 <setup|cleanup <existing_tmp_dir>"
+	exit 1
+}
+
+setup()
+{
+	local tmp_dir="$1"
+
+	echo "${x509_genkey_content}" > ${tmp_dir}/x509.genkey
+
+	openssl req -new -nodes -utf8 -sha256 -days 36500 \
+			-batch -x509 -config ${tmp_dir}/x509.genkey \
+			-outform PEM -out ${tmp_dir}/signing_key.pem \
+			-keyout ${tmp_dir}/signing_key.pem 2>&1
+
+	openssl x509 -in ${tmp_dir}/signing_key.pem -out \
+		${tmp_dir}/signing_key.der -outform der
+
+	key_id=$(cat ${tmp_dir}/signing_key.der | keyctl padd asymmetric ebpf_testing_key @s)
+
+	keyring_id=$(keyctl newring ebpf_testing_keyring @s)
+	keyctl link $key_id $keyring_id
+}
+
+cleanup() {
+	local tmp_dir="$1"
+
+	keyctl unlink $(keyctl search @s asymmetric ebpf_testing_key) @s
+	keyctl unlink $(keyctl search @s keyring ebpf_testing_keyring) @s
+	rm -rf ${tmp_dir}
+}
+
+catch()
+{
+	local exit_code="$1"
+	local log_file="$2"
+
+	if [[ "${exit_code}" -ne 0 ]]; then
+		cat "${log_file}" >&3
+	fi
+
+	rm -f "${log_file}"
+	exit ${exit_code}
+}
+
+main()
+{
+	[[ $# -ne 2 ]] && usage
+
+	local action="$1"
+	local tmp_dir="$2"
+
+	[[ ! -d "${tmp_dir}" ]] && echo "Directory ${tmp_dir} doesn't exist" && exit 1
+
+	if [[ "${action}" == "setup" ]]; then
+		setup "${tmp_dir}"
+	elif [[ "${action}" == "cleanup" ]]; then
+		cleanup "${tmp_dir}"
+	else
+		echo "Unknown action: ${action}"
+		exit 1
+	fi
+}
+
+trap 'catch "$?" "${LOG_FILE}"' EXIT
+
+if [[ "${VERBOSE}" -eq 0 ]]; then
+	# Save the stderr to 3 so that we can output back to
+	# it incase of an error.
+	exec 3>&2 1>"${LOG_FILE}" 2>&1
+fi
+
+main "$@"
+rm -f "${LOG_FILE}"
-- 
cgit v1.2.3


From b94fa9f9dcf99730eabd8febc4c95e44342bfb59 Mon Sep 17 00:00:00 2001
From: Roberto Sassu <roberto.sassu@huawei.com>
Date: Tue, 20 Sep 2022 09:59:51 +0200
Subject: selftests/bpf: Add tests for dynamic pointers parameters in kfuncs

Add tests to ensure that only supported dynamic pointer types are accepted,
that the passed argument is actually a dynamic pointer, that the passed
argument is a pointer to the stack, and that bpf_verify_pkcs7_signature()
correctly handles dynamic pointers with data set to NULL.

The tests are currently in the deny list for s390x (JIT does not support
calling kernel function).

Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20220920075951.929132-14-roberto.sassu@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/DENYLIST.s390x         |   1 +
 .../selftests/bpf/prog_tests/kfunc_dynptr_param.c  | 164 +++++++++++++++++++++
 .../selftests/bpf/progs/test_kfunc_dynptr_param.c  |  94 ++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c

diff --git a/tools/testing/selftests/bpf/DENYLIST.s390x b/tools/testing/selftests/bpf/DENYLIST.s390x
index a992fd978c1e..17e074eb42b8 100644
--- a/tools/testing/selftests/bpf/DENYLIST.s390x
+++ b/tools/testing/selftests/bpf/DENYLIST.s390x
@@ -74,3 +74,4 @@ tracing_struct                           # failed to auto-attach: -524
 user_ringbuf                             # failed to find kernel BTF type ID of '__s390x_sys_prctl': -3                (?)
 lookup_key                               # JIT does not support calling kernel function                                (kfunc)
 verify_pkcs7_sig                         # JIT does not support calling kernel function                                (kfunc)
+kfunc_dynptr_param                       # JIT does not support calling kernel function                                (kfunc)
diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
new file mode 100644
index 000000000000..c210657d4d0a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfunc_dynptr_param.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2022 Facebook
+ * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH
+ *
+ * Author: Roberto Sassu <roberto.sassu@huawei.com>
+ */
+
+#include <test_progs.h>
+#include "test_kfunc_dynptr_param.skel.h"
+
+static size_t log_buf_sz = 1048576; /* 1 MB */
+static char obj_log_buf[1048576];
+
+static struct {
+	const char *prog_name;
+	const char *expected_verifier_err_msg;
+	int expected_runtime_err;
+} kfunc_dynptr_tests[] = {
+	{"dynptr_type_not_supp",
+	 "arg#0 pointer type STRUCT bpf_dynptr_kern points to unsupported dynamic pointer type", 0},
+	{"not_valid_dynptr",
+	 "arg#0 pointer type STRUCT bpf_dynptr_kern must be valid and initialized", 0},
+	{"not_ptr_to_stack", "arg#0 pointer type STRUCT bpf_dynptr_kern not to stack", 0},
+	{"dynptr_data_null", NULL, -EBADMSG},
+};
+
+static bool kfunc_not_supported;
+
+static int libbpf_print_cb(enum libbpf_print_level level, const char *fmt,
+			   va_list args)
+{
+	if (strcmp(fmt, "libbpf: extern (func ksym) '%s': not found in kernel or module BTFs\n"))
+		return 0;
+
+	if (strcmp(va_arg(args, char *), "bpf_verify_pkcs7_signature"))
+		return 0;
+
+	kfunc_not_supported = true;
+	return 0;
+}
+
+static void verify_fail(const char *prog_name, const char *expected_err_msg)
+{
+	struct test_kfunc_dynptr_param *skel;
+	LIBBPF_OPTS(bpf_object_open_opts, opts);
+	libbpf_print_fn_t old_print_cb;
+	struct bpf_program *prog;
+	int err;
+
+	opts.kernel_log_buf = obj_log_buf;
+	opts.kernel_log_size = log_buf_sz;
+	opts.kernel_log_level = 1;
+
+	skel = test_kfunc_dynptr_param__open_opts(&opts);
+	if (!ASSERT_OK_PTR(skel, "test_kfunc_dynptr_param__open_opts"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	bpf_program__set_autoload(prog, true);
+
+	bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
+
+	kfunc_not_supported = false;
+
+	old_print_cb = libbpf_set_print(libbpf_print_cb);
+	err = test_kfunc_dynptr_param__load(skel);
+	libbpf_set_print(old_print_cb);
+
+	if (err < 0 && kfunc_not_supported) {
+		fprintf(stderr,
+		  "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n",
+		  __func__);
+		test__skip();
+		goto cleanup;
+	}
+
+	if (!ASSERT_ERR(err, "unexpected load success"))
+		goto cleanup;
+
+	if (!ASSERT_OK_PTR(strstr(obj_log_buf, expected_err_msg), "expected_err_msg")) {
+		fprintf(stderr, "Expected err_msg: %s\n", expected_err_msg);
+		fprintf(stderr, "Verifier output: %s\n", obj_log_buf);
+	}
+
+cleanup:
+	test_kfunc_dynptr_param__destroy(skel);
+}
+
+static void verify_success(const char *prog_name, int expected_runtime_err)
+{
+	struct test_kfunc_dynptr_param *skel;
+	libbpf_print_fn_t old_print_cb;
+	struct bpf_program *prog;
+	struct bpf_link *link;
+	__u32 next_id;
+	int err;
+
+	skel = test_kfunc_dynptr_param__open();
+	if (!ASSERT_OK_PTR(skel, "test_kfunc_dynptr_param__open"))
+		return;
+
+	skel->bss->pid = getpid();
+
+	bpf_map__set_max_entries(skel->maps.ringbuf, getpagesize());
+
+	kfunc_not_supported = false;
+
+	old_print_cb = libbpf_set_print(libbpf_print_cb);
+	err = test_kfunc_dynptr_param__load(skel);
+	libbpf_set_print(old_print_cb);
+
+	if (err < 0 && kfunc_not_supported) {
+		fprintf(stderr,
+		  "%s:SKIP:bpf_verify_pkcs7_signature() kfunc not supported\n",
+		  __func__);
+		test__skip();
+		goto cleanup;
+	}
+
+	if (!ASSERT_OK(err, "test_kfunc_dynptr_param__load"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, prog_name);
+	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
+		goto cleanup;
+
+	link = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+		goto cleanup;
+
+	err = bpf_prog_get_next_id(0, &next_id);
+
+	bpf_link__destroy(link);
+
+	if (!ASSERT_OK(err, "bpf_prog_get_next_id"))
+		goto cleanup;
+
+	ASSERT_EQ(skel->bss->err, expected_runtime_err, "err");
+
+cleanup:
+	test_kfunc_dynptr_param__destroy(skel);
+}
+
+void test_kfunc_dynptr_param(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(kfunc_dynptr_tests); i++) {
+		if (!test__start_subtest(kfunc_dynptr_tests[i].prog_name))
+			continue;
+
+		if (kfunc_dynptr_tests[i].expected_verifier_err_msg)
+			verify_fail(kfunc_dynptr_tests[i].prog_name,
+			  kfunc_dynptr_tests[i].expected_verifier_err_msg);
+		else
+			verify_success(kfunc_dynptr_tests[i].prog_name,
+				kfunc_dynptr_tests[i].expected_runtime_err);
+	}
+}
diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
new file mode 100644
index 000000000000..ce39d096bba3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2022 Huawei Technologies Duesseldorf GmbH
+ *
+ * Author: Roberto Sassu <roberto.sassu@huawei.com>
+ */
+
+#include "vmlinux.h"
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym;
+extern void bpf_key_put(struct bpf_key *key) __ksym;
+extern int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_ptr,
+				      struct bpf_dynptr *sig_ptr,
+				      struct bpf_key *trusted_keyring) __ksym;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_RINGBUF);
+} ringbuf SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, __u32);
+} array_map SEC(".maps");
+
+int err, pid;
+
+char _license[] SEC("license") = "GPL";
+
+SEC("?lsm.s/bpf")
+int BPF_PROG(dynptr_type_not_supp, int cmd, union bpf_attr *attr,
+	     unsigned int size)
+{
+	char write_data[64] = "hello there, world!!";
+	struct bpf_dynptr ptr;
+
+	bpf_ringbuf_reserve_dynptr(&ringbuf, sizeof(write_data), 0, &ptr);
+
+	return bpf_verify_pkcs7_signature(&ptr, &ptr, NULL);
+}
+
+SEC("?lsm.s/bpf")
+int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	unsigned long val;
+
+	return bpf_verify_pkcs7_signature((struct bpf_dynptr *)&val,
+					  (struct bpf_dynptr *)&val, NULL);
+}
+
+SEC("?lsm.s/bpf")
+int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	unsigned long val;
+
+	return bpf_verify_pkcs7_signature((struct bpf_dynptr *)val,
+					  (struct bpf_dynptr *)val, NULL);
+}
+
+SEC("lsm.s/bpf")
+int BPF_PROG(dynptr_data_null, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	struct bpf_key *trusted_keyring;
+	struct bpf_dynptr ptr;
+	__u32 *value;
+	int ret, zero = 0;
+
+	if (bpf_get_current_pid_tgid() >> 32 != pid)
+		return 0;
+
+	value = bpf_map_lookup_elem(&array_map, &zero);
+	if (!value)
+		return 0;
+
+	/* Pass invalid flags. */
+	ret = bpf_dynptr_from_mem(value, sizeof(*value), ((__u64)~0ULL), &ptr);
+	if (ret != -EINVAL)
+		return 0;
+
+	trusted_keyring = bpf_lookup_system_key(0);
+	if (!trusted_keyring)
+		return 0;
+
+	err = bpf_verify_pkcs7_signature(&ptr, &ptr, trusted_keyring);
+
+	bpf_key_put(trusted_keyring);
+
+	return 0;
+}
-- 
cgit v1.2.3


From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Fri, 16 Sep 2022 09:19:14 +0200
Subject: bpf: Prevent bpf program recursion for raw tracepoint probes

We got report from sysbot [1] about warnings that were caused by
bpf program attached to contention_begin raw tracepoint triggering
the same tracepoint by using bpf_trace_printk helper that takes
trace_printk_lock lock.

 Call Trace:
  <TASK>
  ? trace_event_raw_event_bpf_trace_printk+0x5f/0x90
  bpf_trace_printk+0x2b/0xe0
  bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
  bpf_trace_run2+0x26/0x90
  native_queued_spin_lock_slowpath+0x1c6/0x2b0
  _raw_spin_lock_irqsave+0x44/0x50
  bpf_trace_printk+0x3f/0xe0
  bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
  bpf_trace_run2+0x26/0x90
  native_queued_spin_lock_slowpath+0x1c6/0x2b0
  _raw_spin_lock_irqsave+0x44/0x50
  bpf_trace_printk+0x3f/0xe0
  bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
  bpf_trace_run2+0x26/0x90
  native_queued_spin_lock_slowpath+0x1c6/0x2b0
  _raw_spin_lock_irqsave+0x44/0x50
  bpf_trace_printk+0x3f/0xe0
  bpf_prog_a9aec6167c091eef_prog+0x1f/0x24
  bpf_trace_run2+0x26/0x90
  native_queued_spin_lock_slowpath+0x1c6/0x2b0
  _raw_spin_lock_irqsave+0x44/0x50
  __unfreeze_partials+0x5b/0x160
  ...

The can be reproduced by attaching bpf program as raw tracepoint on
contention_begin tracepoint. The bpf prog calls bpf_trace_printk
helper. Then by running perf bench the spin lock code is forced to
take slow path and call contention_begin tracepoint.

Fixing this by skipping execution of the bpf program if it's
already running, Using bpf prog 'active' field, which is being
currently used by trampoline programs for the same reason.

Moving bpf_prog_inc_misses_counter to syscall.c because
trampoline.c is compiled in just for CONFIG_BPF_JIT option.

Reviewed-by: Stanislav Fomichev <sdf@google.com>
Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com
[1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      |  6 ++++++
 kernel/bpf/syscall.c     | 11 +++++++++++
 kernel/bpf/trampoline.c  | 15 ++-------------
 kernel/trace/bpf_trace.c |  6 ++++++
 4 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a1435b019aca..edd43edb27d6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void)
 {
 	return !!current->bpf_ctx;
 }
+
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog);
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void)
 {
 	return false;
 }
+
+static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dab156f09f8d..372fad5ef3d3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2093,6 +2093,17 @@ struct bpf_prog_kstats {
 	u64 misses;
 };
 
+void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
+{
+	struct bpf_prog_stats *stats;
+	unsigned int flags;
+
+	stats = this_cpu_ptr(prog->stats);
+	flags = u64_stats_update_begin_irqsave(&stats->syncp);
+	u64_stats_inc(&stats->misses);
+	u64_stats_update_end_irqrestore(&stats->syncp, flags);
+}
+
 static void bpf_prog_get_stats(const struct bpf_prog *prog,
 			       struct bpf_prog_kstats *stats)
 {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index ad76940b02cc..41b67eb83ab3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -863,17 +863,6 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
 	return start;
 }
 
-static void notrace inc_misses_counter(struct bpf_prog *prog)
-{
-	struct bpf_prog_stats *stats;
-	unsigned int flags;
-
-	stats = this_cpu_ptr(prog->stats);
-	flags = u64_stats_update_begin_irqsave(&stats->syncp);
-	u64_stats_inc(&stats->misses);
-	u64_stats_update_end_irqrestore(&stats->syncp, flags);
-}
-
 /* The logic is similar to bpf_prog_run(), but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
@@ -896,7 +885,7 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *ru
 	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
 
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
-		inc_misses_counter(prog);
+		bpf_prog_inc_misses_counter(prog);
 		return 0;
 	}
 	return bpf_prog_start_time();
@@ -967,7 +956,7 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
 	might_fault();
 
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
-		inc_misses_counter(prog);
+		bpf_prog_inc_misses_counter(prog);
 		return 0;
 	}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 9df53c40cffd..b05f0310dbd3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2222,9 +2222,15 @@ static __always_inline
 void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
 {
 	cant_sleep();
+	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+		bpf_prog_inc_misses_counter(prog);
+		goto out;
+	}
 	rcu_read_lock();
 	(void) bpf_prog_run(prog, args);
 	rcu_read_unlock();
+out:
+	this_cpu_dec(*(prog->active));
 }
 
 #define UNPACK(...)			__VA_ARGS__
-- 
cgit v1.2.3


From 1d8b82c613297f24354b4d750413a7456b5cd92c Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 21 Sep 2022 15:38:26 +0800
Subject: bpf: Always use raw spinlock for hash bucket lock

For a non-preallocated hash map on RT kernel, regular spinlock instead
of raw spinlock is used for bucket lock. The reason is that on RT kernel
memory allocation is forbidden under atomic context and regular spinlock
is sleepable under RT.

Now hash map has been fully converted to use bpf_map_alloc, and there
will be no synchronous memory allocation for non-preallocated hash map,
so it is safe to always use raw spinlock for bucket lock on RT. So
removing the usage of htab_use_raw_lock() and updating the comments
accordingly.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20220921073826.2365800-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/hashtab.c | 66 +++++++++++-----------------------------------------
 1 file changed, 14 insertions(+), 52 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 86aec20c22d0..ed3f8a53603b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -68,24 +68,16 @@
  * In theory the BPF locks could be converted to regular spinlocks as well,
  * but the bucket locks and percpu_freelist locks can be taken from
  * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
- * atomic contexts even on RT. These mechanisms require preallocated maps,
- * so there is no need to invoke memory allocations within the lock held
- * sections.
- *
- * BPF maps which need dynamic allocation are only used from (forced)
- * thread context on RT and can therefore use regular spinlocks which in
- * turn allows to invoke memory allocations from the lock held section.
- *
- * On a non RT kernel this distinction is neither possible nor required.
- * spinlock maps to raw_spinlock and the extra code is optimized out by the
- * compiler.
+ * atomic contexts even on RT. Before the introduction of bpf_mem_alloc,
+ * it is only safe to use raw spinlock for preallocated hash map on a RT kernel,
+ * because there is no memory allocation within the lock held sections. However
+ * after hash map was fully converted to use bpf_mem_alloc, there will be
+ * non-synchronous memory allocation for non-preallocated hash map, so it is
+ * safe to always use raw spinlock for bucket lock.
  */
 struct bucket {
 	struct hlist_nulls_head head;
-	union {
-		raw_spinlock_t raw_lock;
-		spinlock_t     lock;
-	};
+	raw_spinlock_t raw_lock;
 };
 
 #define HASHTAB_MAP_LOCK_COUNT 8
@@ -141,26 +133,15 @@ static inline bool htab_is_prealloc(const struct bpf_htab *htab)
 	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 }
 
-static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
-{
-	return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
-}
-
 static void htab_init_buckets(struct bpf_htab *htab)
 {
 	unsigned int i;
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
-		if (htab_use_raw_lock(htab)) {
-			raw_spin_lock_init(&htab->buckets[i].raw_lock);
-			lockdep_set_class(&htab->buckets[i].raw_lock,
+		raw_spin_lock_init(&htab->buckets[i].raw_lock);
+		lockdep_set_class(&htab->buckets[i].raw_lock,
 					  &htab->lockdep_key);
-		} else {
-			spin_lock_init(&htab->buckets[i].lock);
-			lockdep_set_class(&htab->buckets[i].lock,
-					  &htab->lockdep_key);
-		}
 		cond_resched();
 	}
 }
@@ -170,28 +151,17 @@ static inline int htab_lock_bucket(const struct bpf_htab *htab,
 				   unsigned long *pflags)
 {
 	unsigned long flags;
-	bool use_raw_lock;
 
 	hash = hash & HASHTAB_MAP_LOCK_MASK;
 
-	use_raw_lock = htab_use_raw_lock(htab);
-	if (use_raw_lock)
-		preempt_disable();
-	else
-		migrate_disable();
+	preempt_disable();
 	if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
 		__this_cpu_dec(*(htab->map_locked[hash]));
-		if (use_raw_lock)
-			preempt_enable();
-		else
-			migrate_enable();
+		preempt_enable();
 		return -EBUSY;
 	}
 
-	if (use_raw_lock)
-		raw_spin_lock_irqsave(&b->raw_lock, flags);
-	else
-		spin_lock_irqsave(&b->lock, flags);
+	raw_spin_lock_irqsave(&b->raw_lock, flags);
 	*pflags = flags;
 
 	return 0;
@@ -201,18 +171,10 @@ static inline void htab_unlock_bucket(const struct bpf_htab *htab,
 				      struct bucket *b, u32 hash,
 				      unsigned long flags)
 {
-	bool use_raw_lock = htab_use_raw_lock(htab);
-
 	hash = hash & HASHTAB_MAP_LOCK_MASK;
-	if (use_raw_lock)
-		raw_spin_unlock_irqrestore(&b->raw_lock, flags);
-	else
-		spin_unlock_irqrestore(&b->lock, flags);
+	raw_spin_unlock_irqrestore(&b->raw_lock, flags);
 	__this_cpu_dec(*(htab->map_locked[hash]));
-	if (use_raw_lock)
-		preempt_enable();
-	else
-		migrate_enable();
+	preempt_enable();
 }
 
 static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
-- 
cgit v1.2.3


From eed807f626101f6a4227bd53942892c5983b95a7 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 21 Sep 2022 18:48:25 +0200
Subject: bpf: Tweak definition of KF_TRUSTED_ARGS

Instead of forcing all arguments to be referenced pointers with non-zero
reg->ref_obj_id, tweak the definition of KF_TRUSTED_ARGS to mean that
only PTR_TO_BTF_ID (and socket types translated to PTR_TO_BTF_ID) have
that constraint, and require their offset to be set to 0.

The rest of pointer types are also accomodated in this definition of
trusted pointers, but with more relaxed rules regarding offsets.

The inherent meaning of setting this flag is that all kfunc pointer
arguments have a guranteed lifetime, and kernel object pointers
(PTR_TO_BTF_ID, PTR_TO_CTX) are passed in their unmodified form (with
offset 0). In general, this is not true for PTR_TO_BTF_ID as it can be
obtained using pointer walks.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/cdede0043c47ed7a357f0a915d16f9ce06a1d589.1663778601.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/kfuncs.rst | 24 ++++++++++++++++--------
 kernel/bpf/btf.c             | 18 +++++++++++++-----
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 781731749e55..0f858156371d 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -137,14 +137,22 @@ KF_ACQUIRE and KF_RET_NULL flags.
 --------------------------
 
 The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
-indicates that the all pointer arguments will always be refcounted, and have
-their offset set to 0. It can be used to enforce that a pointer to a refcounted
-object acquired from a kfunc or BPF helper is passed as an argument to this
-kfunc without any modifications (e.g. pointer arithmetic) such that it is
-trusted and points to the original object. This flag is often used for kfuncs
-that operate (change some property, perform some operation) on an object that
-was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to
-ensure the integrity of the operation being performed on the expected object.
+indicates that the all pointer arguments will always have a guaranteed lifetime,
+and pointers to kernel objects are always passed to helpers in their unmodified
+form (as obtained from acquire kfuncs).
+
+It can be used to enforce that a pointer to a refcounted object acquired from a
+kfunc or BPF helper is passed as an argument to this kfunc without any
+modifications (e.g. pointer arithmetic) such that it is trusted and points to
+the original object.
+
+Meanwhile, it is also allowed pass pointers to normal memory to such kfuncs,
+but those can have a non-zero offset.
+
+This flag is often used for kfuncs that operate (change some property, perform
+some operation) on an object that was obtained using an acquire kfunc. Such
+kfuncs need an unchanged pointer to ensure the integrity of the operation being
+performed on the expected object.
 
 2.4.6 KF_SLEEPABLE flag
 -----------------------
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 13faede0f2b4..a44ad4b347ff 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6227,7 +6227,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 				    bool processing_call)
 {
 	enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
-	bool rel = false, kptr_get = false, trusted_arg = false;
+	bool rel = false, kptr_get = false, trusted_args = false;
 	bool sleepable = false;
 	struct bpf_verifier_log *log = &env->log;
 	u32 i, nargs, ref_id, ref_obj_id = 0;
@@ -6265,7 +6265,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		/* Only kfunc can be release func */
 		rel = kfunc_meta->flags & KF_RELEASE;
 		kptr_get = kfunc_meta->flags & KF_KPTR_GET;
-		trusted_arg = kfunc_meta->flags & KF_TRUSTED_ARGS;
+		trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
 		sleepable = kfunc_meta->flags & KF_SLEEPABLE;
 	}
 
@@ -6276,6 +6276,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		enum bpf_arg_type arg_type = ARG_DONTCARE;
 		u32 regno = i + 1;
 		struct bpf_reg_state *reg = &regs[regno];
+		bool obj_ptr = false;
 
 		t = btf_type_skip_modifiers(btf, args[i].type, NULL);
 		if (btf_type_is_scalar(t)) {
@@ -6323,10 +6324,17 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 
+		/* These register types have special constraints wrt ref_obj_id
+		 * and offset checks. The rest of trusted args don't.
+		 */
+		obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID ||
+			  reg2btf_ids[base_type(reg->type)];
+
 		/* Check if argument must be a referenced pointer, args + i has
 		 * been verified to be a pointer (after skipping modifiers).
+		 * PTR_TO_CTX is ok without having non-zero ref_obj_id.
 		 */
-		if (is_kfunc && trusted_arg && !reg->ref_obj_id) {
+		if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) {
 			bpf_log(log, "R%d must be referenced\n", regno);
 			return -EINVAL;
 		}
@@ -6335,7 +6343,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 		ref_tname = btf_name_by_offset(btf, ref_t->name_off);
 
 		/* Trusted args have the same offset checks as release arguments */
-		if (trusted_arg || (rel && reg->ref_obj_id))
+		if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id))
 			arg_type |= OBJ_RELEASE;
 		ret = check_func_arg_reg_off(env, reg, regno, arg_type);
 		if (ret < 0)
@@ -6435,7 +6443,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
 							   reg_ref_t->name_off);
 			if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
 						  reg->off, btf, ref_id,
-						  trusted_arg || (rel && reg->ref_obj_id))) {
+						  trusted_args || (rel && reg->ref_obj_id))) {
 				bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
 					func_name, i,
 					btf_type_str(ref_t), ref_tname,
-- 
cgit v1.2.3


From 0fabd2aa199faeb8754aee94658f2c48ccb2c8c3 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 21 Sep 2022 18:48:26 +0200
Subject: net: netfilter: add bpf_ct_set_nat_info kfunc helper

Introduce bpf_ct_set_nat_info kfunc helper in order to set source and
destination nat addresses/ports in a new allocated ct entry not inserted
in the connection tracking table yet.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/9567db2fdfa5bebe7b7cc5870f7a34549418b4fc.1663778601.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/netfilter/nf_conntrack_bpf.c | 47 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 67df64283aef..756ea818574e 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -17,6 +17,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
 
 /* bpf_ct_opts - Options for CT lookup helpers
  *
@@ -137,7 +138,6 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple,
 
 	memset(&ct->proto, 0, sizeof(ct->proto));
 	__nf_ct_set_timeout(ct, timeout * HZ);
-	ct->status |= IPS_CONFIRMED;
 
 out:
 	if (opts->netns_id >= 0)
@@ -390,6 +390,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
 	struct nf_conn *nfct = (struct nf_conn *)nfct_i;
 	int err;
 
+	nfct->status |= IPS_CONFIRMED;
 	err = nf_conntrack_hash_check_insert(nfct);
 	if (err < 0) {
 		nf_conntrack_free(nfct);
@@ -475,6 +476,49 @@ int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
 	return nf_ct_change_status_common(nfct, status);
 }
 
+/* bpf_ct_set_nat_info - Set source or destination nat address
+ *
+ * Set source or destination nat address of the newly allocated
+ * nf_conn before insertion. This must be invoked for referenced
+ * PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct	- Pointer to referenced nf_conn object, obtained using
+ *		  bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @addr	- Nat source/destination address
+ * @port	- Nat source/destination port. Non-positive values are
+ *		  interpreted as select a random port.
+ * @manip	- NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
+ */
+int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+			union nf_inet_addr *addr, int port,
+			enum nf_nat_manip_type manip)
+{
+#if ((IS_MODULE(CONFIG_NF_NAT) && IS_MODULE(CONFIG_NF_CONNTRACK)) || \
+     IS_BUILTIN(CONFIG_NF_NAT))
+	struct nf_conn *ct = (struct nf_conn *)nfct;
+	u16 proto = nf_ct_l3num(ct);
+	struct nf_nat_range2 range;
+
+	if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6)
+		return -EINVAL;
+
+	memset(&range, 0, sizeof(struct nf_nat_range2));
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = *addr;
+	range.max_addr = range.min_addr;
+	if (port > 0) {
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto.all = cpu_to_be16(port);
+		range.max_proto.all = range.min_proto.all;
+	}
+
+	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 __diag_pop()
 
 BTF_SET8_START(nf_ct_kfunc_set)
@@ -488,6 +532,7 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
 BTF_SET8_END(nf_ct_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
-- 
cgit v1.2.3


From b06b45e82b59b69f5ac6b3916ac5dbd0294efc95 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Wed, 21 Sep 2022 18:48:27 +0200
Subject: selftests/bpf: add tests for bpf_ct_set_nat_info kfunc

Introduce self-tests for bpf_ct_set_nat_info kfunc used to set the
source or destination nat addresses/ports.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/803e33294e247744d466943105879414344d3235.1663778601.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/config              |  1 +
 tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 10 ++++++---
 tools/testing/selftests/bpf/progs/test_bpf_nf.c | 27 +++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
index 905a9be8d0a2..9213565c0311 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -63,6 +63,7 @@ CONFIG_NF_CONNTRACK=y
 CONFIG_NF_CONNTRACK_MARK=y
 CONFIG_NF_DEFRAG_IPV4=y
 CONFIG_NF_DEFRAG_IPV6=y
+CONFIG_NF_NAT=y
 CONFIG_RC_CORE=y
 CONFIG_SECURITY=y
 CONFIG_SECURITYFS=y
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
index 0677a51694c9..8a838ea8bdf3 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c
@@ -26,7 +26,10 @@ enum {
 	TEST_TC_BPF,
 };
 
-#define TIMEOUT_MS 3000
+#define TIMEOUT_MS		3000
+#define IPS_STATUS_MASK		(IPS_CONFIRMED | IPS_SEEN_REPLY | \
+				 IPS_SRC_NAT_DONE | IPS_DST_NAT_DONE | \
+				 IPS_SRC_NAT | IPS_DST_NAT)
 
 static int connect_to_server(int srv_fd)
 {
@@ -114,10 +117,11 @@ static void test_bpf_nf_ct(int mode)
 	ASSERT_GT(skel->bss->test_delta_timeout, 8, "Test for min ct timeout update");
 	ASSERT_LE(skel->bss->test_delta_timeout, 10, "Test for max ct timeout update");
 	ASSERT_EQ(skel->bss->test_insert_lookup_mark, 77, "Test for insert and lookup mark value");
-	ASSERT_EQ(skel->bss->test_status, IPS_CONFIRMED | IPS_SEEN_REPLY,
-		  "Test for ct status update ");
+	ASSERT_EQ(skel->bss->test_status, IPS_STATUS_MASK, "Test for ct status update ");
 	ASSERT_EQ(skel->data->test_exist_lookup, 0, "Test existing connection lookup");
 	ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark");
+	ASSERT_EQ(skel->data->test_snat_addr, 0, "Test for source natting");
+	ASSERT_EQ(skel->data->test_dnat_addr, 0, "Test for destination natting");
 end:
 	if (srv_client_fd != -1)
 		close(srv_client_fd);
diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
index 88842da86ddc..227e85e85dda 100644
--- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c
+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <vmlinux.h>
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
 
 #define EAFNOSUPPORT 97
 #define EPROTO 71
@@ -24,6 +25,8 @@ int test_succ_lookup = -ENOENT;
 u32 test_delta_timeout = 0;
 u32 test_status = 0;
 u32 test_insert_lookup_mark = 0;
+int test_snat_addr = -EINVAL;
+int test_dnat_addr = -EINVAL;
 __be32 saddr = 0;
 __be16 sport = 0;
 __be32 daddr = 0;
@@ -54,6 +57,8 @@ void bpf_ct_set_timeout(struct nf_conn *, u32) __ksym;
 int bpf_ct_change_timeout(struct nf_conn *, u32) __ksym;
 int bpf_ct_set_status(struct nf_conn *, u32) __ksym;
 int bpf_ct_change_status(struct nf_conn *, u32) __ksym;
+int bpf_ct_set_nat_info(struct nf_conn *, union nf_inet_addr *,
+			int port, enum nf_nat_manip_type) __ksym;
 
 static __always_inline void
 nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
@@ -141,11 +146,22 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 	ct = alloc_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def,
 		      sizeof(opts_def));
 	if (ct) {
+		__u16 sport = bpf_get_prandom_u32();
+		__u16 dport = bpf_get_prandom_u32();
+		union nf_inet_addr saddr = {};
+		union nf_inet_addr daddr = {};
 		struct nf_conn *ct_ins;
 
 		bpf_ct_set_timeout(ct, 10000);
 		ct->mark = 77;
 
+		/* snat */
+		saddr.ip = bpf_get_prandom_u32();
+		bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC);
+		/* dnat */
+		daddr.ip = bpf_get_prandom_u32();
+		bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST);
+
 		ct_ins = bpf_ct_insert_entry(ct);
 		if (ct_ins) {
 			struct nf_conn *ct_lk;
@@ -153,6 +169,17 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32,
 			ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4),
 					  &opts_def, sizeof(opts_def));
 			if (ct_lk) {
+				struct nf_conntrack_tuple *tuple;
+
+				/* check snat and dnat addresses */
+				tuple = &ct_lk->tuplehash[IP_CT_DIR_REPLY].tuple;
+				if (tuple->dst.u3.ip == saddr.ip &&
+				    tuple->dst.u.all == bpf_htons(sport))
+					test_snat_addr = 0;
+				if (tuple->src.u3.ip == daddr.ip &&
+				    tuple->src.u.all == bpf_htons(dport))
+					test_dnat_addr = 0;
+
 				/* update ct entry timeout */
 				bpf_ct_change_timeout(ct_lk, 10000);
 				test_delta_timeout = ct_lk->timeout - bpf_jiffies64();
-- 
cgit v1.2.3


From f338ac9105679df504c3809784f0716c25e87b31 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 21 Sep 2022 09:42:51 -0700
Subject: selftests/bpf: fix double bpf_object__close() in veristate

bpf_object__close(obj) is called twice for BPF object files with single
BPF program in it. This causes crash. Fix this by not calling
bpf_object__close() unnecessarily.

Fixes: c8bc5e050976 ("selftests/bpf: Add veristat tool for mass-verifying BPF object files")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220921164254.3630690-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 39e6dc41e504..c0c8a65cda52 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -300,7 +300,6 @@ static int process_obj(const char *filename)
 		prog = bpf_object__next_program(obj, NULL);
 		bpf_program__set_autoload(prog, true);
 		process_prog(filename, obj, prog);
-		bpf_object__close(obj);
 		goto cleanup;
 	}
 
-- 
cgit v1.2.3


From e5eb08d8fe469c0da8643042893a0b7481807443 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 21 Sep 2022 09:42:52 -0700
Subject: selftests/bpf: add CSV output mode for veristat

Teach veristat to output results as CSV table for easier programmatic
processing. Change what was --output/-o argument to now be --emit/-e.
And then use --output-format/-o <fmt> to specify output format.
Currently "table" and "csv" is supported, table being default.

For CSV output mode veristat is using spec identifiers as column names.
E.g., instead of "Total states" veristat uses "total_states" as a CSV
header name.

Internally veristat recognizes three formats, one of them
(RESFMT_TABLE_CALCLEN) is a special format instructing veristat to
calculate column widths for table output. This felt a bit cleaner and
more uniform than either creating separate functions just for this.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220921164254.3630690-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 111 ++++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 36 deletions(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index c0c8a65cda52..0472bfae3c9d 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -46,10 +46,17 @@ struct stat_specs {
 	int lens[ALL_STATS_CNT];
 };
 
+enum resfmt {
+	RESFMT_TABLE,
+	RESFMT_TABLE_CALCLEN, /* fake format to pre-calculate table's column widths */
+	RESFMT_CSV,
+};
+
 static struct env {
 	char **filenames;
 	int filename_cnt;
 	bool verbose;
+	enum resfmt out_fmt;
 
 	struct verif_stats *prog_stats;
 	int prog_stat_cnt;
@@ -78,8 +85,9 @@ const char argp_program_doc[] =
 static const struct argp_option opts[] = {
 	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
 	{ "verbose", 'v', NULL, 0, "Verbose mode" },
-	{ "output", 'o', "SPEC", 0, "Specify output stats" },
+	{ "emit", 'e', "SPEC", 0, "Specify stats to be emitted" },
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
+	{ "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table." },
 	{},
 };
 
@@ -97,7 +105,7 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case 'v':
 		env.verbose = true;
 		break;
-	case 'o':
+	case 'e':
 		err = parse_stats(arg, &env.output_spec);
 		if (err)
 			return err;
@@ -107,6 +115,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 		if (err)
 			return err;
 		break;
+	case 'o':
+		if (strcmp(arg, "table") == 0) {
+			env.out_fmt = RESFMT_TABLE;
+		} else if (strcmp(arg, "csv") == 0) {
+			env.out_fmt = RESFMT_CSV;
+		} else {
+			fprintf(stderr, "Unrecognized output format '%s'\n", arg);
+			return -EINVAL;
+		}
+		break;
 	case ARGP_KEY_ARG:
 		tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames));
 		if (!tmp)
@@ -147,7 +165,7 @@ static struct stat_def {
 	[FILE_NAME] = { "File", {"file_name", "filename", "file"}, true /* asc */ },
 	[PROG_NAME] = { "Program", {"prog_name", "progname", "prog"}, true /* asc */ },
 	[VERDICT] = { "Verdict", {"verdict"}, true /* asc: failure, success */ },
-	[DURATION] = { "Duration, us", {"duration", "dur"}, },
+	[DURATION] = { "Duration (us)", {"duration", "dur"}, },
 	[TOTAL_INSNS] = { "Total insns", {"total_insns", "insns"}, },
 	[TOTAL_STATES] = { "Total states", {"total_states", "states"}, },
 	[PEAK_STATES] = { "Peak states", {"peak_states"}, },
@@ -385,7 +403,21 @@ static int cmp_prog_stats(const void *v1, const void *v2)
 #define HEADER_CHAR '-'
 #define COLUMN_SEP "  "
 
-static void output_headers(bool calc_len)
+static void output_header_underlines(void)
+{
+	int i, j, len;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		len = env.output_spec.lens[i];
+
+		printf("%s", i == 0 ? "" : COLUMN_SEP);
+		for (j = 0; j < len; j++)
+			printf("%c", HEADER_CHAR);
+	}
+	printf("\n");
+}
+
+static void output_headers(enum resfmt fmt)
 {
 	int i, len;
 
@@ -393,34 +425,30 @@ static void output_headers(bool calc_len)
 		int id = env.output_spec.ids[i];
 		int *max_len = &env.output_spec.lens[i];
 
-		if (calc_len) {
+		switch (fmt) {
+		case RESFMT_TABLE_CALCLEN:
 			len = snprintf(NULL, 0, "%s", stat_defs[id].header);
 			if (len > *max_len)
 				*max_len = len;
-		} else {
+			break;
+		case RESFMT_TABLE:
 			printf("%s%-*s", i == 0 ? "" : COLUMN_SEP,  *max_len, stat_defs[id].header);
+			if (i == env.output_spec.spec_cnt - 1)
+				printf("\n");
+			break;
+		case RESFMT_CSV:
+			printf("%s%s", i == 0 ? "" : ",", stat_defs[id].names[0]);
+			if (i == env.output_spec.spec_cnt - 1)
+				printf("\n");
+			break;
 		}
 	}
 
-	if (!calc_len)
-		printf("\n");
+	if (fmt == RESFMT_TABLE)
+		output_header_underlines();
 }
 
-static void output_header_underlines(void)
-{
-	int i, j, len;
-
-	for (i = 0; i < env.output_spec.spec_cnt; i++) {
-		len = env.output_spec.lens[i];
-
-		printf("%s", i == 0 ? "" : COLUMN_SEP);
-		for (j = 0; j < len; j++)
-			printf("%c", HEADER_CHAR);
-	}
-	printf("\n");
-}
-
-static void output_stats(const struct verif_stats *s, bool calc_len)
+static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last)
 {
 	int i;
 
@@ -453,23 +481,36 @@ static void output_stats(const struct verif_stats *s, bool calc_len)
 			exit(1);
 		}
 
-		if (calc_len) {
+		switch (fmt) {
+		case RESFMT_TABLE_CALCLEN:
 			if (str)
 				len = snprintf(NULL, 0, "%s", str);
 			else
 				len = snprintf(NULL, 0, "%ld", val);
 			if (len > *max_len)
 				*max_len = len;
-		} else {
+			break;
+		case RESFMT_TABLE:
 			if (str)
 				printf("%s%-*s", i == 0 ? "" : COLUMN_SEP, *max_len, str);
 			else
 				printf("%s%*ld", i == 0 ? "" : COLUMN_SEP,  *max_len, val);
+			if (i == env.output_spec.spec_cnt - 1)
+				printf("\n");
+			break;
+		case RESFMT_CSV:
+			if (str)
+				printf("%s%s", i == 0 ? "" : ",", str);
+			else
+				printf("%s%ld", i == 0 ? "" : ",", val);
+			if (i == env.output_spec.spec_cnt - 1)
+				printf("\n");
+			break;
 		}
 	}
 
-	if (!calc_len)
-		printf("\n");
+	if (last && fmt == RESFMT_TABLE)
+		output_header_underlines();
 }
 
 int main(int argc, char **argv)
@@ -505,20 +546,18 @@ int main(int argc, char **argv)
 
 	qsort(env.prog_stats, env.prog_stat_cnt, sizeof(*env.prog_stats), cmp_prog_stats);
 
-	/* calculate column widths */
-	output_headers(true);
-	for (i = 0; i < env.prog_stat_cnt; i++) {
-		output_stats(&env.prog_stats[i], true);
+	if (env.out_fmt == RESFMT_TABLE) {
+		/* calculate column widths */
+		output_headers(RESFMT_TABLE_CALCLEN);
+		for (i = 0; i < env.prog_stat_cnt; i++)
+			output_stats(&env.prog_stats[i], RESFMT_TABLE_CALCLEN, false);
 	}
 
 	/* actually output the table */
-	output_headers(false);
-	output_header_underlines();
+	output_headers(env.out_fmt);
 	for (i = 0; i < env.prog_stat_cnt; i++) {
-		output_stats(&env.prog_stats[i], false);
+		output_stats(&env.prog_stats[i], env.out_fmt, i == env.prog_stat_cnt - 1);
 	}
-	output_header_underlines();
-	printf("\n");
 
 	printf("Done. Processed %d object files, %d programs.\n",
 	       env.filename_cnt, env.prog_stat_cnt);
-- 
cgit v1.2.3


From 394169b079b558cf91a9c23ffb6b55c14cd927e1 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 21 Sep 2022 09:42:53 -0700
Subject: selftests/bpf: add comparison mode to veristat

Add ability to compare and contrast two veristat runs, previously
recorded with veristat using CSV output format.

When veristat is called with -C (--compare) flag, veristat expects
exactly two input files specified, both should be in CSV format.
Expectation is that it's output from previous veristat runs, but as long
as column names and formats match, it should just work. First CSV file
is designated as a "baseline" provided, and the second one is
comparison (experiment) data set. Establishing baseline matters later
when calculating difference percentages, see below.

Veristat parses these two CSV files and "reconstructs" verifier stats
(it could be just a subset of all possible stats). File and program
names are mandatory as they are used as joining key (these two "stats"
are designated as "key stats" in the code).

Veristat currently enforces that the set of stats recorded in both CSV
has to exactly match, down to exact order. This is just a simplifying
condition which can be lifted with a bit of additional pre-processing to
reorded stat specs internally, which I didn't bother doing, yet.

For all the non-key stats, veristat will output three columns: one for
baseline data, one for comparison data, and one with an absolute and
relative percentage difference. If either baseline or comparison values
are missing (that is, respective CSV file doesn't have a row with
*exactly* matching file and program name), those values are assumed to
be empty or zero. In such case relative percentages are forced to +100%
or -100% output, for consistency with a typical case.

Veristat's -e (--emit) and -s (--sort) specs still apply, so even if CSV
contains lots of stats, user can request to compare only a subset of
them (and specify desired column order as well). Similarly, both CSV and
human-readable table output is honored. Note that input is currently
always expected to be CSV.

Here's an example shell session, recording data for biosnoop tool on two
different kernels and comparing them afterwards, outputting data in table
format.

  # on slightly older production kernel
  $ sudo ./veristat biosnoop_bpf.o
  File            Program                   Verdict  Duration (us)  Total insns  Total states  Peak states
  --------------  ------------------------  -------  -------------  -----------  ------------  -----------
  biosnoop_bpf.o  blk_account_io_merge_bio  success             37           24             1            1
  biosnoop_bpf.o  blk_account_io_start      failure              0            0             0            0
  biosnoop_bpf.o  block_rq_complete         success             76          104             6            6
  biosnoop_bpf.o  block_rq_insert           success             83           85             7            7
  biosnoop_bpf.o  block_rq_issue            success             79           85             7            7
  --------------  ------------------------  -------  -------------  -----------  ------------  -----------
  Done. Processed 1 object files, 5 programs.
  $ sudo ./veristat ~/local/tmp/fbcode-bpf-objs/biosnoop_bpf.o -o csv > baseline.csv
  $ cat baseline.csv
  file_name,prog_name,verdict,duration,total_insns,total_states,peak_states
  biosnoop_bpf.o,blk_account_io_merge_bio,success,36,24,1,1
  biosnoop_bpf.o,blk_account_io_start,failure,0,0,0,0
  biosnoop_bpf.o,block_rq_complete,success,82,104,6,6
  biosnoop_bpf.o,block_rq_insert,success,78,85,7,7
  biosnoop_bpf.o,block_rq_issue,success,74,85,7,7

  # on latest bpf-next kernel
  $ sudo ./veristat biosnoop_bpf.o
  File            Program                   Verdict  Duration (us)  Total insns  Total states  Peak states
  --------------  ------------------------  -------  -------------  -----------  ------------  -----------
  biosnoop_bpf.o  blk_account_io_merge_bio  success             31           24             1            1
  biosnoop_bpf.o  blk_account_io_start      failure              0            0             0            0
  biosnoop_bpf.o  block_rq_complete         success             76          104             6            6
  biosnoop_bpf.o  block_rq_insert           success             83           91             7            7
  biosnoop_bpf.o  block_rq_issue            success             74           91             7            7
  --------------  ------------------------  -------  -------------  -----------  ------------  -----------
  Done. Processed 1 object files, 5 programs.
  $ sudo ./veristat biosnoop_bpf.o -o csv > comparison.csv
  $ cat comparison.csv
  file_name,prog_name,verdict,duration,total_insns,total_states,peak_states
  biosnoop_bpf.o,blk_account_io_merge_bio,success,71,24,1,1
  biosnoop_bpf.o,blk_account_io_start,failure,0,0,0,0
  biosnoop_bpf.o,block_rq_complete,success,82,104,6,6
  biosnoop_bpf.o,block_rq_insert,success,83,91,7,7
  biosnoop_bpf.o,block_rq_issue,success,87,91,7,7

  # now let's compare with human-readable output (note that no sudo needed)
  # we also ignore verification duration in this case to shortned output
  $ ./veristat -C baseline.csv comparison.csv -e file,prog,verdict,insns
  File            Program                   Verdict (A)  Verdict (B)  Verdict (DIFF)  Total insns (A)  Total insns (B)  Total insns (DIFF)
  --------------  ------------------------  -----------  -----------  --------------  ---------------  ---------------  ------------------
  biosnoop_bpf.o  blk_account_io_merge_bio  success      success      MATCH                        24               24         +0 (+0.00%)
  biosnoop_bpf.o  blk_account_io_start      failure      failure      MATCH                         0                0       +0 (+100.00%)
  biosnoop_bpf.o  block_rq_complete         success      success      MATCH                       104              104         +0 (+0.00%)
  biosnoop_bpf.o  block_rq_insert           success      success      MATCH                        91               85         -6 (-6.59%)
  biosnoop_bpf.o  block_rq_issue            success      success      MATCH                        91               85         -6 (-6.59%)
  --------------  ------------------------  -----------  -----------  --------------  ---------------  ---------------  ------------------

While not particularly exciting example (it turned out to be kind of hard to
quickly find a nice example with significant difference just because of kernel
version bump), it should demonstrate main features.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220921164254.3630690-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 543 +++++++++++++++++++++++++++++----
 1 file changed, 492 insertions(+), 51 deletions(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 0472bfae3c9d..c6837bac357f 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -43,7 +43,7 @@ struct stat_specs {
 	int spec_cnt;
 	enum stat_id ids[ALL_STATS_CNT];
 	bool asc[ALL_STATS_CNT];
-	int lens[ALL_STATS_CNT];
+	int lens[ALL_STATS_CNT * 3]; /* 3x for comparison mode */
 };
 
 enum resfmt {
@@ -57,16 +57,20 @@ static struct env {
 	int filename_cnt;
 	bool verbose;
 	enum resfmt out_fmt;
+	bool comparison_mode;
 
 	struct verif_stats *prog_stats;
 	int prog_stat_cnt;
 
+	/* baseline_stats is allocated and used only in comparsion mode */
+	struct verif_stats *baseline_stats;
+	int baseline_stat_cnt;
+
 	struct stat_specs output_spec;
 	struct stat_specs sort_spec;
 } env;
 
-static int libbpf_print_fn(enum libbpf_print_level level,
-		    const char *format, va_list args)
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
 {
 	if (!env.verbose)
 		return 0;
@@ -78,9 +82,10 @@ static int libbpf_print_fn(enum libbpf_print_level level,
 const char *argp_program_version = "veristat";
 const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
 const char argp_program_doc[] =
-"veristat    BPF verifier stats collection tool.\n"
+"veristat    BPF verifier stats collection and comparison tool.\n"
 "\n"
-"USAGE: veristat <obj-file> [<obj-file>...]\n";
+"USAGE: veristat <obj-file> [<obj-file>...]\n"
+"   OR: veristat -C <baseline.csv> <comparison.csv>\n";
 
 static const struct argp_option opts[] = {
 	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
@@ -88,6 +93,7 @@ static const struct argp_option opts[] = {
 	{ "emit", 'e', "SPEC", 0, "Specify stats to be emitted" },
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
 	{ "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table." },
+	{ "compare", 'C', NULL, 0, "Comparison mode" },
 	{},
 };
 
@@ -125,6 +131,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 			return -EINVAL;
 		}
 		break;
+	case 'C':
+		env.comparison_mode = true;
+		break;
 	case ARGP_KEY_ARG:
 		tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames));
 		if (!tmp)
@@ -141,6 +150,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	return 0;
 }
 
+static const struct argp argp = {
+	.options = opts,
+	.parser = parse_arg,
+	.doc = argp_program_doc,
+};
+
 static const struct stat_specs default_output_spec = {
 	.spec_cnt = 7,
 	.ids = {
@@ -219,6 +234,20 @@ static int parse_stats(const char *stats_str, struct stat_specs *specs)
 	return 0;
 }
 
+static void free_verif_stats(struct verif_stats *stats, size_t stat_cnt)
+{
+	int i;
+
+	if (!stats)
+		return;
+
+	for (i = 0; i < stat_cnt; i++) {
+		free(stats[i].file_name);
+		free(stats[i].prog_name);
+	}
+	free(stats);
+}
+
 static char verif_log_buf[64 * 1024];
 
 static int parse_verif_log(const char *buf, size_t buf_sz, struct verif_stats *s)
@@ -448,6 +477,33 @@ static void output_headers(enum resfmt fmt)
 		output_header_underlines();
 }
 
+static void prepare_value(const struct verif_stats *s, enum stat_id id,
+			  const char **str, long *val)
+{
+	switch (id) {
+	case FILE_NAME:
+		*str = s->file_name;
+		break;
+	case PROG_NAME:
+		*str = s->prog_name;
+		break;
+	case VERDICT:
+		*str = s->stats[VERDICT] ? "success" : "failure";
+		break;
+	case DURATION:
+	case TOTAL_INSNS:
+	case TOTAL_STATES:
+	case PEAK_STATES:
+	case MAX_STATES_PER_INSN:
+	case MARK_READ_MAX_LEN:
+		*val = s->stats[id];
+		break;
+	default:
+		fprintf(stderr, "Unrecognized stat #%d\n", id);
+		exit(1);
+	}
+}
+
 static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last)
 {
 	int i;
@@ -458,28 +514,7 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last
 		const char *str = NULL;
 		long val = 0;
 
-		switch (id) {
-		case FILE_NAME:
-			str = s->file_name;
-			break;
-		case PROG_NAME:
-			str = s->prog_name;
-			break;
-		case VERDICT:
-			str = s->stats[VERDICT] ? "success" : "failure";
-			break;
-		case DURATION:
-		case TOTAL_INSNS:
-		case TOTAL_STATES:
-		case PEAK_STATES:
-		case MAX_STATES_PER_INSN:
-		case MARK_READ_MAX_LEN:
-			val = s->stats[id];
-			break;
-		default:
-			fprintf(stderr, "Unrecognized stat #%d\n", id);
-			exit(1);
-		}
+		prepare_value(s, id, &str, &val);
 
 		switch (fmt) {
 		case RESFMT_TABLE_CALCLEN:
@@ -509,38 +544,28 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last
 		}
 	}
 
-	if (last && fmt == RESFMT_TABLE)
+	if (last && fmt == RESFMT_TABLE) {
 		output_header_underlines();
+		printf("Done. Processed %d object files, %d programs.\n",
+		       env.filename_cnt, env.prog_stat_cnt);
+	}
 }
 
-int main(int argc, char **argv)
+static int handle_verif_mode(void)
 {
-	static const struct argp argp = {
-		.options = opts,
-		.parser = parse_arg,
-		.doc = argp_program_doc,
-	};
-	int err = 0, i;
-
-	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
-		return 1;
+	int i, err;
 
 	if (env.filename_cnt == 0) {
 		fprintf(stderr, "Please provide path to BPF object file!\n");
 		argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat");
-		return 1;
+		return -EINVAL;
 	}
 
-	if (env.output_spec.spec_cnt == 0)
-		env.output_spec = default_output_spec;
-	if (env.sort_spec.spec_cnt == 0)
-		env.sort_spec = default_sort_spec;
-
 	for (i = 0; i < env.filename_cnt; i++) {
 		err = process_obj(env.filenames[i]);
 		if (err) {
 			fprintf(stderr, "Failed to process '%s': %d\n", env.filenames[i], err);
-			goto cleanup;
+			return err;
 		}
 	}
 
@@ -559,15 +584,431 @@ int main(int argc, char **argv)
 		output_stats(&env.prog_stats[i], env.out_fmt, i == env.prog_stat_cnt - 1);
 	}
 
-	printf("Done. Processed %d object files, %d programs.\n",
-	       env.filename_cnt, env.prog_stat_cnt);
+	return 0;
+}
+
+static int parse_stat_value(const char *str, enum stat_id id, struct verif_stats *st)
+{
+	switch (id) {
+	case FILE_NAME:
+		st->file_name = strdup(str);
+		if (!st->file_name)
+			return -ENOMEM;
+		break;
+	case PROG_NAME:
+		st->prog_name = strdup(str);
+		if (!st->prog_name)
+			return -ENOMEM;
+		break;
+	case VERDICT:
+		if (strcmp(str, "success") == 0) {
+			st->stats[VERDICT] = true;
+		} else if (strcmp(str, "failure") == 0) {
+			st->stats[VERDICT] = false;
+		} else {
+			fprintf(stderr, "Unrecognized verification verdict '%s'\n", str);
+			return -EINVAL;
+		}
+		break;
+	case DURATION:
+	case TOTAL_INSNS:
+	case TOTAL_STATES:
+	case PEAK_STATES:
+	case MAX_STATES_PER_INSN:
+	case MARK_READ_MAX_LEN: {
+		long val;
+		int err, n;
+
+		if (sscanf(str, "%ld %n", &val, &n) != 1 || n != strlen(str)) {
+			err = -errno;
+			fprintf(stderr, "Failed to parse '%s' as integer\n", str);
+			return err;
+		}
+
+		st->stats[id] = val;
+		break;
+	}
+	default:
+		fprintf(stderr, "Unrecognized stat #%d\n", id);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int parse_stats_csv(const char *filename, struct stat_specs *specs,
+			   struct verif_stats **statsp, int *stat_cntp)
+{
+	char line[4096];
+	FILE *f;
+	int err = 0;
+	bool header = true;
+
+	f = fopen(filename, "r");
+	if (!f) {
+		err = -errno;
+		fprintf(stderr, "Failed to open '%s': %d\n", filename, err);
+		return err;
+	}
+
+	*stat_cntp = 0;
+
+	while (fgets(line, sizeof(line), f)) {
+		char *input = line, *state = NULL, *next;
+		struct verif_stats *st = NULL;
+		int col = 0;
+
+		if (!header) {
+			void *tmp;
+
+			tmp = realloc(*statsp, (*stat_cntp + 1) * sizeof(**statsp));
+			if (!tmp) {
+				err = -ENOMEM;
+				goto cleanup;
+			}
+			*statsp = tmp;
+			st = &(*statsp)[*stat_cntp];
+			*stat_cntp += 1;
+		}
+
+		while ((next = strtok_r(state ? NULL : input, ",\n", &state))) {
+			if (header) {
+				/* for the first line, set up spec stats */
+				err = parse_stat(next, specs);
+				if (err)
+					goto cleanup;
+				continue;
+			}
+
+			/* for all other lines, parse values based on spec */
+			if (col >= specs->spec_cnt) {
+				fprintf(stderr, "Found extraneous column #%d in row #%d of '%s'\n",
+					col, *stat_cntp, filename);
+				err = -EINVAL;
+				goto cleanup;
+			}
+			err = parse_stat_value(next, specs->ids[col], st);
+			if (err)
+				goto cleanup;
+			col++;
+		}
+
+		if (!header && col < specs->spec_cnt) {
+			fprintf(stderr, "Not enough columns in row #%d in '%s'\n",
+				*stat_cntp, filename);
+			err = -EINVAL;
+			goto cleanup;
+		}
+
+		header = false;
+	}
+
+	if (!feof(f)) {
+		err = -errno;
+		fprintf(stderr, "Failed I/O for '%s': %d\n", filename, err);
+	}
 
 cleanup:
-	for (i = 0; i < env.prog_stat_cnt; i++) {
-		free(env.prog_stats[i].file_name);
-		free(env.prog_stats[i].prog_name);
+	fclose(f);
+	return err;
+}
+
+/* empty/zero stats for mismatched rows */
+static const struct verif_stats fallback_stats = { .file_name = "", .prog_name = "" };
+
+static bool is_key_stat(enum stat_id id)
+{
+	return id == FILE_NAME || id == PROG_NAME;
+}
+
+static void output_comp_header_underlines(void)
+{
+	int i, j, k;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		int id = env.output_spec.ids[i];
+		int max_j = is_key_stat(id) ? 1 : 3;
+
+		for (j = 0; j < max_j; j++) {
+			int len = env.output_spec.lens[3 * i + j];
+
+			printf("%s", i + j == 0 ? "" : COLUMN_SEP);
+
+			for (k = 0; k < len; k++)
+				printf("%c", HEADER_CHAR);
+		}
+	}
+	printf("\n");
+}
+
+static void output_comp_headers(enum resfmt fmt)
+{
+	static const char *table_sfxs[3] = {" (A)", " (B)", " (DIFF)"};
+	static const char *name_sfxs[3] = {"_base", "_comp", "_diff"};
+	int i, j, len;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		int id = env.output_spec.ids[i];
+		/* key stats don't have A/B/DIFF columns, they are common for both data sets */
+		int max_j = is_key_stat(id) ? 1 : 3;
+
+		for (j = 0; j < max_j; j++) {
+			int *max_len = &env.output_spec.lens[3 * i + j];
+			bool last = (i == env.output_spec.spec_cnt - 1) && (j == max_j - 1);
+			const char *sfx;
+
+			switch (fmt) {
+			case RESFMT_TABLE_CALCLEN:
+				sfx = is_key_stat(id) ? "" : table_sfxs[j];
+				len = snprintf(NULL, 0, "%s%s", stat_defs[id].header, sfx);
+				if (len > *max_len)
+					*max_len = len;
+				break;
+			case RESFMT_TABLE:
+				sfx = is_key_stat(id) ? "" : table_sfxs[j];
+				printf("%s%-*s%s", i + j == 0 ? "" : COLUMN_SEP,
+				       *max_len - (int)strlen(sfx), stat_defs[id].header, sfx);
+				if (last)
+					printf("\n");
+				break;
+			case RESFMT_CSV:
+				sfx = is_key_stat(id) ? "" : name_sfxs[j];
+				printf("%s%s%s", i + j == 0 ? "" : ",", stat_defs[id].names[0], sfx);
+				if (last)
+					printf("\n");
+				break;
+			}
+		}
+	}
+
+	if (fmt == RESFMT_TABLE)
+		output_comp_header_underlines();
+}
+
+static void output_comp_stats(const struct verif_stats *base, const struct verif_stats *comp,
+			      enum resfmt fmt, bool last)
+{
+	char base_buf[1024] = {}, comp_buf[1024] = {}, diff_buf[1024] = {};
+	int i;
+
+	for (i = 0; i < env.output_spec.spec_cnt; i++) {
+		int id = env.output_spec.ids[i], len;
+		int *max_len_base = &env.output_spec.lens[3 * i + 0];
+		int *max_len_comp = &env.output_spec.lens[3 * i + 1];
+		int *max_len_diff = &env.output_spec.lens[3 * i + 2];
+		const char *base_str = NULL, *comp_str = NULL;
+		long base_val = 0, comp_val = 0, diff_val = 0;
+
+		prepare_value(base, id, &base_str, &base_val);
+		prepare_value(comp, id, &comp_str, &comp_val);
+
+		/* normalize all the outputs to be in string buffers for simplicity */
+		if (is_key_stat(id)) {
+			/* key stats (file and program name) are always strings */
+			if (base != &fallback_stats)
+				snprintf(base_buf, sizeof(base_buf), "%s", base_str);
+			else
+				snprintf(base_buf, sizeof(base_buf), "%s", comp_str);
+		} else if (base_str) {
+			snprintf(base_buf, sizeof(base_buf), "%s", base_str);
+			snprintf(comp_buf, sizeof(comp_buf), "%s", comp_str);
+			if (strcmp(base_str, comp_str) == 0)
+				snprintf(diff_buf, sizeof(diff_buf), "%s", "MATCH");
+			else
+				snprintf(diff_buf, sizeof(diff_buf), "%s", "MISMATCH");
+		} else {
+			snprintf(base_buf, sizeof(base_buf), "%ld", base_val);
+			snprintf(comp_buf, sizeof(comp_buf), "%ld", comp_val);
+
+			diff_val = comp_val - base_val;
+			if (base == &fallback_stats || comp == &fallback_stats || base_val == 0) {
+				snprintf(diff_buf, sizeof(diff_buf), "%+ld (%+.2lf%%)",
+					 diff_val, comp_val < base_val ? -100.0 : 100.0);
+			} else {
+				snprintf(diff_buf, sizeof(diff_buf), "%+ld (%+.2lf%%)",
+					 diff_val, diff_val * 100.0 / base_val);
+			}
+		}
+
+		switch (fmt) {
+		case RESFMT_TABLE_CALCLEN:
+			len = strlen(base_buf);
+			if (len > *max_len_base)
+				*max_len_base = len;
+			if (!is_key_stat(id)) {
+				len = strlen(comp_buf);
+				if (len > *max_len_comp)
+					*max_len_comp = len;
+				len = strlen(diff_buf);
+				if (len > *max_len_diff)
+					*max_len_diff = len;
+			}
+			break;
+		case RESFMT_TABLE: {
+			/* string outputs are left-aligned, number outputs are right-aligned */
+			const char *fmt = base_str ? "%s%-*s" : "%s%*s";
+
+			printf(fmt, i == 0 ? "" : COLUMN_SEP, *max_len_base, base_buf);
+			if (!is_key_stat(id)) {
+				printf(fmt, COLUMN_SEP, *max_len_comp, comp_buf);
+				printf(fmt, COLUMN_SEP, *max_len_diff, diff_buf);
+			}
+			if (i == env.output_spec.spec_cnt - 1)
+				printf("\n");
+			break;
+		}
+		case RESFMT_CSV:
+			printf("%s%s", i == 0 ? "" : ",", base_buf);
+			if (!is_key_stat(id)) {
+				printf("%s%s", i == 0 ? "" : ",", comp_buf);
+				printf("%s%s", i == 0 ? "" : ",", diff_buf);
+			}
+			if (i == env.output_spec.spec_cnt - 1)
+				printf("\n");
+			break;
+		}
+	}
+
+	if (last && fmt == RESFMT_TABLE)
+		output_comp_header_underlines();
+}
+
+static int cmp_stats_key(const struct verif_stats *base, const struct verif_stats *comp)
+{
+	int r;
+
+	r = strcmp(base->file_name, comp->file_name);
+	if (r != 0)
+		return r;
+	return strcmp(base->prog_name, comp->prog_name);
+}
+
+static int handle_comparison_mode(void)
+{
+	struct stat_specs base_specs = {}, comp_specs = {};
+	enum resfmt cur_fmt;
+	int err, i, j;
+
+	if (env.filename_cnt != 2) {
+		fprintf(stderr, "Comparison mode expects exactly two input CSV files!\n");
+		argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat");
+		return -EINVAL;
+	}
+
+	err = parse_stats_csv(env.filenames[0], &base_specs,
+			      &env.baseline_stats, &env.baseline_stat_cnt);
+	if (err) {
+		fprintf(stderr, "Failed to parse stats from '%s': %d\n", env.filenames[0], err);
+		return err;
+	}
+	err = parse_stats_csv(env.filenames[1], &comp_specs,
+			      &env.prog_stats, &env.prog_stat_cnt);
+	if (err) {
+		fprintf(stderr, "Failed to parse stats from '%s': %d\n", env.filenames[1], err);
+		return err;
 	}
-	free(env.prog_stats);
+
+	/* To keep it simple we validate that the set and order of stats in
+	 * both CSVs are exactly the same. This can be lifted with a bit more
+	 * pre-processing later.
+	 */
+	if (base_specs.spec_cnt != comp_specs.spec_cnt) {
+		fprintf(stderr, "Number of stats in '%s' and '%s' differs (%d != %d)!\n",
+			env.filenames[0], env.filenames[1],
+			base_specs.spec_cnt, comp_specs.spec_cnt);
+		return -EINVAL;
+	}
+	for (i = 0; i < base_specs.spec_cnt; i++) {
+		if (base_specs.ids[i] != comp_specs.ids[i]) {
+			fprintf(stderr, "Stats composition differs between '%s' and '%s' (%s != %s)!\n",
+				env.filenames[0], env.filenames[1],
+				stat_defs[base_specs.ids[i]].names[0],
+				stat_defs[comp_specs.ids[i]].names[0]);
+			return -EINVAL;
+		}
+	}
+
+	qsort(env.prog_stats, env.prog_stat_cnt, sizeof(*env.prog_stats), cmp_prog_stats);
+	qsort(env.baseline_stats, env.baseline_stat_cnt, sizeof(*env.baseline_stats), cmp_prog_stats);
+
+	/* for human-readable table output we need to do extra pass to
+	 * calculate column widths, so we substitute current output format
+	 * with RESFMT_TABLE_CALCLEN and later revert it back to RESFMT_TABLE
+	 * and do everything again.
+	 */
+	if (env.out_fmt == RESFMT_TABLE)
+		cur_fmt = RESFMT_TABLE_CALCLEN;
+	else
+		cur_fmt = env.out_fmt;
+
+one_more_time:
+	output_comp_headers(cur_fmt);
+
+	/* If baseline and comparison datasets have different subset of rows
+	 * (we match by 'object + prog' as a unique key) then assume
+	 * empty/missing/zero value for rows that are missing in the opposite
+	 * data set
+	 */
+	i = j = 0;
+	while (i < env.baseline_stat_cnt || j < env.prog_stat_cnt) {
+		bool last = (i == env.baseline_stat_cnt - 1) || (j == env.prog_stat_cnt - 1);
+		const struct verif_stats *base, *comp;
+		int r;
+
+		base = i < env.baseline_stat_cnt ? &env.baseline_stats[i] : &fallback_stats;
+		comp = j < env.prog_stat_cnt ? &env.prog_stats[j] : &fallback_stats;
+
+		if (!base->file_name || !base->prog_name) {
+			fprintf(stderr, "Entry #%d in '%s' doesn't have file and/or program name specified!\n",
+				i, env.filenames[0]);
+			return -EINVAL;
+		}
+		if (!comp->file_name || !comp->prog_name) {
+			fprintf(stderr, "Entry #%d in '%s' doesn't have file and/or program name specified!\n",
+				j, env.filenames[1]);
+			return -EINVAL;
+		}
+
+		r = cmp_stats_key(base, comp);
+		if (r == 0) {
+			output_comp_stats(base, comp, cur_fmt, last);
+			i++;
+			j++;
+		} else if (comp == &fallback_stats || r < 0) {
+			output_comp_stats(base, &fallback_stats, cur_fmt, last);
+			i++;
+		} else {
+			output_comp_stats(&fallback_stats, comp, cur_fmt, last);
+			j++;
+		}
+	}
+
+	if (cur_fmt == RESFMT_TABLE_CALCLEN) {
+		cur_fmt = RESFMT_TABLE;
+		goto one_more_time; /* ... this time with feeling */
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int err = 0, i;
+
+	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
+		return 1;
+
+	if (env.output_spec.spec_cnt == 0)
+		env.output_spec = default_output_spec;
+	if (env.sort_spec.spec_cnt == 0)
+		env.sort_spec = default_sort_spec;
+
+	if (env.comparison_mode)
+		err = handle_comparison_mode();
+	else
+		err = handle_verif_mode();
+
+	free_verif_stats(env.prog_stats, env.prog_stat_cnt);
+	free_verif_stats(env.baseline_stats, env.baseline_stat_cnt);
 	for (i = 0; i < env.filename_cnt; i++)
 		free(env.filenames[i]);
 	free(env.filenames);
-- 
cgit v1.2.3


From bde4a96cdcadc1f9c92cc2715a0022545bfb3201 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 21 Sep 2022 09:42:54 -0700
Subject: selftests/bpf: add ability to filter programs in veristat

Add -f (--filter) argument which accepts glob-based filters for
narrowing down what BPF object files and programs within them should be
processed by veristat. This filtering applies both to comparison and
main (verification) mode.

Filter can be of two forms:
  - file (object) filter: 'strobemeta*'; in this case all the programs
    within matching files are implicitly allowed (or denied, depending
    if it's positive or negative rule, see below);
  - file and prog filter: 'strobemeta*/*unroll*' will further filter
    programs within matching files to only allow those program names that
    match '*unroll*' glob.

As mentioned, filters can be positive (allowlisting) and negative
(denylisting). Negative filters should start with '!': '!strobemeta*'
will deny any filename which basename starts with "strobemeta".

Further, one extra special syntax is supported to allow more convenient
use in practice. Instead of specifying rule on the command line,
veristat allows to specify file that contains rules, both positive and
negative, one line per one filter. This is achieved with -f @<filepath>
use, where <filepath> points to a text file containing rules (negative
and positive rules can be mixed). For convenience empty lines and lines
starting with '#' are ignored. This feature is useful to have some
pre-canned list of object files and program names that are tested
repeatedly, allowing to check in a list of rules and quickly specify
them on the command line.

As a demonstration (and a short cut for nearest future), create a small
list of "interesting" BPF object files from selftests/bpf and commit it
as veristat.cfg. It currently includes 73 programs, most of which are
the most complex and largest BPF programs in selftests, as judged by
total verified instruction count and verifier states total.

If there is overlap between positive or negative filters, negative
filter takes precedence (denylisting is stronger than allowlisting). If
no allow filter is specified, veristat implicitly assumes '*/*' rule. If
no deny rule is specified, veristat (logically) assumes no negative
filters.

Also note that -f (just like -e and -s) can be specified multiple times
and their effect is cumulative.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220921164254.3630690-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c   | 212 ++++++++++++++++++++++++++++++-
 tools/testing/selftests/bpf/veristat.cfg |  17 +++
 2 files changed, 227 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/veristat.cfg

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index c6837bac357f..51030234b60a 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -52,6 +52,11 @@ enum resfmt {
 	RESFMT_CSV,
 };
 
+struct filter {
+	char *file_glob;
+	char *prog_glob;
+};
+
 static struct env {
 	char **filenames;
 	int filename_cnt;
@@ -68,6 +73,11 @@ static struct env {
 
 	struct stat_specs output_spec;
 	struct stat_specs sort_spec;
+
+	struct filter *allow_filters;
+	struct filter *deny_filters;
+	int allow_filter_cnt;
+	int deny_filter_cnt;
 } env;
 
 static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
@@ -94,10 +104,13 @@ static const struct argp_option opts[] = {
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
 	{ "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table." },
 	{ "compare", 'C', NULL, 0, "Comparison mode" },
+	{ "filter", 'f', "FILTER", 0, "Filter expressions (or @filename for file with expressions)." },
 	{},
 };
 
 static int parse_stats(const char *stats_str, struct stat_specs *specs);
+static int append_filter(struct filter **filters, int *cnt, const char *str);
+static int append_filter_file(const char *path);
 
 static error_t parse_arg(int key, char *arg, struct argp_state *state)
 {
@@ -134,6 +147,18 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case 'C':
 		env.comparison_mode = true;
 		break;
+	case 'f':
+		if (arg[0] == '@')
+			err = append_filter_file(arg + 1);
+		else if (arg[0] == '!')
+			err = append_filter(&env.deny_filters, &env.deny_filter_cnt, arg + 1);
+		else
+			err = append_filter(&env.allow_filters, &env.allow_filter_cnt, arg);
+		if (err) {
+			fprintf(stderr, "Failed to collect program filter expressions: %d\n", err);
+			return err;
+		}
+		break;
 	case ARGP_KEY_ARG:
 		tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames));
 		if (!tmp)
@@ -156,6 +181,150 @@ static const struct argp argp = {
 	.doc = argp_program_doc,
 };
 
+
+/* Adapted from perf/util/string.c */
+static bool glob_matches(const char *str, const char *pat)
+{
+	while (*str && *pat && *pat != '*') {
+		if (*str != *pat)
+			return false;
+		str++;
+		pat++;
+	}
+	/* Check wild card */
+	if (*pat == '*') {
+		while (*pat == '*')
+			pat++;
+		if (!*pat) /* Tail wild card matches all */
+			return true;
+		while (*str)
+			if (glob_matches(str++, pat))
+				return true;
+	}
+	return !*str && !*pat;
+}
+
+static bool should_process_file(const char *filename)
+{
+	int i;
+
+	if (env.deny_filter_cnt > 0) {
+		for (i = 0; i < env.deny_filter_cnt; i++) {
+			if (glob_matches(filename, env.deny_filters[i].file_glob))
+				return false;
+		}
+	}
+
+	if (env.allow_filter_cnt == 0)
+		return true;
+
+	for (i = 0; i < env.allow_filter_cnt; i++) {
+		if (glob_matches(filename, env.allow_filters[i].file_glob))
+			return true;
+	}
+
+	return false;
+}
+
+static bool should_process_prog(const char *filename, const char *prog_name)
+{
+	int i;
+
+	if (env.deny_filter_cnt > 0) {
+		for (i = 0; i < env.deny_filter_cnt; i++) {
+			if (glob_matches(filename, env.deny_filters[i].file_glob))
+				return false;
+			if (!env.deny_filters[i].prog_glob)
+				continue;
+			if (glob_matches(prog_name, env.deny_filters[i].prog_glob))
+				return false;
+		}
+	}
+
+	if (env.allow_filter_cnt == 0)
+		return true;
+
+	for (i = 0; i < env.allow_filter_cnt; i++) {
+		if (!glob_matches(filename, env.allow_filters[i].file_glob))
+			continue;
+		/* if filter specifies only filename glob part, it implicitly
+		 * allows all progs within that file
+		 */
+		if (!env.allow_filters[i].prog_glob)
+			return true;
+		if (glob_matches(prog_name, env.allow_filters[i].prog_glob))
+			return true;
+	}
+
+	return false;
+}
+
+static int append_filter(struct filter **filters, int *cnt, const char *str)
+{
+	struct filter *f;
+	void *tmp;
+	const char *p;
+
+	tmp = realloc(*filters, (*cnt + 1) * sizeof(**filters));
+	if (!tmp)
+		return -ENOMEM;
+	*filters = tmp;
+
+	f = &(*filters)[*cnt];
+	f->file_glob = f->prog_glob = NULL;
+
+	/* filter can be specified either as "<obj-glob>" or "<obj-glob>/<prog-glob>" */
+	p = strchr(str, '/');
+	if (!p) {
+		f->file_glob = strdup(str);
+		if (!f->file_glob)
+			return -ENOMEM;
+	} else {
+		f->file_glob = strndup(str, p - str);
+		f->prog_glob = strdup(p + 1);
+		if (!f->file_glob || !f->prog_glob) {
+			free(f->file_glob);
+			free(f->prog_glob);
+			f->file_glob = f->prog_glob = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	*cnt = *cnt + 1;
+	return 0;
+}
+
+static int append_filter_file(const char *path)
+{
+	char buf[1024];
+	FILE *f;
+	int err = 0;
+
+	f = fopen(path, "r");
+	if (!f) {
+		err = -errno;
+		fprintf(stderr, "Failed to open '%s': %d\n", path, err);
+		return err;
+	}
+
+	while (fscanf(f, " %1023[^\n]\n", buf) == 1) {
+		/* lines starting with # are comments, skip them */
+		if (buf[0] == '\0' || buf[0] == '#')
+			continue;
+		/* lines starting with ! are negative match filters */
+		if (buf[0] == '!')
+			err = append_filter(&env.deny_filters, &env.deny_filter_cnt, buf + 1);
+		else
+			err = append_filter(&env.allow_filters, &env.allow_filter_cnt, buf);
+		if (err)
+			goto cleanup;
+	}
+
+cleanup:
+	fclose(f);
+	return err;
+}
+
 static const struct stat_specs default_output_spec = {
 	.spec_cnt = 7,
 	.ids = {
@@ -283,6 +452,9 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 	int err = 0;
 	void *tmp;
 
+	if (!should_process_prog(basename(filename), bpf_program__name(prog)))
+		return 0;
+
 	tmp = realloc(env.prog_stats, (env.prog_stat_cnt + 1) * sizeof(*env.prog_stats));
 	if (!tmp)
 		return -ENOMEM;
@@ -330,6 +502,9 @@ static int process_obj(const char *filename)
 	LIBBPF_OPTS(bpf_object_open_opts, opts);
 	int err = 0, prog_cnt = 0;
 
+	if (!should_process_file(basename(filename)))
+		return 0;
+
 	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
 
 	obj = bpf_object__open_file(filename, &opts);
@@ -666,7 +841,10 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs,
 				goto cleanup;
 			}
 			*statsp = tmp;
+
 			st = &(*statsp)[*stat_cntp];
+			memset(st, 0, sizeof(*st));
+
 			*stat_cntp += 1;
 		}
 
@@ -692,14 +870,34 @@ static int parse_stats_csv(const char *filename, struct stat_specs *specs,
 			col++;
 		}
 
-		if (!header && col < specs->spec_cnt) {
+		if (header) {
+			header = false;
+			continue;
+		}
+
+		if (col < specs->spec_cnt) {
 			fprintf(stderr, "Not enough columns in row #%d in '%s'\n",
 				*stat_cntp, filename);
 			err = -EINVAL;
 			goto cleanup;
 		}
 
-		header = false;
+		if (!st->file_name || !st->prog_name) {
+			fprintf(stderr, "Row #%d in '%s' is missing file and/or program name\n",
+				*stat_cntp, filename);
+			err = -EINVAL;
+			goto cleanup;
+		}
+
+		/* in comparison mode we can only check filters after we
+		 * parsed entire line; if row should be ignored we pretend we
+		 * never parsed it
+		 */
+		if (!should_process_prog(st->file_name, st->prog_name)) {
+			free(st->file_name);
+			free(st->prog_name);
+			*stat_cntp -= 1;
+		}
 	}
 
 	if (!feof(f)) {
@@ -1012,5 +1210,15 @@ int main(int argc, char **argv)
 	for (i = 0; i < env.filename_cnt; i++)
 		free(env.filenames[i]);
 	free(env.filenames);
+	for (i = 0; i < env.allow_filter_cnt; i++) {
+		free(env.allow_filters[i].file_glob);
+		free(env.allow_filters[i].prog_glob);
+	}
+	free(env.allow_filters);
+	for (i = 0; i < env.deny_filter_cnt; i++) {
+		free(env.deny_filters[i].file_glob);
+		free(env.deny_filters[i].prog_glob);
+	}
+	free(env.deny_filters);
 	return -err;
 }
diff --git a/tools/testing/selftests/bpf/veristat.cfg b/tools/testing/selftests/bpf/veristat.cfg
new file mode 100644
index 000000000000..1a385061618d
--- /dev/null
+++ b/tools/testing/selftests/bpf/veristat.cfg
@@ -0,0 +1,17 @@
+# pre-canned list of rather complex selftests/bpf BPF object files to monitor
+# BPF verifier's performance on
+bpf_flow*
+bpf_loop_bench*
+loop*
+netif_receive_skb*
+profiler*
+pyperf*
+strobemeta*
+test_cls_redirect*
+test_l4lb
+test_sysctl*
+test_tcp_hdr_*
+test_usdt*
+test_verif_scale*
+test_xdp_noinline*
+xdp_synproxy*
-- 
cgit v1.2.3


From 1b64daf413acd86c2c13f5443f6b4ef3690c8061 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 7 Sep 2022 15:41:03 -0700
Subject: ARM: decompressor: Include .data.rel.ro.local

The .data.rel.ro.local section has the same semantics as .data.rel.ro
here, so include it in the .rodata section of the decompressor.
Additionally since the .printk_index section isn't usable outside of
the core kernel, discard it in the decompressor. Avoids these warnings:

arm-linux-gnueabi-ld: warning: orphan section `.data.rel.ro.local' from `arch/arm/boot/compressed/fdt_rw.o' being placed in section `.data.rel.ro.local'
arm-linux-gnueabi-ld: warning: orphan section `.printk_index' from `arch/arm/boot/compressed/fdt_rw.o' being placed in section `.printk_index'

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/linux-mm/202209080545.qMIVj7YM-lkp@intel.com
Cc: Russell King <linux@armlinux.org.uk>
Cc: linux-arm-kernel@lists.infradead.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/boot/compressed/vmlinux.lds.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/boot/compressed/vmlinux.lds.S b/arch/arm/boot/compressed/vmlinux.lds.S
index 1bcb68ac4b01..3fcb3e62dc56 100644
--- a/arch/arm/boot/compressed/vmlinux.lds.S
+++ b/arch/arm/boot/compressed/vmlinux.lds.S
@@ -23,6 +23,7 @@ SECTIONS
     *(.ARM.extab*)
     *(.note.*)
     *(.rel.*)
+    *(.printk_index)
     /*
      * Discard any r/w data - this produces a link error if we have any,
      * which is required for PIC decompression.  Local data generates
@@ -57,6 +58,7 @@ SECTIONS
     *(.rodata)
     *(.rodata.*)
     *(.data.rel.ro)
+    *(.data.rel.ro.*)
   }
   .piggydata : {
     *(.piggydata)
-- 
cgit v1.2.3


From d7e7b9af104c7b389a0c21eb26532511bce4b510 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Sep 2022 12:32:06 -0700
Subject: fscrypt: stop using keyrings subsystem for fscrypt_master_key

The approach of fs/crypto/ internally managing the fscrypt_master_key
structs as the payloads of "struct key" objects contained in a
"struct key" keyring has outlived its usefulness.  The original idea was
to simplify the code by reusing code from the keyrings subsystem.
However, several issues have arisen that can't easily be resolved:

- When a master key struct is destroyed, blk_crypto_evict_key() must be
  called on any per-mode keys embedded in it.  (This started being the
  case when inline encryption support was added.)  Yet, the keyrings
  subsystem can arbitrarily delay the destruction of keys, even past the
  time the filesystem was unmounted.  Therefore, currently there is no
  easy way to call blk_crypto_evict_key() when a master key is
  destroyed.  Currently, this is worked around by holding an extra
  reference to the filesystem's request_queue(s).  But it was overlooked
  that the request_queue reference is *not* guaranteed to pin the
  corresponding blk_crypto_profile too; for device-mapper devices that
  support inline crypto, it doesn't.  This can cause a use-after-free.

- When the last inode that was using an incompletely-removed master key
  is evicted, the master key removal is completed by removing the key
  struct from the keyring.  Currently this is done via key_invalidate().
  Yet, key_invalidate() takes the key semaphore.  This can deadlock when
  called from the shrinker, since in fscrypt_ioctl_add_key(), memory is
  allocated with GFP_KERNEL under the same semaphore.

- More generally, the fact that the keyrings subsystem can arbitrarily
  delay the destruction of keys (via garbage collection delay, or via
  random processes getting temporary key references) is undesirable, as
  it means we can't strictly guarantee that all secrets are ever wiped.

- Doing the master key lookups via the keyrings subsystem results in the
  key_permission LSM hook being called.  fscrypt doesn't want this, as
  all access control for encrypted files is designed to happen via the
  files themselves, like any other files.  The workaround which SELinux
  users are using is to change their SELinux policy to grant key search
  access to all domains.  This works, but it is an odd extra step that
  shouldn't really have to be done.

The fix for all these issues is to change the implementation to what I
should have done originally: don't use the keyrings subsystem to keep
track of the filesystem's fscrypt_master_key structs.  Instead, just
store them in a regular kernel data structure, and rework the reference
counting, locking, and lifetime accordingly.  Retain support for
RCU-mode key lookups by using a hash table.  Replace fscrypt_sb_free()
with fscrypt_sb_delete(), which releases the keys synchronously and runs
a bit earlier during unmount, so that block devices are still available.

A side effect of this patch is that neither the master keys themselves
nor the filesystem keyrings will be listed in /proc/keys anymore.
("Master key users" and the master key users keyrings will still be
listed.)  However, this was mostly an implementation detail, and it was
intended just for debugging purposes.  I don't know of anyone using it.

This patch does *not* change how "master key users" (->mk_users) works;
that still uses the keyrings subsystem.  That is still needed for key
quotas, and changing that isn't necessary to solve the issues listed
above.  If we decide to change that too, it would be a separate patch.

I've marked this as fixing the original commit that added the fscrypt
keyring, but as noted above the most important issue that this patch
fixes wasn't introduced until the addition of inline encryption support.

Fixes: 22d94f493bfb ("fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl")
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220901193208.138056-2-ebiggers@kernel.org
---
 fs/crypto/fscrypt_private.h |  71 +++++--
 fs/crypto/hooks.c           |  10 +-
 fs/crypto/keyring.c         | 486 ++++++++++++++++++++++++--------------------
 fs/crypto/keysetup.c        |  81 +++-----
 fs/crypto/policy.c          |   8 +-
 fs/super.c                  |   2 +-
 include/linux/fs.h          |   2 +-
 include/linux/fscrypt.h     |   4 +-
 8 files changed, 353 insertions(+), 311 deletions(-)

diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 3afdaa084773..577cae7facb0 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -225,7 +225,7 @@ struct fscrypt_info {
 	 * will be NULL if the master key was found in a process-subscribed
 	 * keyring rather than in the filesystem-level keyring.
 	 */
-	struct key *ci_master_key;
+	struct fscrypt_master_key *ci_master_key;
 
 	/*
 	 * Link in list of inodes that were unlocked with the master key.
@@ -436,6 +436,40 @@ struct fscrypt_master_key_secret {
  */
 struct fscrypt_master_key {
 
+	/*
+	 * Back-pointer to the super_block of the filesystem to which this
+	 * master key has been added.  Only valid if ->mk_active_refs > 0.
+	 */
+	struct super_block			*mk_sb;
+
+	/*
+	 * Link in ->mk_sb->s_master_keys->key_hashtable.
+	 * Only valid if ->mk_active_refs > 0.
+	 */
+	struct hlist_node			mk_node;
+
+	/* Semaphore that protects ->mk_secret and ->mk_users */
+	struct rw_semaphore			mk_sem;
+
+	/*
+	 * Active and structural reference counts.  An active ref guarantees
+	 * that the struct continues to exist, continues to be in the keyring
+	 * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g.
+	 * ->mk_direct_keys) that have been prepared continue to exist.
+	 * A structural ref only guarantees that the struct continues to exist.
+	 *
+	 * There is one active ref associated with ->mk_secret being present,
+	 * and one active ref for each inode in ->mk_decrypted_inodes.
+	 *
+	 * There is one structural ref associated with the active refcount being
+	 * nonzero.  Finding a key in the keyring also takes a structural ref,
+	 * which is then held temporarily while the key is operated on.
+	 */
+	refcount_t				mk_active_refs;
+	refcount_t				mk_struct_refs;
+
+	struct rcu_head				mk_rcu_head;
+
 	/*
 	 * The secret key material.  After FS_IOC_REMOVE_ENCRYPTION_KEY is
 	 * executed, this is wiped and no new inodes can be unlocked with this
@@ -444,7 +478,10 @@ struct fscrypt_master_key {
 	 * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
 	 * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
 	 *
-	 * Locking: protected by this master key's key->sem.
+	 * While ->mk_secret is present, one ref in ->mk_active_refs is held.
+	 *
+	 * Locking: protected by ->mk_sem.  The manipulation of ->mk_active_refs
+	 *	    associated with this field is protected by ->mk_sem as well.
 	 */
 	struct fscrypt_master_key_secret	mk_secret;
 
@@ -465,22 +502,12 @@ struct fscrypt_master_key {
 	 *
 	 * This is NULL for v1 policy keys; those can only be added by root.
 	 *
-	 * Locking: in addition to this keyring's own semaphore, this is
-	 * protected by this master key's key->sem, so we can do atomic
-	 * search+insert.  It can also be searched without taking any locks, but
-	 * in that case the returned key may have already been removed.
+	 * Locking: protected by ->mk_sem.  (We don't just rely on the keyrings
+	 * subsystem semaphore ->mk_users->sem, as we need support for atomic
+	 * search+insert along with proper synchronization with ->mk_secret.)
 	 */
 	struct key		*mk_users;
 
-	/*
-	 * Length of ->mk_decrypted_inodes, plus one if mk_secret is present.
-	 * Once this goes to 0, the master key is removed from ->s_master_keys.
-	 * The 'struct fscrypt_master_key' will continue to live as long as the
-	 * 'struct key' whose payload it is, but we won't let this reference
-	 * count rise again.
-	 */
-	refcount_t		mk_refcount;
-
 	/*
 	 * List of inodes that were unlocked using this key.  This allows the
 	 * inodes to be evicted efficiently if the key is removed.
@@ -506,10 +533,10 @@ static inline bool
 is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
 {
 	/*
-	 * The READ_ONCE() is only necessary for fscrypt_drop_inode() and
-	 * fscrypt_key_describe().  These run in atomic context, so they can't
-	 * take the key semaphore and thus 'secret' can change concurrently
-	 * which would be a data race.  But they only need to know whether the
+	 * The READ_ONCE() is only necessary for fscrypt_drop_inode().
+	 * fscrypt_drop_inode() runs in atomic context, so it can't take the key
+	 * semaphore and thus 'secret' can change concurrently which would be a
+	 * data race.  But fscrypt_drop_inode() only need to know whether the
 	 * secret *was* present at the time of check, so READ_ONCE() suffices.
 	 */
 	return READ_ONCE(secret->size) != 0;
@@ -538,7 +565,11 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
 	return 0;
 }
 
-struct key *
+void fscrypt_put_master_key(struct fscrypt_master_key *mk);
+
+void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk);
+
+struct fscrypt_master_key *
 fscrypt_find_master_key(struct super_block *sb,
 			const struct fscrypt_key_specifier *mk_spec);
 
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 7c01025879b3..7b8c5a1104b5 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,8 +5,6 @@
  * Encryption hooks for higher-level filesystem operations.
  */
 
-#include <linux/key.h>
-
 #include "fscrypt_private.h"
 
 /**
@@ -142,7 +140,6 @@ int fscrypt_prepare_setflags(struct inode *inode,
 			     unsigned int oldflags, unsigned int flags)
 {
 	struct fscrypt_info *ci;
-	struct key *key;
 	struct fscrypt_master_key *mk;
 	int err;
 
@@ -158,14 +155,13 @@ int fscrypt_prepare_setflags(struct inode *inode,
 		ci = inode->i_crypt_info;
 		if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
 			return -EINVAL;
-		key = ci->ci_master_key;
-		mk = key->payload.data[0];
-		down_read(&key->sem);
+		mk = ci->ci_master_key;
+		down_read(&mk->mk_sem);
 		if (is_master_key_secret_present(&mk->mk_secret))
 			err = fscrypt_derive_dirhash_key(ci, mk);
 		else
 			err = -ENOKEY;
-		up_read(&key->sem);
+		up_read(&mk->mk_sem);
 		return err;
 	}
 	return 0;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index caee9f8620dd..9b98d6a576e6 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,6 +18,7 @@
  * information about these ioctls.
  */
 
+#include <asm/unaligned.h>
 #include <crypto/skcipher.h>
 #include <linux/key-type.h>
 #include <linux/random.h>
@@ -25,6 +26,18 @@
 
 #include "fscrypt_private.h"
 
+/* The master encryption keys for a filesystem (->s_master_keys) */
+struct fscrypt_keyring {
+	/*
+	 * Lock that protects ->key_hashtable.  It does *not* protect the
+	 * fscrypt_master_key structs themselves.
+	 */
+	spinlock_t lock;
+
+	/* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */
+	struct hlist_head key_hashtable[128];
+};
+
 static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
 {
 	fscrypt_destroy_hkdf(&secret->hkdf);
@@ -38,20 +51,70 @@ static void move_master_key_secret(struct fscrypt_master_key_secret *dst,
 	memzero_explicit(src, sizeof(*src));
 }
 
-static void free_master_key(struct fscrypt_master_key *mk)
+static void fscrypt_free_master_key(struct rcu_head *head)
+{
+	struct fscrypt_master_key *mk =
+		container_of(head, struct fscrypt_master_key, mk_rcu_head);
+	/*
+	 * The master key secret and any embedded subkeys should have already
+	 * been wiped when the last active reference to the fscrypt_master_key
+	 * struct was dropped; doing it here would be unnecessarily late.
+	 * Nevertheless, use kfree_sensitive() in case anything was missed.
+	 */
+	kfree_sensitive(mk);
+}
+
+void fscrypt_put_master_key(struct fscrypt_master_key *mk)
+{
+	if (!refcount_dec_and_test(&mk->mk_struct_refs))
+		return;
+	/*
+	 * No structural references left, so free ->mk_users, and also free the
+	 * fscrypt_master_key struct itself after an RCU grace period ensures
+	 * that concurrent keyring lookups can no longer find it.
+	 */
+	WARN_ON(refcount_read(&mk->mk_active_refs) != 0);
+	key_put(mk->mk_users);
+	mk->mk_users = NULL;
+	call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
+}
+
+void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
 {
+	struct super_block *sb = mk->mk_sb;
+	struct fscrypt_keyring *keyring = sb->s_master_keys;
 	size_t i;
 
-	wipe_master_key_secret(&mk->mk_secret);
+	if (!refcount_dec_and_test(&mk->mk_active_refs))
+		return;
+	/*
+	 * No active references left, so complete the full removal of this
+	 * fscrypt_master_key struct by removing it from the keyring and
+	 * destroying any subkeys embedded in it.
+	 */
+
+	spin_lock(&keyring->lock);
+	hlist_del_rcu(&mk->mk_node);
+	spin_unlock(&keyring->lock);
+
+	/*
+	 * ->mk_active_refs == 0 implies that ->mk_secret is not present and
+	 * that ->mk_decrypted_inodes is empty.
+	 */
+	WARN_ON(is_master_key_secret_present(&mk->mk_secret));
+	WARN_ON(!list_empty(&mk->mk_decrypted_inodes));
 
 	for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
 		fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
 		fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
 		fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
 	}
+	memzero_explicit(&mk->mk_ino_hash_key,
+			 sizeof(mk->mk_ino_hash_key));
+	mk->mk_ino_hash_key_initialized = false;
 
-	key_put(mk->mk_users);
-	kfree_sensitive(mk);
+	/* Drop the structural ref associated with the active refs. */
+	fscrypt_put_master_key(mk);
 }
 
 static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
@@ -61,44 +124,6 @@ static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
 	return master_key_spec_len(spec) != 0;
 }
 
-static int fscrypt_key_instantiate(struct key *key,
-				   struct key_preparsed_payload *prep)
-{
-	key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
-	return 0;
-}
-
-static void fscrypt_key_destroy(struct key *key)
-{
-	free_master_key(key->payload.data[0]);
-}
-
-static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
-{
-	seq_puts(m, key->description);
-
-	if (key_is_positive(key)) {
-		const struct fscrypt_master_key *mk = key->payload.data[0];
-
-		if (!is_master_key_secret_present(&mk->mk_secret))
-			seq_puts(m, ": secret removed");
-	}
-}
-
-/*
- * Type of key in ->s_master_keys.  Each key of this type represents a master
- * key which has been added to the filesystem.  Its payload is a
- * 'struct fscrypt_master_key'.  The "." prefix in the key type name prevents
- * users from adding keys of this type via the keyrings syscalls rather than via
- * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
- */
-static struct key_type key_type_fscrypt = {
-	.name			= "._fscrypt",
-	.instantiate		= fscrypt_key_instantiate,
-	.destroy		= fscrypt_key_destroy,
-	.describe		= fscrypt_key_describe,
-};
-
 static int fscrypt_user_key_instantiate(struct key *key,
 					struct key_preparsed_payload *prep)
 {
@@ -131,32 +156,6 @@ static struct key_type key_type_fscrypt_user = {
 	.describe		= fscrypt_user_key_describe,
 };
 
-/* Search ->s_master_keys or ->mk_users */
-static struct key *search_fscrypt_keyring(struct key *keyring,
-					  struct key_type *type,
-					  const char *description)
-{
-	/*
-	 * We need to mark the keyring reference as "possessed" so that we
-	 * acquire permission to search it, via the KEY_POS_SEARCH permission.
-	 */
-	key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
-
-	keyref = keyring_search(keyref, type, description, false);
-	if (IS_ERR(keyref)) {
-		if (PTR_ERR(keyref) == -EAGAIN || /* not found */
-		    PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
-			keyref = ERR_PTR(-ENOKEY);
-		return ERR_CAST(keyref);
-	}
-	return key_ref_to_ptr(keyref);
-}
-
-#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE	\
-	(CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id))
-
-#define FSCRYPT_MK_DESCRIPTION_SIZE	(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
-
 #define FSCRYPT_MK_USERS_DESCRIPTION_SIZE	\
 	(CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
 	 CONST_STRLEN("-users") + 1)
@@ -164,21 +163,6 @@ static struct key *search_fscrypt_keyring(struct key *keyring,
 #define FSCRYPT_MK_USER_DESCRIPTION_SIZE	\
 	(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)
 
-static void format_fs_keyring_description(
-			char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
-			const struct super_block *sb)
-{
-	sprintf(description, "fscrypt-%s", sb->s_id);
-}
-
-static void format_mk_description(
-			char description[FSCRYPT_MK_DESCRIPTION_SIZE],
-			const struct fscrypt_key_specifier *mk_spec)
-{
-	sprintf(description, "%*phN",
-		master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
-}
-
 static void format_mk_users_keyring_description(
 			char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
 			const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
@@ -199,20 +183,15 @@ static void format_mk_user_description(
 /* Create ->s_master_keys if needed.  Synchronized by fscrypt_add_key_mutex. */
 static int allocate_filesystem_keyring(struct super_block *sb)
 {
-	char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
-	struct key *keyring;
+	struct fscrypt_keyring *keyring;
 
 	if (sb->s_master_keys)
 		return 0;
 
-	format_fs_keyring_description(description, sb);
-	keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
-				current_cred(), KEY_POS_SEARCH |
-				  KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
-				KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
-	if (IS_ERR(keyring))
-		return PTR_ERR(keyring);
-
+	keyring = kzalloc(sizeof(*keyring), GFP_KERNEL);
+	if (!keyring)
+		return -ENOMEM;
+	spin_lock_init(&keyring->lock);
 	/*
 	 * Pairs with the smp_load_acquire() in fscrypt_find_master_key().
 	 * I.e., here we publish ->s_master_keys with a RELEASE barrier so that
@@ -222,21 +201,75 @@ static int allocate_filesystem_keyring(struct super_block *sb)
 	return 0;
 }
 
-void fscrypt_sb_free(struct super_block *sb)
+/*
+ * This is called at unmount time to release all encryption keys that have been
+ * added to the filesystem, along with the keyring that contains them.
+ *
+ * Note that besides clearing and freeing memory, this might need to evict keys
+ * from the keyslots of an inline crypto engine.  Therefore, this must be called
+ * while the filesystem's underlying block device(s) are still available.
+ */
+void fscrypt_sb_delete(struct super_block *sb)
 {
-	key_put(sb->s_master_keys);
+	struct fscrypt_keyring *keyring = sb->s_master_keys;
+	size_t i;
+
+	if (!keyring)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) {
+		struct hlist_head *bucket = &keyring->key_hashtable[i];
+		struct fscrypt_master_key *mk;
+		struct hlist_node *tmp;
+
+		hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) {
+			/*
+			 * Since all inodes were already evicted, every key
+			 * remaining in the keyring should have an empty inode
+			 * list, and should only still be in the keyring due to
+			 * the single active ref associated with ->mk_secret.
+			 * There should be no structural refs beyond the one
+			 * associated with the active ref.
+			 */
+			WARN_ON(refcount_read(&mk->mk_active_refs) != 1);
+			WARN_ON(refcount_read(&mk->mk_struct_refs) != 1);
+			WARN_ON(!is_master_key_secret_present(&mk->mk_secret));
+			wipe_master_key_secret(&mk->mk_secret);
+			fscrypt_put_master_key_activeref(mk);
+		}
+	}
+	kfree_sensitive(keyring);
 	sb->s_master_keys = NULL;
 }
 
+static struct hlist_head *
+fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring,
+		       const struct fscrypt_key_specifier *mk_spec)
+{
+	/*
+	 * Since key specifiers should be "random" values, it is sufficient to
+	 * use a trivial hash function that just takes the first several bits of
+	 * the key specifier.
+	 */
+	unsigned long i = get_unaligned((unsigned long *)&mk_spec->u);
+
+	return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)];
+}
+
 /*
- * Find the specified master key in ->s_master_keys.
- * Returns ERR_PTR(-ENOKEY) if not found.
+ * Find the specified master key struct in ->s_master_keys and take a structural
+ * ref to it.  The structural ref guarantees that the key struct continues to
+ * exist, but it does *not* guarantee that ->s_master_keys continues to contain
+ * the key struct.  The structural ref needs to be dropped by
+ * fscrypt_put_master_key().  Returns NULL if the key struct is not found.
  */
-struct key *fscrypt_find_master_key(struct super_block *sb,
-				    const struct fscrypt_key_specifier *mk_spec)
+struct fscrypt_master_key *
+fscrypt_find_master_key(struct super_block *sb,
+			const struct fscrypt_key_specifier *mk_spec)
 {
-	struct key *keyring;
-	char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+	struct fscrypt_keyring *keyring;
+	struct hlist_head *bucket;
+	struct fscrypt_master_key *mk;
 
 	/*
 	 * Pairs with the smp_store_release() in allocate_filesystem_keyring().
@@ -246,10 +279,38 @@ struct key *fscrypt_find_master_key(struct super_block *sb,
 	 */
 	keyring = smp_load_acquire(&sb->s_master_keys);
 	if (keyring == NULL)
-		return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
-
-	format_mk_description(description, mk_spec);
-	return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+		return NULL; /* No keyring yet, so no keys yet. */
+
+	bucket = fscrypt_mk_hash_bucket(keyring, mk_spec);
+	rcu_read_lock();
+	switch (mk_spec->type) {
+	case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+		hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+			if (mk->mk_spec.type ==
+				FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+			    memcmp(mk->mk_spec.u.descriptor,
+				   mk_spec->u.descriptor,
+				   FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
+			    refcount_inc_not_zero(&mk->mk_struct_refs))
+				goto out;
+		}
+		break;
+	case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+		hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+			if (mk->mk_spec.type ==
+				FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
+			    memcmp(mk->mk_spec.u.identifier,
+				   mk_spec->u.identifier,
+				   FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 &&
+			    refcount_inc_not_zero(&mk->mk_struct_refs))
+				goto out;
+		}
+		break;
+	}
+	mk = NULL;
+out:
+	rcu_read_unlock();
+	return mk;
 }
 
 static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
@@ -277,17 +338,30 @@ static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
 static struct key *find_master_key_user(struct fscrypt_master_key *mk)
 {
 	char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+	key_ref_t keyref;
 
 	format_mk_user_description(description, mk->mk_spec.u.identifier);
-	return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user,
-				      description);
+
+	/*
+	 * We need to mark the keyring reference as "possessed" so that we
+	 * acquire permission to search it, via the KEY_POS_SEARCH permission.
+	 */
+	keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/),
+				&key_type_fscrypt_user, description, false);
+	if (IS_ERR(keyref)) {
+		if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+		    PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+			keyref = ERR_PTR(-ENOKEY);
+		return ERR_CAST(keyref);
+	}
+	return key_ref_to_ptr(keyref);
 }
 
 /*
  * Give the current user a "key" in ->mk_users.  This charges the user's quota
  * and marks the master key as added by the current user, so that it cannot be
- * removed by another user with the key.  Either the master key's key->sem must
- * be held for write, or the master key must be still undergoing initialization.
+ * removed by another user with the key.  Either ->mk_sem must be held for
+ * write, or the master key must be still undergoing initialization.
  */
 static int add_master_key_user(struct fscrypt_master_key *mk)
 {
@@ -309,7 +383,7 @@ static int add_master_key_user(struct fscrypt_master_key *mk)
 
 /*
  * Remove the current user's "key" from ->mk_users.
- * The master key's key->sem must be held for write.
+ * ->mk_sem must be held for write.
  *
  * Returns 0 if removed, -ENOKEY if not found, or another -errno code.
  */
@@ -327,63 +401,49 @@ static int remove_master_key_user(struct fscrypt_master_key *mk)
 }
 
 /*
- * Allocate a new fscrypt_master_key which contains the given secret, set it as
- * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
- * into the given keyring.  Synchronized by fscrypt_add_key_mutex.
+ * Allocate a new fscrypt_master_key, transfer the given secret over to it, and
+ * insert it into sb->s_master_keys.
  */
-static int add_new_master_key(struct fscrypt_master_key_secret *secret,
-			      const struct fscrypt_key_specifier *mk_spec,
-			      struct key *keyring)
+static int add_new_master_key(struct super_block *sb,
+			      struct fscrypt_master_key_secret *secret,
+			      const struct fscrypt_key_specifier *mk_spec)
 {
+	struct fscrypt_keyring *keyring = sb->s_master_keys;
 	struct fscrypt_master_key *mk;
-	char description[FSCRYPT_MK_DESCRIPTION_SIZE];
-	struct key *key;
 	int err;
 
 	mk = kzalloc(sizeof(*mk), GFP_KERNEL);
 	if (!mk)
 		return -ENOMEM;
 
+	mk->mk_sb = sb;
+	init_rwsem(&mk->mk_sem);
+	refcount_set(&mk->mk_struct_refs, 1);
 	mk->mk_spec = *mk_spec;
 
-	move_master_key_secret(&mk->mk_secret, secret);
-
-	refcount_set(&mk->mk_refcount, 1); /* secret is present */
 	INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
 	spin_lock_init(&mk->mk_decrypted_inodes_lock);
 
 	if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
 		err = allocate_master_key_users_keyring(mk);
 		if (err)
-			goto out_free_mk;
+			goto out_put;
 		err = add_master_key_user(mk);
 		if (err)
-			goto out_free_mk;
+			goto out_put;
 	}
 
-	/*
-	 * Note that we don't charge this key to anyone's quota, since when
-	 * ->mk_users is in use those keys are charged instead, and otherwise
-	 * (when ->mk_users isn't in use) only root can add these keys.
-	 */
-	format_mk_description(description, mk_spec);
-	key = key_alloc(&key_type_fscrypt, description,
-			GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
-			KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
-			KEY_ALLOC_NOT_IN_QUOTA, NULL);
-	if (IS_ERR(key)) {
-		err = PTR_ERR(key);
-		goto out_free_mk;
-	}
-	err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
-	key_put(key);
-	if (err)
-		goto out_free_mk;
+	move_master_key_secret(&mk->mk_secret, secret);
+	refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
 
+	spin_lock(&keyring->lock);
+	hlist_add_head_rcu(&mk->mk_node,
+			   fscrypt_mk_hash_bucket(keyring, mk_spec));
+	spin_unlock(&keyring->lock);
 	return 0;
 
-out_free_mk:
-	free_master_key(mk);
+out_put:
+	fscrypt_put_master_key(mk);
 	return err;
 }
 
@@ -392,42 +452,34 @@ out_free_mk:
 static int add_existing_master_key(struct fscrypt_master_key *mk,
 				   struct fscrypt_master_key_secret *secret)
 {
-	struct key *mk_user;
-	bool rekey;
 	int err;
 
 	/*
 	 * If the current user is already in ->mk_users, then there's nothing to
-	 * do.  (Not applicable for v1 policy keys, which have NULL ->mk_users.)
+	 * do.  Otherwise, we need to add the user to ->mk_users.  (Neither is
+	 * applicable for v1 policy keys, which have NULL ->mk_users.)
 	 */
 	if (mk->mk_users) {
-		mk_user = find_master_key_user(mk);
+		struct key *mk_user = find_master_key_user(mk);
+
 		if (mk_user != ERR_PTR(-ENOKEY)) {
 			if (IS_ERR(mk_user))
 				return PTR_ERR(mk_user);
 			key_put(mk_user);
 			return 0;
 		}
-	}
-
-	/* If we'll be re-adding ->mk_secret, try to take the reference. */
-	rekey = !is_master_key_secret_present(&mk->mk_secret);
-	if (rekey && !refcount_inc_not_zero(&mk->mk_refcount))
-		return KEY_DEAD;
-
-	/* Add the current user to ->mk_users, if applicable. */
-	if (mk->mk_users) {
 		err = add_master_key_user(mk);
-		if (err) {
-			if (rekey && refcount_dec_and_test(&mk->mk_refcount))
-				return KEY_DEAD;
+		if (err)
 			return err;
-		}
 	}
 
 	/* Re-add the secret if needed. */
-	if (rekey)
+	if (!is_master_key_secret_present(&mk->mk_secret)) {
+		if (!refcount_inc_not_zero(&mk->mk_active_refs))
+			return KEY_DEAD;
 		move_master_key_secret(&mk->mk_secret, secret);
+	}
+
 	return 0;
 }
 
@@ -436,38 +488,36 @@ static int do_add_master_key(struct super_block *sb,
 			     const struct fscrypt_key_specifier *mk_spec)
 {
 	static DEFINE_MUTEX(fscrypt_add_key_mutex);
-	struct key *key;
+	struct fscrypt_master_key *mk;
 	int err;
 
 	mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
-retry:
-	key = fscrypt_find_master_key(sb, mk_spec);
-	if (IS_ERR(key)) {
-		err = PTR_ERR(key);
-		if (err != -ENOKEY)
-			goto out_unlock;
+
+	mk = fscrypt_find_master_key(sb, mk_spec);
+	if (!mk) {
 		/* Didn't find the key in ->s_master_keys.  Add it. */
 		err = allocate_filesystem_keyring(sb);
-		if (err)
-			goto out_unlock;
-		err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+		if (!err)
+			err = add_new_master_key(sb, secret, mk_spec);
 	} else {
 		/*
 		 * Found the key in ->s_master_keys.  Re-add the secret if
 		 * needed, and add the user to ->mk_users if needed.
 		 */
-		down_write(&key->sem);
-		err = add_existing_master_key(key->payload.data[0], secret);
-		up_write(&key->sem);
+		down_write(&mk->mk_sem);
+		err = add_existing_master_key(mk, secret);
+		up_write(&mk->mk_sem);
 		if (err == KEY_DEAD) {
-			/* Key being removed or needs to be removed */
-			key_invalidate(key);
-			key_put(key);
-			goto retry;
+			/*
+			 * We found a key struct, but it's already been fully
+			 * removed.  Ignore the old struct and add a new one.
+			 * fscrypt_add_key_mutex means we don't need to worry
+			 * about concurrent adds.
+			 */
+			err = add_new_master_key(sb, secret, mk_spec);
 		}
-		key_put(key);
+		fscrypt_put_master_key(mk);
 	}
-out_unlock:
 	mutex_unlock(&fscrypt_add_key_mutex);
 	return err;
 }
@@ -771,19 +821,19 @@ int fscrypt_verify_key_added(struct super_block *sb,
 			     const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
 {
 	struct fscrypt_key_specifier mk_spec;
-	struct key *key, *mk_user;
 	struct fscrypt_master_key *mk;
+	struct key *mk_user;
 	int err;
 
 	mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
 	memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
 
-	key = fscrypt_find_master_key(sb, &mk_spec);
-	if (IS_ERR(key)) {
-		err = PTR_ERR(key);
+	mk = fscrypt_find_master_key(sb, &mk_spec);
+	if (!mk) {
+		err = -ENOKEY;
 		goto out;
 	}
-	mk = key->payload.data[0];
+	down_read(&mk->mk_sem);
 	mk_user = find_master_key_user(mk);
 	if (IS_ERR(mk_user)) {
 		err = PTR_ERR(mk_user);
@@ -791,7 +841,8 @@ int fscrypt_verify_key_added(struct super_block *sb,
 		key_put(mk_user);
 		err = 0;
 	}
-	key_put(key);
+	up_read(&mk->mk_sem);
+	fscrypt_put_master_key(mk);
 out:
 	if (err == -ENOKEY && capable(CAP_FOWNER))
 		err = 0;
@@ -953,11 +1004,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 	struct super_block *sb = file_inode(filp)->i_sb;
 	struct fscrypt_remove_key_arg __user *uarg = _uarg;
 	struct fscrypt_remove_key_arg arg;
-	struct key *key;
 	struct fscrypt_master_key *mk;
 	u32 status_flags = 0;
 	int err;
-	bool dead;
+	bool inodes_remain;
 
 	if (copy_from_user(&arg, uarg, sizeof(arg)))
 		return -EFAULT;
@@ -977,12 +1027,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 		return -EACCES;
 
 	/* Find the key being removed. */
-	key = fscrypt_find_master_key(sb, &arg.key_spec);
-	if (IS_ERR(key))
-		return PTR_ERR(key);
-	mk = key->payload.data[0];
-
-	down_write(&key->sem);
+	mk = fscrypt_find_master_key(sb, &arg.key_spec);
+	if (!mk)
+		return -ENOKEY;
+	down_write(&mk->mk_sem);
 
 	/* If relevant, remove current user's (or all users) claim to the key */
 	if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -991,7 +1039,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 		else
 			err = remove_master_key_user(mk);
 		if (err) {
-			up_write(&key->sem);
+			up_write(&mk->mk_sem);
 			goto out_put_key;
 		}
 		if (mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -1003,26 +1051,22 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 			status_flags |=
 				FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS;
 			err = 0;
-			up_write(&key->sem);
+			up_write(&mk->mk_sem);
 			goto out_put_key;
 		}
 	}
 
 	/* No user claims remaining.  Go ahead and wipe the secret. */
-	dead = false;
+	err = -ENOKEY;
 	if (is_master_key_secret_present(&mk->mk_secret)) {
 		wipe_master_key_secret(&mk->mk_secret);
-		dead = refcount_dec_and_test(&mk->mk_refcount);
-	}
-	up_write(&key->sem);
-	if (dead) {
-		/*
-		 * No inodes reference the key, and we wiped the secret, so the
-		 * key object is free to be removed from the keyring.
-		 */
-		key_invalidate(key);
+		fscrypt_put_master_key_activeref(mk);
 		err = 0;
-	} else {
+	}
+	inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
+	up_write(&mk->mk_sem);
+
+	if (inodes_remain) {
 		/* Some inodes still reference this key; try to evict them. */
 		err = try_to_lock_encrypted_files(sb, mk);
 		if (err == -EBUSY) {
@@ -1038,7 +1082,7 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users)
 	 * has been fully removed including all files locked.
 	 */
 out_put_key:
-	key_put(key);
+	fscrypt_put_master_key(mk);
 	if (err == 0)
 		err = put_user(status_flags, &uarg->removal_status_flags);
 	return err;
@@ -1085,7 +1129,6 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
 {
 	struct super_block *sb = file_inode(filp)->i_sb;
 	struct fscrypt_get_key_status_arg arg;
-	struct key *key;
 	struct fscrypt_master_key *mk;
 	int err;
 
@@ -1102,19 +1145,18 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
 	arg.user_count = 0;
 	memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved));
 
-	key = fscrypt_find_master_key(sb, &arg.key_spec);
-	if (IS_ERR(key)) {
-		if (key != ERR_PTR(-ENOKEY))
-			return PTR_ERR(key);
+	mk = fscrypt_find_master_key(sb, &arg.key_spec);
+	if (!mk) {
 		arg.status = FSCRYPT_KEY_STATUS_ABSENT;
 		err = 0;
 		goto out;
 	}
-	mk = key->payload.data[0];
-	down_read(&key->sem);
+	down_read(&mk->mk_sem);
 
 	if (!is_master_key_secret_present(&mk->mk_secret)) {
-		arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
+		arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
+			FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
+			FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
 		err = 0;
 		goto out_release_key;
 	}
@@ -1136,8 +1178,8 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg)
 	}
 	err = 0;
 out_release_key:
-	up_read(&key->sem);
-	key_put(key);
+	up_read(&mk->mk_sem);
+	fscrypt_put_master_key(mk);
 out:
 	if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
 		err = -EFAULT;
@@ -1149,13 +1191,9 @@ int __init fscrypt_init_keyring(void)
 {
 	int err;
 
-	err = register_key_type(&key_type_fscrypt);
-	if (err)
-		return err;
-
 	err = register_key_type(&key_type_fscrypt_user);
 	if (err)
-		goto err_unregister_fscrypt;
+		return err;
 
 	err = register_key_type(&key_type_fscrypt_provisioning);
 	if (err)
@@ -1165,7 +1203,5 @@ int __init fscrypt_init_keyring(void)
 
 err_unregister_fscrypt_user:
 	unregister_key_type(&key_type_fscrypt_user);
-err_unregister_fscrypt:
-	unregister_key_type(&key_type_fscrypt);
 	return err;
 }
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index fbc71abdabe3..e037a7b8e9e4 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,7 +9,6 @@
  */
 
 #include <crypto/skcipher.h>
-#include <linux/key.h>
 #include <linux/random.h>
 
 #include "fscrypt_private.h"
@@ -159,6 +158,7 @@ void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key)
 {
 	crypto_free_skcipher(prep_key->tfm);
 	fscrypt_destroy_inline_crypt_key(prep_key);
+	memzero_explicit(prep_key, sizeof(*prep_key));
 }
 
 /* Given a per-file encryption key, set up the file's crypto transform object */
@@ -412,20 +412,18 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk,
 /*
  * Find the master key, then set up the inode's actual encryption key.
  *
- * If the master key is found in the filesystem-level keyring, then the
- * corresponding 'struct key' is returned in *master_key_ret with its semaphore
- * read-locked.  This is needed to ensure that only one task links the
- * fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race to create
- * an fscrypt_info for the same inode), and to synchronize the master key being
- * removed with a new inode starting to use it.
+ * If the master key is found in the filesystem-level keyring, then it is
+ * returned in *mk_ret with its semaphore read-locked.  This is needed to ensure
+ * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
+ * multiple tasks may race to create an fscrypt_info for the same inode), and to
+ * synchronize the master key being removed with a new inode starting to use it.
  */
 static int setup_file_encryption_key(struct fscrypt_info *ci,
 				     bool need_dirhash_key,
-				     struct key **master_key_ret)
+				     struct fscrypt_master_key **mk_ret)
 {
-	struct key *key;
-	struct fscrypt_master_key *mk = NULL;
 	struct fscrypt_key_specifier mk_spec;
+	struct fscrypt_master_key *mk;
 	int err;
 
 	err = fscrypt_select_encryption_impl(ci);
@@ -436,11 +434,10 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
 	if (err)
 		return err;
 
-	key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
-	if (IS_ERR(key)) {
-		if (key != ERR_PTR(-ENOKEY) ||
-		    ci->ci_policy.version != FSCRYPT_POLICY_V1)
-			return PTR_ERR(key);
+	mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+	if (!mk) {
+		if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
+			return -ENOKEY;
 
 		/*
 		 * As a legacy fallback for v1 policies, search for the key in
@@ -450,9 +447,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
 		 */
 		return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
 	}
-
-	mk = key->payload.data[0];
-	down_read(&key->sem);
+	down_read(&mk->mk_sem);
 
 	/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
 	if (!is_master_key_secret_present(&mk->mk_secret)) {
@@ -480,18 +475,18 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
 	if (err)
 		goto out_release_key;
 
-	*master_key_ret = key;
+	*mk_ret = mk;
 	return 0;
 
 out_release_key:
-	up_read(&key->sem);
-	key_put(key);
+	up_read(&mk->mk_sem);
+	fscrypt_put_master_key(mk);
 	return err;
 }
 
 static void put_crypt_info(struct fscrypt_info *ci)
 {
-	struct key *key;
+	struct fscrypt_master_key *mk;
 
 	if (!ci)
 		return;
@@ -501,24 +496,18 @@ static void put_crypt_info(struct fscrypt_info *ci)
 	else if (ci->ci_owns_key)
 		fscrypt_destroy_prepared_key(&ci->ci_enc_key);
 
-	key = ci->ci_master_key;
-	if (key) {
-		struct fscrypt_master_key *mk = key->payload.data[0];
-
+	mk = ci->ci_master_key;
+	if (mk) {
 		/*
 		 * Remove this inode from the list of inodes that were unlocked
-		 * with the master key.
-		 *
-		 * In addition, if we're removing the last inode from a key that
-		 * already had its secret removed, invalidate the key so that it
-		 * gets removed from ->s_master_keys.
+		 * with the master key.  In addition, if we're removing the last
+		 * inode from a master key struct that already had its secret
+		 * removed, then complete the full removal of the struct.
 		 */
 		spin_lock(&mk->mk_decrypted_inodes_lock);
 		list_del(&ci->ci_master_key_link);
 		spin_unlock(&mk->mk_decrypted_inodes_lock);
-		if (refcount_dec_and_test(&mk->mk_refcount))
-			key_invalidate(key);
-		key_put(key);
+		fscrypt_put_master_key_activeref(mk);
 	}
 	memzero_explicit(ci, sizeof(*ci));
 	kmem_cache_free(fscrypt_info_cachep, ci);
@@ -532,7 +521,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
 {
 	struct fscrypt_info *crypt_info;
 	struct fscrypt_mode *mode;
-	struct key *master_key = NULL;
+	struct fscrypt_master_key *mk = NULL;
 	int res;
 
 	res = fscrypt_initialize(inode->i_sb->s_cop->flags);
@@ -555,8 +544,7 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
 	crypt_info->ci_mode = mode;
 
-	res = setup_file_encryption_key(crypt_info, need_dirhash_key,
-					&master_key);
+	res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
 	if (res)
 		goto out;
 
@@ -571,12 +559,9 @@ fscrypt_setup_encryption_info(struct inode *inode,
 		 * We won the race and set ->i_crypt_info to our crypt_info.
 		 * Now link it into the master key's inode list.
 		 */
-		if (master_key) {
-			struct fscrypt_master_key *mk =
-				master_key->payload.data[0];
-
-			refcount_inc(&mk->mk_refcount);
-			crypt_info->ci_master_key = key_get(master_key);
+		if (mk) {
+			crypt_info->ci_master_key = mk;
+			refcount_inc(&mk->mk_active_refs);
 			spin_lock(&mk->mk_decrypted_inodes_lock);
 			list_add(&crypt_info->ci_master_key_link,
 				 &mk->mk_decrypted_inodes);
@@ -586,9 +571,9 @@ fscrypt_setup_encryption_info(struct inode *inode,
 	}
 	res = 0;
 out:
-	if (master_key) {
-		up_read(&master_key->sem);
-		key_put(master_key);
+	if (mk) {
+		up_read(&mk->mk_sem);
+		fscrypt_put_master_key(mk);
 	}
 	put_crypt_info(crypt_info);
 	return res;
@@ -753,7 +738,6 @@ EXPORT_SYMBOL(fscrypt_free_inode);
 int fscrypt_drop_inode(struct inode *inode)
 {
 	const struct fscrypt_info *ci = fscrypt_get_info(inode);
-	const struct fscrypt_master_key *mk;
 
 	/*
 	 * If ci is NULL, then the inode doesn't have an encryption key set up
@@ -763,7 +747,6 @@ int fscrypt_drop_inode(struct inode *inode)
 	 */
 	if (!ci || !ci->ci_master_key)
 		return 0;
-	mk = ci->ci_master_key->payload.data[0];
 
 	/*
 	 * With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
@@ -782,6 +765,6 @@ int fscrypt_drop_inode(struct inode *inode)
 	 * then the thread removing the key will either evict the inode itself
 	 * or will correctly detect that it wasn't evicted due to the race.
 	 */
-	return !is_master_key_secret_present(&mk->mk_secret);
+	return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
 }
 EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 55d57181cd9e..46757c3052ef 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -744,12 +744,8 @@ int fscrypt_set_context(struct inode *inode, void *fs_data)
 	 * delayed key setup that requires the inode number.
 	 */
 	if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
-	    (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
-		const struct fscrypt_master_key *mk =
-			ci->ci_master_key->payload.data[0];
-
-		fscrypt_hash_inode_number(ci, mk);
-	}
+	    (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+		fscrypt_hash_inode_number(ci, ci->ci_master_key);
 
 	return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
 }
diff --git a/fs/super.c b/fs/super.c
index 734ed584a946..6a82660e1adb 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -291,7 +291,6 @@ static void __put_super(struct super_block *s)
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(!list_empty(&s->s_mounts));
 		security_sb_free(s);
-		fscrypt_sb_free(s);
 		put_user_ns(s->s_user_ns);
 		kfree(s->s_subtype);
 		call_rcu(&s->rcu, destroy_super_rcu);
@@ -480,6 +479,7 @@ void generic_shutdown_super(struct super_block *sb)
 		evict_inodes(sb);
 		/* only nonzero refcount inodes can have marks */
 		fsnotify_sb_delete(sb);
+		fscrypt_sb_delete(sb);
 		security_sb_delete(sb);
 
 		if (sb->s_dio_done_wq) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eced4cc286e..0830486f47ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1472,7 +1472,7 @@ struct super_block {
 	const struct xattr_handler **s_xattr;
 #ifdef CONFIG_FS_ENCRYPTION
 	const struct fscrypt_operations	*s_cop;
-	struct key		*s_master_keys; /* master crypto keys in use */
+	struct fscrypt_keyring	*s_master_keys; /* master crypto keys in use */
 #endif
 #ifdef CONFIG_FS_VERITY
 	const struct fsverity_operations *s_vop;
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 488fd8c8f8af..db5bb5650bf2 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -310,7 +310,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
 }
 
 /* keyring.c */
-void fscrypt_sb_free(struct super_block *sb);
+void fscrypt_sb_delete(struct super_block *sb);
 int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
 int fscrypt_add_test_dummy_key(struct super_block *sb,
 			       const struct fscrypt_dummy_policy *dummy_policy);
@@ -524,7 +524,7 @@ fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
 }
 
 /* keyring.c */
-static inline void fscrypt_sb_free(struct super_block *sb)
+static inline void fscrypt_sb_delete(struct super_block *sb)
 {
 }
 
-- 
cgit v1.2.3


From 22e9947a4b2ba255888541bd0111cf00b9b16586 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Thu, 1 Sep 2022 12:32:07 -0700
Subject: fscrypt: stop holding extra request_queue references

Now that the fscrypt_master_key lifetime has been reworked to not be
subject to the quirks of the keyrings subsystem, blk_crypto_evict_key()
no longer gets called after the filesystem has already been unmounted.
Therefore, there is no longer any need to hold extra references to the
filesystem's request_queue(s).  (And these references didn't always do
their intended job anyway, as pinning a request_queue doesn't
necessarily pin the corresponding blk_crypto_profile.)

Stop taking these extra references.  Instead, just pass the super_block
to fscrypt_destroy_inline_crypt_key(), and use it to get the list of
block devices the key needs to be evicted from.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220901193208.138056-3-ebiggers@kernel.org
---
 fs/crypto/fscrypt_private.h | 11 ++++--
 fs/crypto/inline_crypt.c    | 92 +++++++++++++++++++++------------------------
 fs/crypto/keyring.c         |  9 +++--
 fs/crypto/keysetup.c        |  8 ++--
 fs/crypto/keysetup_v1.c     |  4 +-
 5 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 577cae7facb0..d5f68a0c5d15 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -184,7 +184,7 @@ struct fscrypt_symlink_data {
 struct fscrypt_prepared_key {
 	struct crypto_skcipher *tfm;
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-	struct fscrypt_blk_crypto_key *blk_key;
+	struct blk_crypto_key *blk_key;
 #endif
 };
 
@@ -344,7 +344,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 				     const u8 *raw_key,
 				     const struct fscrypt_info *ci);
 
-void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key);
+void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+				      struct fscrypt_prepared_key *prep_key);
 
 /*
  * Check whether the crypto transform or blk-crypto key has been allocated in
@@ -390,7 +391,8 @@ fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 }
 
 static inline void
-fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+				 struct fscrypt_prepared_key *prep_key)
 {
 }
 
@@ -600,7 +602,8 @@ extern struct fscrypt_mode fscrypt_modes[];
 int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
 			const u8 *raw_key, const struct fscrypt_info *ci);
 
-void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key);
+void fscrypt_destroy_prepared_key(struct super_block *sb,
+				  struct fscrypt_prepared_key *prep_key);
 
 int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
 
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 90f3e68f166e..7d1e2ec72253 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -21,12 +21,6 @@
 
 #include "fscrypt_private.h"
 
-struct fscrypt_blk_crypto_key {
-	struct blk_crypto_key base;
-	int num_devs;
-	struct request_queue *devs[];
-};
-
 static int fscrypt_get_num_devices(struct super_block *sb)
 {
 	if (sb->s_cop->get_num_devices)
@@ -162,49 +156,42 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 	const struct inode *inode = ci->ci_inode;
 	struct super_block *sb = inode->i_sb;
 	enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
-	int num_devs = fscrypt_get_num_devices(sb);
-	int queue_refs = 0;
-	struct fscrypt_blk_crypto_key *blk_key;
+	struct blk_crypto_key *blk_key;
+	struct request_queue **devs;
+	unsigned int num_devs;
+	unsigned int i;
 	int err;
-	int i;
 
-	blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
+	blk_key = kmalloc(sizeof(*blk_key), GFP_KERNEL);
 	if (!blk_key)
 		return -ENOMEM;
 
-	blk_key->num_devs = num_devs;
-	fscrypt_get_devices(sb, num_devs, blk_key->devs);
-
-	err = blk_crypto_init_key(&blk_key->base, raw_key, crypto_mode,
+	err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
 				  fscrypt_get_dun_bytes(ci), sb->s_blocksize);
 	if (err) {
 		fscrypt_err(inode, "error %d initializing blk-crypto key", err);
 		goto fail;
 	}
 
-	/*
-	 * We have to start using blk-crypto on all the filesystem's devices.
-	 * We also have to save all the request_queue's for later so that the
-	 * key can be evicted from them.  This is needed because some keys
-	 * aren't destroyed until after the filesystem was already unmounted
-	 * (namely, the per-mode keys in struct fscrypt_master_key).
-	 */
+	/* Start using blk-crypto on all the filesystem's block devices. */
+	num_devs = fscrypt_get_num_devices(sb);
+	devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
+	if (!devs) {
+		err = -ENOMEM;
+		goto fail;
+	}
+	fscrypt_get_devices(sb, num_devs, devs);
 	for (i = 0; i < num_devs; i++) {
-		if (!blk_get_queue(blk_key->devs[i])) {
-			fscrypt_err(inode, "couldn't get request_queue");
-			err = -EAGAIN;
-			goto fail;
-		}
-		queue_refs++;
-
-		err = blk_crypto_start_using_key(&blk_key->base,
-						 blk_key->devs[i]);
-		if (err) {
-			fscrypt_err(inode,
-				    "error %d starting to use blk-crypto", err);
-			goto fail;
-		}
+		err = blk_crypto_start_using_key(blk_key, devs[i]);
+		if (err)
+			break;
 	}
+	kfree(devs);
+	if (err) {
+		fscrypt_err(inode, "error %d starting to use blk-crypto", err);
+		goto fail;
+	}
+
 	/*
 	 * Pairs with the smp_load_acquire() in fscrypt_is_key_prepared().
 	 * I.e., here we publish ->blk_key with a RELEASE barrier so that
@@ -215,24 +202,31 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 	return 0;
 
 fail:
-	for (i = 0; i < queue_refs; i++)
-		blk_put_queue(blk_key->devs[i]);
 	kfree_sensitive(blk_key);
 	return err;
 }
 
-void fscrypt_destroy_inline_crypt_key(struct fscrypt_prepared_key *prep_key)
+void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
+				      struct fscrypt_prepared_key *prep_key)
 {
-	struct fscrypt_blk_crypto_key *blk_key = prep_key->blk_key;
-	int i;
+	struct blk_crypto_key *blk_key = prep_key->blk_key;
+	struct request_queue **devs;
+	unsigned int num_devs;
+	unsigned int i;
 
-	if (blk_key) {
-		for (i = 0; i < blk_key->num_devs; i++) {
-			blk_crypto_evict_key(blk_key->devs[i], &blk_key->base);
-			blk_put_queue(blk_key->devs[i]);
-		}
-		kfree_sensitive(blk_key);
+	if (!blk_key)
+		return;
+
+	/* Evict the key from all the filesystem's block devices. */
+	num_devs = fscrypt_get_num_devices(sb);
+	devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
+	if (devs) {
+		fscrypt_get_devices(sb, num_devs, devs);
+		for (i = 0; i < num_devs; i++)
+			blk_crypto_evict_key(devs[i], blk_key);
+		kfree(devs);
 	}
+	kfree_sensitive(blk_key);
 }
 
 bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
@@ -282,7 +276,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode,
 	ci = inode->i_crypt_info;
 
 	fscrypt_generate_dun(ci, first_lblk, dun);
-	bio_crypt_set_ctx(bio, &ci->ci_enc_key.blk_key->base, dun, gfp_mask);
+	bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask);
 }
 EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx);
 
@@ -369,7 +363,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
 	 * uses the same pointer.  I.e., there's currently no need to support
 	 * merging requests where the keys are the same but the pointers differ.
 	 */
-	if (bc->bc_key != &inode->i_crypt_info->ci_enc_key.blk_key->base)
+	if (bc->bc_key != inode->i_crypt_info->ci_enc_key.blk_key)
 		return false;
 
 	fscrypt_generate_dun(inode->i_crypt_info, next_lblk, next_dun);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 9b98d6a576e6..1cca09aa43f8 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -105,9 +105,12 @@ void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
 	WARN_ON(!list_empty(&mk->mk_decrypted_inodes));
 
 	for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
-		fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
-		fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
-		fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
+		fscrypt_destroy_prepared_key(
+				sb, &mk->mk_direct_keys[i]);
+		fscrypt_destroy_prepared_key(
+				sb, &mk->mk_iv_ino_lblk_64_keys[i]);
+		fscrypt_destroy_prepared_key(
+				sb, &mk->mk_iv_ino_lblk_32_keys[i]);
 	}
 	memzero_explicit(&mk->mk_ino_hash_key,
 			 sizeof(mk->mk_ino_hash_key));
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index e037a7b8e9e4..f7407071a952 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -154,10 +154,11 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
 }
 
 /* Destroy a crypto transform object and/or blk-crypto key. */
-void fscrypt_destroy_prepared_key(struct fscrypt_prepared_key *prep_key)
+void fscrypt_destroy_prepared_key(struct super_block *sb,
+				  struct fscrypt_prepared_key *prep_key)
 {
 	crypto_free_skcipher(prep_key->tfm);
-	fscrypt_destroy_inline_crypt_key(prep_key);
+	fscrypt_destroy_inline_crypt_key(sb, prep_key);
 	memzero_explicit(prep_key, sizeof(*prep_key));
 }
 
@@ -494,7 +495,8 @@ static void put_crypt_info(struct fscrypt_info *ci)
 	if (ci->ci_direct_key)
 		fscrypt_put_direct_key(ci->ci_direct_key);
 	else if (ci->ci_owns_key)
-		fscrypt_destroy_prepared_key(&ci->ci_enc_key);
+		fscrypt_destroy_prepared_key(ci->ci_inode->i_sb,
+					     &ci->ci_enc_key);
 
 	mk = ci->ci_master_key;
 	if (mk) {
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index 2762c5350432..75dabd9b27f9 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -143,6 +143,7 @@ invalid:
 
 /* Master key referenced by DIRECT_KEY policy */
 struct fscrypt_direct_key {
+	struct super_block		*dk_sb;
 	struct hlist_node		dk_node;
 	refcount_t			dk_refcount;
 	const struct fscrypt_mode	*dk_mode;
@@ -154,7 +155,7 @@ struct fscrypt_direct_key {
 static void free_direct_key(struct fscrypt_direct_key *dk)
 {
 	if (dk) {
-		fscrypt_destroy_prepared_key(&dk->dk_key);
+		fscrypt_destroy_prepared_key(dk->dk_sb, &dk->dk_key);
 		kfree_sensitive(dk);
 	}
 }
@@ -231,6 +232,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
 	dk = kzalloc(sizeof(*dk), GFP_KERNEL);
 	if (!dk)
 		return ERR_PTR(-ENOMEM);
+	dk->dk_sb = ci->ci_inode->i_sb;
 	refcount_set(&dk->dk_refcount, 1);
 	dk->dk_mode = ci->ci_mode;
 	err = fscrypt_prepare_key(&dk->dk_key, raw_key, ci);
-- 
cgit v1.2.3


From 0e91fc1e0f5c70ce575451103ec66c2ec21f1a6e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Sep 2022 12:32:08 -0700
Subject: fscrypt: work on block_devices instead of request_queues

request_queues are a block layer implementation detail that should not
leak into file systems.  Change the fscrypt inline crypto code to
retrieve block devices instead of request_queues from the file system.
As part of that, clean up the interaction with multi-device file systems
by returning both the number of devices and the actual device array in a
single method call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
[ebiggers: bug fixes and minor tweaks]
Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20220901193208.138056-4-ebiggers@kernel.org
---
 fs/crypto/inline_crypt.c | 81 ++++++++++++++++++++++++------------------------
 fs/f2fs/super.c          | 24 +++++++-------
 include/linux/fscrypt.h  | 21 ++++++-------
 3 files changed, 62 insertions(+), 64 deletions(-)

diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 7d1e2ec72253..c40bd55bc781 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -21,20 +21,22 @@
 
 #include "fscrypt_private.h"
 
-static int fscrypt_get_num_devices(struct super_block *sb)
+static struct block_device **fscrypt_get_devices(struct super_block *sb,
+						 unsigned int *num_devs)
 {
-	if (sb->s_cop->get_num_devices)
-		return sb->s_cop->get_num_devices(sb);
-	return 1;
-}
+	struct block_device **devs;
 
-static void fscrypt_get_devices(struct super_block *sb, int num_devs,
-				struct request_queue **devs)
-{
-	if (num_devs == 1)
-		devs[0] = bdev_get_queue(sb->s_bdev);
-	else
-		sb->s_cop->get_devices(sb, devs);
+	if (sb->s_cop->get_devices) {
+		devs = sb->s_cop->get_devices(sb, num_devs);
+		if (devs)
+			return devs;
+	}
+	devs = kmalloc(sizeof(*devs), GFP_KERNEL);
+	if (!devs)
+		return ERR_PTR(-ENOMEM);
+	devs[0] = sb->s_bdev;
+	*num_devs = 1;
+	return devs;
 }
 
 static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
@@ -68,15 +70,17 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci)
  * helpful for debugging problems where the "wrong" implementation is used.
  */
 static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
-					struct request_queue **devs,
-					int num_devs,
+					struct block_device **devs,
+					unsigned int num_devs,
 					const struct blk_crypto_config *cfg)
 {
-	int i;
+	unsigned int i;
 
 	for (i = 0; i < num_devs; i++) {
+		struct request_queue *q = bdev_get_queue(devs[i]);
+
 		if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
-		    __blk_crypto_cfg_supported(devs[i]->crypto_profile, cfg)) {
+		    __blk_crypto_cfg_supported(q->crypto_profile, cfg)) {
 			if (!xchg(&mode->logged_blk_crypto_native, 1))
 				pr_info("fscrypt: %s using blk-crypto (native)\n",
 					mode->friendly_name);
@@ -93,9 +97,9 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
 	const struct inode *inode = ci->ci_inode;
 	struct super_block *sb = inode->i_sb;
 	struct blk_crypto_config crypto_cfg;
-	int num_devs;
-	struct request_queue **devs;
-	int i;
+	struct block_device **devs;
+	unsigned int num_devs;
+	unsigned int i;
 
 	/* The file must need contents encryption, not filenames encryption */
 	if (!S_ISREG(inode->i_mode))
@@ -123,20 +127,20 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
 		return 0;
 
 	/*
-	 * On all the filesystem's devices, blk-crypto must support the crypto
-	 * configuration that the file would use.
+	 * On all the filesystem's block devices, blk-crypto must support the
+	 * crypto configuration that the file would use.
 	 */
 	crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
 	crypto_cfg.data_unit_size = sb->s_blocksize;
 	crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
-	num_devs = fscrypt_get_num_devices(sb);
-	devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
-	if (!devs)
-		return -ENOMEM;
-	fscrypt_get_devices(sb, num_devs, devs);
+
+	devs = fscrypt_get_devices(sb, &num_devs);
+	if (IS_ERR(devs))
+		return PTR_ERR(devs);
 
 	for (i = 0; i < num_devs; i++) {
-		if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
+		if (!blk_crypto_config_supported(bdev_get_queue(devs[i]),
+						 &crypto_cfg))
 			goto out_free_devs;
 	}
 
@@ -157,7 +161,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 	struct super_block *sb = inode->i_sb;
 	enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
 	struct blk_crypto_key *blk_key;
-	struct request_queue **devs;
+	struct block_device **devs;
 	unsigned int num_devs;
 	unsigned int i;
 	int err;
@@ -174,15 +178,14 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 	}
 
 	/* Start using blk-crypto on all the filesystem's block devices. */
-	num_devs = fscrypt_get_num_devices(sb);
-	devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
-	if (!devs) {
-		err = -ENOMEM;
+	devs = fscrypt_get_devices(sb, &num_devs);
+	if (IS_ERR(devs)) {
+		err = PTR_ERR(devs);
 		goto fail;
 	}
-	fscrypt_get_devices(sb, num_devs, devs);
 	for (i = 0; i < num_devs; i++) {
-		err = blk_crypto_start_using_key(blk_key, devs[i]);
+		err = blk_crypto_start_using_key(blk_key,
+						 bdev_get_queue(devs[i]));
 		if (err)
 			break;
 	}
@@ -210,7 +213,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
 				      struct fscrypt_prepared_key *prep_key)
 {
 	struct blk_crypto_key *blk_key = prep_key->blk_key;
-	struct request_queue **devs;
+	struct block_device **devs;
 	unsigned int num_devs;
 	unsigned int i;
 
@@ -218,12 +221,10 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
 		return;
 
 	/* Evict the key from all the filesystem's block devices. */
-	num_devs = fscrypt_get_num_devices(sb);
-	devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
-	if (devs) {
-		fscrypt_get_devices(sb, num_devs, devs);
+	devs = fscrypt_get_devices(sb, &num_devs);
+	if (!IS_ERR(devs)) {
 		for (i = 0; i < num_devs; i++)
-			blk_crypto_evict_key(devs[i], blk_key);
+			blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key);
 		kfree(devs);
 	}
 	kfree_sensitive(blk_key);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 2451623c05a7..26817b5aeac7 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -3039,23 +3039,24 @@ static void f2fs_get_ino_and_lblk_bits(struct super_block *sb,
 	*lblk_bits_ret = 8 * sizeof(block_t);
 }
 
-static int f2fs_get_num_devices(struct super_block *sb)
+static struct block_device **f2fs_get_devices(struct super_block *sb,
+					      unsigned int *num_devs)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	struct block_device **devs;
+	int i;
 
-	if (f2fs_is_multi_device(sbi))
-		return sbi->s_ndevs;
-	return 1;
-}
+	if (!f2fs_is_multi_device(sbi))
+		return NULL;
 
-static void f2fs_get_devices(struct super_block *sb,
-			     struct request_queue **devs)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(sb);
-	int i;
+	devs = kmalloc_array(sbi->s_ndevs, sizeof(*devs), GFP_KERNEL);
+	if (!devs)
+		return ERR_PTR(-ENOMEM);
 
 	for (i = 0; i < sbi->s_ndevs; i++)
-		devs[i] = bdev_get_queue(FDEV(i).bdev);
+		devs[i] = FDEV(i).bdev;
+	*num_devs = sbi->s_ndevs;
+	return devs;
 }
 
 static const struct fscrypt_operations f2fs_cryptops = {
@@ -3066,7 +3067,6 @@ static const struct fscrypt_operations f2fs_cryptops = {
 	.empty_dir		= f2fs_empty_dir,
 	.has_stable_inodes	= f2fs_has_stable_inodes,
 	.get_ino_and_lblk_bits	= f2fs_get_ino_and_lblk_bits,
-	.get_num_devices	= f2fs_get_num_devices,
 	.get_devices		= f2fs_get_devices,
 };
 #endif
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index db5bb5650bf2..1f12ebb4a69d 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -161,24 +161,21 @@ struct fscrypt_operations {
 				      int *ino_bits_ret, int *lblk_bits_ret);
 
 	/*
-	 * Return the number of block devices to which the filesystem may write
-	 * encrypted file contents.
+	 * Return an array of pointers to the block devices to which the
+	 * filesystem may write encrypted file contents, NULL if the filesystem
+	 * only has a single such block device, or an ERR_PTR() on error.
+	 *
+	 * On successful non-NULL return, *num_devs is set to the number of
+	 * devices in the returned array.  The caller must free the returned
+	 * array using kfree().
 	 *
 	 * If the filesystem can use multiple block devices (other than block
 	 * devices that aren't used for encrypted file contents, such as
 	 * external journal devices), and wants to support inline encryption,
 	 * then it must implement this function.  Otherwise it's not needed.
 	 */
-	int (*get_num_devices)(struct super_block *sb);
-
-	/*
-	 * If ->get_num_devices() returns a value greater than 1, then this
-	 * function is called to get the array of request_queues that the
-	 * filesystem is using -- one per block device.  (There may be duplicate
-	 * entries in this array, as block devices can share a request_queue.)
-	 */
-	void (*get_devices)(struct super_block *sb,
-			    struct request_queue **devs);
+	struct block_device **(*get_devices)(struct super_block *sb,
+					     unsigned int *num_devs);
 };
 
 static inline struct fscrypt_info *fscrypt_get_info(const struct inode *inode)
-- 
cgit v1.2.3


From 6999aae17a7b66c56e6cc8e05b3cd51718c3bfe3 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:00 +0200
Subject: xfrm: add extack support to verify_newsa_info

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 90 ++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 69 insertions(+), 21 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 772a051feedb..4167c189d35b 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -149,7 +149,8 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
 }
 
 static int verify_newsa_info(struct xfrm_usersa_info *p,
-			     struct nlattr **attrs)
+			     struct nlattr **attrs,
+			     struct netlink_ext_ack *extack)
 {
 	int err;
 
@@ -163,10 +164,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		break;
 #else
 		err = -EAFNOSUPPORT;
+		NL_SET_ERR_MSG(extack, "IPv6 support disabled");
 		goto out;
 #endif
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid address family");
 		goto out;
 	}
 
@@ -175,65 +178,98 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		break;
 
 	case AF_INET:
-		if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32)
+		if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) {
+			NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)");
 			goto out;
+		}
 
 		break;
 
 	case AF_INET6:
 #if IS_ENABLED(CONFIG_IPV6)
-		if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128)
+		if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) {
+			NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)");
 			goto out;
+		}
 
 		break;
 #else
+		NL_SET_ERR_MSG(extack, "IPv6 support disabled");
 		err = -EAFNOSUPPORT;
 		goto out;
 #endif
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid address family in selector");
 		goto out;
 	}
 
 	err = -EINVAL;
 	switch (p->id.proto) {
 	case IPPROTO_AH:
-		if ((!attrs[XFRMA_ALG_AUTH]	&&
-		     !attrs[XFRMA_ALG_AUTH_TRUNC]) ||
-		    attrs[XFRMA_ALG_AEAD]	||
+		if (!attrs[XFRMA_ALG_AUTH]	&&
+		    !attrs[XFRMA_ALG_AUTH_TRUNC]) {
+			NL_SET_ERR_MSG(extack, "Missing required attribute for AH: AUTH_TRUNC or AUTH");
+			goto out;
+		}
+
+		if (attrs[XFRMA_ALG_AEAD]	||
 		    attrs[XFRMA_ALG_CRYPT]	||
 		    attrs[XFRMA_ALG_COMP]	||
-		    attrs[XFRMA_TFCPAD])
+		    attrs[XFRMA_TFCPAD]) {
+			NL_SET_ERR_MSG(extack, "Invalid attributes for AH: AEAD, CRYPT, COMP, TFCPAD");
 			goto out;
+		}
 		break;
 
 	case IPPROTO_ESP:
-		if (attrs[XFRMA_ALG_COMP])
+		if (attrs[XFRMA_ALG_COMP]) {
+			NL_SET_ERR_MSG(extack, "Invalid attribute for ESP: COMP");
 			goto out;
+		}
+
 		if (!attrs[XFRMA_ALG_AUTH] &&
 		    !attrs[XFRMA_ALG_AUTH_TRUNC] &&
 		    !attrs[XFRMA_ALG_CRYPT] &&
-		    !attrs[XFRMA_ALG_AEAD])
+		    !attrs[XFRMA_ALG_AEAD]) {
+			NL_SET_ERR_MSG(extack, "Missing required attribute for ESP: at least one of AUTH, AUTH_TRUNC, CRYPT, AEAD");
 			goto out;
+		}
+
 		if ((attrs[XFRMA_ALG_AUTH] ||
 		     attrs[XFRMA_ALG_AUTH_TRUNC] ||
 		     attrs[XFRMA_ALG_CRYPT]) &&
-		    attrs[XFRMA_ALG_AEAD])
+		    attrs[XFRMA_ALG_AEAD]) {
+			NL_SET_ERR_MSG(extack, "Invalid attribute combination for ESP: AEAD can't be used with AUTH, AUTH_TRUNC, CRYPT");
 			goto out;
+		}
+
 		if (attrs[XFRMA_TFCPAD] &&
-		    p->mode != XFRM_MODE_TUNNEL)
+		    p->mode != XFRM_MODE_TUNNEL) {
+			NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode");
 			goto out;
+		}
 		break;
 
 	case IPPROTO_COMP:
-		if (!attrs[XFRMA_ALG_COMP]	||
-		    attrs[XFRMA_ALG_AEAD]	||
+		if (!attrs[XFRMA_ALG_COMP]) {
+			NL_SET_ERR_MSG(extack, "Missing required attribute for COMP: COMP");
+			goto out;
+		}
+
+		if (attrs[XFRMA_ALG_AEAD]	||
 		    attrs[XFRMA_ALG_AUTH]	||
 		    attrs[XFRMA_ALG_AUTH_TRUNC]	||
 		    attrs[XFRMA_ALG_CRYPT]	||
-		    attrs[XFRMA_TFCPAD]		||
-		    (ntohl(p->id.spi) >= 0x10000))
+		    attrs[XFRMA_TFCPAD]) {
+			NL_SET_ERR_MSG(extack, "Invalid attributes for COMP: AEAD, AUTH, AUTH_TRUNC, CRYPT, TFCPAD");
+			goto out;
+		}
+
+		if (ntohl(p->id.spi) >= 0x10000) {
+			NL_SET_ERR_MSG(extack, "SPI is too large for COMP (must be < 0x10000)");
 			goto out;
+		}
 		break;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -246,13 +282,20 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		    attrs[XFRMA_ALG_CRYPT]	||
 		    attrs[XFRMA_ENCAP]		||
 		    attrs[XFRMA_SEC_CTX]	||
-		    attrs[XFRMA_TFCPAD]		||
-		    !attrs[XFRMA_COADDR])
+		    attrs[XFRMA_TFCPAD]) {
+			NL_SET_ERR_MSG(extack, "Invalid attributes for DSTOPTS/ROUTING");
+			goto out;
+		}
+
+		if (!attrs[XFRMA_COADDR]) {
+			NL_SET_ERR_MSG(extack, "Missing required COADDR attribute for DSTOPTS/ROUTING");
 			goto out;
+		}
 		break;
 #endif
 
 	default:
+		NL_SET_ERR_MSG(extack, "Unsupported protocol");
 		goto out;
 	}
 
@@ -266,7 +309,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP)))
 		goto out;
-	if ((err = verify_sec_ctx_len(attrs, NULL)))
+	if ((err = verify_sec_ctx_len(attrs, extack)))
 		goto out;
 	if ((err = verify_replay(p, attrs)))
 		goto out;
@@ -280,14 +323,19 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		break;
 
 	default:
+		NL_SET_ERR_MSG(extack, "Unsupported mode");
 		goto out;
 	}
 
 	err = 0;
 
-	if (attrs[XFRMA_MTIMER_THRESH])
-		if (!attrs[XFRMA_ENCAP])
+	if (attrs[XFRMA_MTIMER_THRESH]) {
+		if (!attrs[XFRMA_ENCAP]) {
+			NL_SET_ERR_MSG(extack, "MTIMER_THRESH attribute can only be set on ENCAP states");
 			err = -EINVAL;
+			goto out;
+		}
+	}
 
 out:
 	return err;
@@ -688,7 +736,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	int err;
 	struct km_event c;
 
-	err = verify_newsa_info(p, attrs);
+	err = verify_newsa_info(p, attrs, extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 785b87b220859170d8ca0e95c6396c7cfadce627 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:01 +0200
Subject: xfrm: add extack to verify_replay

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 4167c189d35b..048c1e150b4e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -121,29 +121,43 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_a
 }
 
 static inline int verify_replay(struct xfrm_usersa_info *p,
-				struct nlattr **attrs)
+				struct nlattr **attrs,
+				struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL];
 	struct xfrm_replay_state_esn *rs;
 
-	if (!rt)
-		return (p->flags & XFRM_STATE_ESN) ? -EINVAL : 0;
+	if (!rt) {
+		if (p->flags & XFRM_STATE_ESN) {
+			NL_SET_ERR_MSG(extack, "Missing required attribute for ESN");
+			return -EINVAL;
+		}
+		return 0;
+	}
 
 	rs = nla_data(rt);
 
-	if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8)
+	if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) {
+		NL_SET_ERR_MSG(extack, "ESN bitmap length must be <= 128");
 		return -EINVAL;
+	}
 
 	if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) &&
-	    nla_len(rt) != sizeof(*rs))
+	    nla_len(rt) != sizeof(*rs)) {
+		NL_SET_ERR_MSG(extack, "ESN attribute is too short to fit the full bitmap length");
 		return -EINVAL;
+	}
 
 	/* As only ESP and AH support ESN feature. */
-	if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH))
+	if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH)) {
+		NL_SET_ERR_MSG(extack, "ESN only supported for ESP and AH");
 		return -EINVAL;
+	}
 
-	if (p->replay_window != 0)
+	if (p->replay_window != 0) {
+		NL_SET_ERR_MSG(extack, "ESN not compatible with legacy replay_window");
 		return -EINVAL;
+	}
 
 	return 0;
 }
@@ -311,7 +325,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_sec_ctx_len(attrs, extack)))
 		goto out;
-	if ((err = verify_replay(p, attrs)))
+	if ((err = verify_replay(p, attrs, extack)))
 		goto out;
 
 	err = -EINVAL;
-- 
cgit v1.2.3


From 1fc8fde553917bca7c9b65fafb045a2a5c97e683 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:02 +0200
Subject: xfrm: add extack to verify_one_alg, verify_auth_trunc, verify_aead

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 048c1e150b4e..3c150e1f8a2a 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -35,7 +35,8 @@
 #endif
 #include <asm/unaligned.h>
 
-static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
+static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type,
+			  struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[type];
 	struct xfrm_algo *algp;
@@ -44,8 +45,10 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
 		return 0;
 
 	algp = nla_data(rt);
-	if (nla_len(rt) < (int)xfrm_alg_len(algp))
+	if (nla_len(rt) < (int)xfrm_alg_len(algp)) {
+		NL_SET_ERR_MSG(extack, "Invalid AUTH/CRYPT/COMP attribute length");
 		return -EINVAL;
+	}
 
 	switch (type) {
 	case XFRMA_ALG_AUTH:
@@ -54,6 +57,7 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
 		break;
 
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid algorithm attribute type");
 		return -EINVAL;
 	}
 
@@ -61,7 +65,8 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
 	return 0;
 }
 
-static int verify_auth_trunc(struct nlattr **attrs)
+static int verify_auth_trunc(struct nlattr **attrs,
+			     struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC];
 	struct xfrm_algo_auth *algp;
@@ -70,14 +75,16 @@ static int verify_auth_trunc(struct nlattr **attrs)
 		return 0;
 
 	algp = nla_data(rt);
-	if (nla_len(rt) < (int)xfrm_alg_auth_len(algp))
+	if (nla_len(rt) < (int)xfrm_alg_auth_len(algp)) {
+		NL_SET_ERR_MSG(extack, "Invalid AUTH_TRUNC attribute length");
 		return -EINVAL;
+	}
 
 	algp->alg_name[sizeof(algp->alg_name) - 1] = '\0';
 	return 0;
 }
 
-static int verify_aead(struct nlattr **attrs)
+static int verify_aead(struct nlattr **attrs, struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_ALG_AEAD];
 	struct xfrm_algo_aead *algp;
@@ -86,8 +93,10 @@ static int verify_aead(struct nlattr **attrs)
 		return 0;
 
 	algp = nla_data(rt);
-	if (nla_len(rt) < (int)aead_len(algp))
+	if (nla_len(rt) < (int)aead_len(algp)) {
+		NL_SET_ERR_MSG(extack, "Invalid AEAD attribute length");
 		return -EINVAL;
+	}
 
 	algp->alg_name[sizeof(algp->alg_name) - 1] = '\0';
 	return 0;
@@ -313,15 +322,15 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	}
 
-	if ((err = verify_aead(attrs)))
+	if ((err = verify_aead(attrs, extack)))
 		goto out;
-	if ((err = verify_auth_trunc(attrs)))
+	if ((err = verify_auth_trunc(attrs, extack)))
 		goto out;
-	if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH)))
+	if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH, extack)))
 		goto out;
-	if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT)))
+	if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT, extack)))
 		goto out;
-	if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP)))
+	if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP, extack)))
 		goto out;
 	if ((err = verify_sec_ctx_len(attrs, extack)))
 		goto out;
-- 
cgit v1.2.3


From adb5c33e4d4c83fb848a402e2191fbf3e2bf50d1 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:03 +0200
Subject: xfrm: add extack support to xfrm_dev_state_add

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  5 +++--
 net/xfrm/xfrm_device.c | 20 +++++++++++++++-----
 net/xfrm/xfrm_user.c   |  8 +++++---
 3 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 28b988577ed2..9c1cccf85f12 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1886,7 +1886,8 @@ void xfrm_dev_resume(struct sk_buff *skb);
 void xfrm_dev_backlog(struct softnet_data *sd);
 struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again);
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
-		       struct xfrm_user_offload *xuo);
+		       struct xfrm_user_offload *xuo,
+		       struct netlink_ext_ack *extack);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
 
 static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
@@ -1949,7 +1950,7 @@ static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_fea
 	return skb;
 }
 
-static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo)
+static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack)
 {
 	return 0;
 }
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 637ca8838436..5f5aafd418af 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -207,7 +207,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
 EXPORT_SYMBOL_GPL(validate_xmit_xfrm);
 
 int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
-		       struct xfrm_user_offload *xuo)
+		       struct xfrm_user_offload *xuo,
+		       struct netlink_ext_ack *extack)
 {
 	int err;
 	struct dst_entry *dst;
@@ -216,15 +217,21 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 	xfrm_address_t *saddr;
 	xfrm_address_t *daddr;
 
-	if (!x->type_offload)
+	if (!x->type_offload) {
+		NL_SET_ERR_MSG(extack, "Type doesn't support offload");
 		return -EINVAL;
+	}
 
 	/* We don't yet support UDP encapsulation and TFC padding. */
-	if (x->encap || x->tfcpad)
+	if (x->encap || x->tfcpad) {
+		NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded");
 		return -EINVAL;
+	}
 
-	if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND))
+	if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) {
+		NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
 		return -EINVAL;
+	}
 
 	dev = dev_get_by_index(net, xuo->ifindex);
 	if (!dev) {
@@ -256,6 +263,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 
 	if (x->props.flags & XFRM_STATE_ESN &&
 	    !dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
+		NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN");
 		xso->dev = NULL;
 		dev_put(dev);
 		return -EINVAL;
@@ -277,8 +285,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		xso->real_dev = NULL;
 		netdev_put(dev, &xso->dev_tracker);
 
-		if (err != -EOPNOTSUPP)
+		if (err != -EOPNOTSUPP) {
+			NL_SET_ERR_MSG(extack, "Device failed to offload this state");
 			return err;
+		}
 	}
 
 	return 0;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 3c150e1f8a2a..c56b9442dffe 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -652,7 +652,8 @@ static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
 static struct xfrm_state *xfrm_state_construct(struct net *net,
 					       struct xfrm_usersa_info *p,
 					       struct nlattr **attrs,
-					       int *errp)
+					       int *errp,
+					       struct netlink_ext_ack *extack)
 {
 	struct xfrm_state *x = xfrm_state_alloc(net);
 	int err = -ENOMEM;
@@ -735,7 +736,8 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	/* configure the hardware if offload is requested */
 	if (attrs[XFRMA_OFFLOAD_DEV]) {
 		err = xfrm_dev_state_add(net, x,
-					 nla_data(attrs[XFRMA_OFFLOAD_DEV]));
+					 nla_data(attrs[XFRMA_OFFLOAD_DEV]),
+					 extack);
 		if (err)
 			goto error;
 	}
@@ -763,7 +765,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		return err;
 
-	x = xfrm_state_construct(net, p, attrs, &err);
+	x = xfrm_state_construct(net, p, attrs, &err, extack);
 	if (!x)
 		return err;
 
-- 
cgit v1.2.3


From 2b9168266d15dbb4b083b52e775c0399db427d4c Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:04 +0200
Subject: xfrm: add extack to attach_*

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_user.c | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index c56b9442dffe..2cf5956b562e 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -366,7 +366,7 @@ out:
 
 static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
 			   struct xfrm_algo_desc *(*get_byname)(const char *, int),
-			   struct nlattr *rta)
+			   struct nlattr *rta, struct netlink_ext_ack *extack)
 {
 	struct xfrm_algo *p, *ualg;
 	struct xfrm_algo_desc *algo;
@@ -377,8 +377,10 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
 	ualg = nla_data(rta);
 
 	algo = get_byname(ualg->alg_name, 1);
-	if (!algo)
+	if (!algo) {
+		NL_SET_ERR_MSG(extack, "Requested COMP algorithm not found");
 		return -ENOSYS;
+	}
 	*props = algo->desc.sadb_alg_id;
 
 	p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL);
@@ -390,7 +392,8 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
 	return 0;
 }
 
-static int attach_crypt(struct xfrm_state *x, struct nlattr *rta)
+static int attach_crypt(struct xfrm_state *x, struct nlattr *rta,
+			struct netlink_ext_ack *extack)
 {
 	struct xfrm_algo *p, *ualg;
 	struct xfrm_algo_desc *algo;
@@ -401,8 +404,10 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta)
 	ualg = nla_data(rta);
 
 	algo = xfrm_ealg_get_byname(ualg->alg_name, 1);
-	if (!algo)
+	if (!algo) {
+		NL_SET_ERR_MSG(extack, "Requested CRYPT algorithm not found");
 		return -ENOSYS;
+	}
 	x->props.ealgo = algo->desc.sadb_alg_id;
 
 	p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL);
@@ -416,7 +421,7 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta)
 }
 
 static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
-		       struct nlattr *rta)
+		       struct nlattr *rta, struct netlink_ext_ack *extack)
 {
 	struct xfrm_algo *ualg;
 	struct xfrm_algo_auth *p;
@@ -428,8 +433,10 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
 	ualg = nla_data(rta);
 
 	algo = xfrm_aalg_get_byname(ualg->alg_name, 1);
-	if (!algo)
+	if (!algo) {
+		NL_SET_ERR_MSG(extack, "Requested AUTH algorithm not found");
 		return -ENOSYS;
+	}
 	*props = algo->desc.sadb_alg_id;
 
 	p = kmalloc(sizeof(*p) + (ualg->alg_key_len + 7) / 8, GFP_KERNEL);
@@ -446,7 +453,7 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
 }
 
 static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
-			     struct nlattr *rta)
+			     struct nlattr *rta, struct netlink_ext_ack *extack)
 {
 	struct xfrm_algo_auth *p, *ualg;
 	struct xfrm_algo_desc *algo;
@@ -457,10 +464,14 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
 	ualg = nla_data(rta);
 
 	algo = xfrm_aalg_get_byname(ualg->alg_name, 1);
-	if (!algo)
+	if (!algo) {
+		NL_SET_ERR_MSG(extack, "Requested AUTH_TRUNC algorithm not found");
 		return -ENOSYS;
-	if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits)
+	}
+	if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) {
+		NL_SET_ERR_MSG(extack, "Invalid length requested for truncated ICV");
 		return -EINVAL;
+	}
 	*props = algo->desc.sadb_alg_id;
 
 	p = kmemdup(ualg, xfrm_alg_auth_len(ualg), GFP_KERNEL);
@@ -475,7 +486,8 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
 	return 0;
 }
 
-static int attach_aead(struct xfrm_state *x, struct nlattr *rta)
+static int attach_aead(struct xfrm_state *x, struct nlattr *rta,
+		       struct netlink_ext_ack *extack)
 {
 	struct xfrm_algo_aead *p, *ualg;
 	struct xfrm_algo_desc *algo;
@@ -486,8 +498,10 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta)
 	ualg = nla_data(rta);
 
 	algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1);
-	if (!algo)
+	if (!algo) {
+		NL_SET_ERR_MSG(extack, "Requested AEAD algorithm not found");
 		return -ENOSYS;
+	}
 	x->props.ealgo = algo->desc.sadb_alg_id;
 
 	p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL);
@@ -680,21 +694,21 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	if (attrs[XFRMA_SA_EXTRA_FLAGS])
 		x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);
 
-	if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD])))
+	if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD], extack)))
 		goto error;
 	if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo,
-				     attrs[XFRMA_ALG_AUTH_TRUNC])))
+				     attrs[XFRMA_ALG_AUTH_TRUNC], extack)))
 		goto error;
 	if (!x->props.aalgo) {
 		if ((err = attach_auth(&x->aalg, &x->props.aalgo,
-				       attrs[XFRMA_ALG_AUTH])))
+				       attrs[XFRMA_ALG_AUTH], extack)))
 			goto error;
 	}
-	if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT])))
+	if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT], extack)))
 		goto error;
 	if ((err = attach_one_algo(&x->calg, &x->props.calgo,
 				   xfrm_calg_get_byname,
-				   attrs[XFRMA_ALG_COMP])))
+				   attrs[XFRMA_ALG_COMP], extack)))
 		goto error;
 
 	if (attrs[XFRMA_TFCPAD])
-- 
cgit v1.2.3


From 741f9a1064985512567eca1552643738ecfb5cc5 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:05 +0200
Subject: xfrm: add extack to __xfrm_init_state

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    |  3 ++-
 net/xfrm/xfrm_state.c | 26 +++++++++++++++++++-------
 net/xfrm/xfrm_user.c  |  2 +-
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9c1cccf85f12..f427a74d571b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1582,7 +1582,8 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
 u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
 int xfrm_init_replay(struct xfrm_state *x);
 u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
-int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload);
+int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
+		      struct netlink_ext_ack *extack);
 int xfrm_init_state(struct xfrm_state *x);
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
 int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 52e60e607f8a..7470d2474796 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2610,7 +2610,8 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
 }
 EXPORT_SYMBOL_GPL(xfrm_state_mtu);
 
-int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
+int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
+		      struct netlink_ext_ack *extack)
 {
 	const struct xfrm_mode *inner_mode;
 	const struct xfrm_mode *outer_mode;
@@ -2625,12 +2626,16 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 
 	if (x->sel.family != AF_UNSPEC) {
 		inner_mode = xfrm_get_mode(x->props.mode, x->sel.family);
-		if (inner_mode == NULL)
+		if (inner_mode == NULL) {
+			NL_SET_ERR_MSG(extack, "Requested mode not found");
 			goto error;
+		}
 
 		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) &&
-		    family != x->sel.family)
+		    family != x->sel.family) {
+			NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate a change of family");
 			goto error;
+		}
 
 		x->inner_mode = *inner_mode;
 	} else {
@@ -2638,11 +2643,15 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 		int iafamily = AF_INET;
 
 		inner_mode = xfrm_get_mode(x->props.mode, x->props.family);
-		if (inner_mode == NULL)
+		if (inner_mode == NULL) {
+			NL_SET_ERR_MSG(extack, "Requested mode not found");
 			goto error;
+		}
 
-		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL))
+		if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) {
+			NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate an AF_UNSPEC selector");
 			goto error;
+		}
 
 		x->inner_mode = *inner_mode;
 
@@ -2657,8 +2666,10 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 	}
 
 	x->type = xfrm_get_type(x->id.proto, family);
-	if (x->type == NULL)
+	if (x->type == NULL) {
+		NL_SET_ERR_MSG(extack, "Requested type not found");
 		goto error;
+	}
 
 	x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload);
 
@@ -2668,6 +2679,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload)
 
 	outer_mode = xfrm_get_mode(x->props.mode, family);
 	if (!outer_mode) {
+		NL_SET_ERR_MSG(extack, "Requested mode not found");
 		err = -EPROTONOSUPPORT;
 		goto error;
 	}
@@ -2689,7 +2701,7 @@ int xfrm_init_state(struct xfrm_state *x)
 {
 	int err;
 
-	err = __xfrm_init_state(x, true, false);
+	err = __xfrm_init_state(x, true, false, NULL);
 	if (!err)
 		x->km.state = XFRM_STATE_VALID;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 2cf5956b562e..14e9b84f9dad 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -721,7 +721,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	if (attrs[XFRMA_IF_ID])
 		x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
 
-	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
+	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack);
 	if (err)
 		goto error;
 
-- 
cgit v1.2.3


From 1cf9a3ae3e2de359471a7036f48ac59e48b15256 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 14 Sep 2022 19:04:06 +0200
Subject: xfrm: add extack support to xfrm_init_replay

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  2 +-
 net/xfrm/xfrm_replay.c | 10 +++++++---
 net/xfrm/xfrm_state.c  |  2 +-
 net/xfrm/xfrm_user.c   |  2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index f427a74d571b..c504d07bcb7c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1580,7 +1580,7 @@ int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_vali
 void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
 void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
 u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
-int xfrm_init_replay(struct xfrm_state *x);
+int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack);
 u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
 int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
 		      struct netlink_ext_ack *extack);
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 9277d81b344c..9f4d42eb090f 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -766,18 +766,22 @@ int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb)
 }
 #endif
 
-int xfrm_init_replay(struct xfrm_state *x)
+int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
 
 	if (replay_esn) {
 		if (replay_esn->replay_window >
-		    replay_esn->bmp_len * sizeof(__u32) * 8)
+		    replay_esn->bmp_len * sizeof(__u32) * 8) {
+			NL_SET_ERR_MSG(extack, "ESN replay window is too large for the chosen bitmap size");
 			return -EINVAL;
+		}
 
 		if (x->props.flags & XFRM_STATE_ESN) {
-			if (replay_esn->replay_window == 0)
+			if (replay_esn->replay_window == 0) {
+				NL_SET_ERR_MSG(extack, "ESN replay window must be > 0");
 				return -EINVAL;
+			}
 			x->repl_mode = XFRM_REPLAY_MODE_ESN;
 		} else {
 			x->repl_mode = XFRM_REPLAY_MODE_BMP;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 7470d2474796..0b59ff7985e6 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2686,7 +2686,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
 
 	x->outer_mode = *outer_mode;
 	if (init_replay) {
-		err = xfrm_init_replay(x);
+		err = xfrm_init_replay(x, extack);
 		if (err)
 			goto error;
 	}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 14e9b84f9dad..e73f9efc54c1 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -741,7 +741,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	/* sysctl_xfrm_aevent_etime is in 100ms units */
 	x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M;
 
-	if ((err = xfrm_init_replay(x)))
+	if ((err = xfrm_init_replay(x, extack)))
 		goto error;
 
 	/* override default values from above */
-- 
cgit v1.2.3


From 32844a8eecaa4a3e65841c53e43e04a9087d1ef6 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Mon, 12 Sep 2022 23:13:53 +0100
Subject: ARM: 9243/1: riscpc: Unbreak the build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes the following build error:

In file included from ./include/linux/io.h:13,
                 from ./arch/arm/mach-rpc/include/mach/uncompress.h:9,
                 from arch/arm/boot/compressed/misc.c:31:
./arch/arm/include/asm/io.h:85:22: error: conflicting types for ‘__raw_writeb’
   85 | #define __raw_writeb __raw_writeb
      |                      ^~~~~~~~~~~~
./arch/arm/include/asm/io.h:86:20: note: in expansion of macro ‘__raw_writeb’
   86 | static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
      |                    ^~~~~~~~~~~~
In file included from arch/arm/boot/compressed/misc.c:26:
arch/arm/boot/compressed/misc-ep93xx.h:13:20: note: previous definition of ‘__raw_writeb’ was here
   13 | static inline void __raw_writeb(unsigned char value, unsigned int ptr)
      |                    ^~~~~~~~~~~~

To: Russell King <linux@armlinux.org.uk>

Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-arm-kernel@lists.infradead.org
Fixes: 0361c7e504b1 ("ARM: ep93xx: multiplatform support")
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 arch/arm/boot/compressed/misc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c
index cb2e069dc73f..abfed1aa2baa 100644
--- a/arch/arm/boot/compressed/misc.c
+++ b/arch/arm/boot/compressed/misc.c
@@ -23,7 +23,9 @@ unsigned int __machine_arch_type;
 #include <linux/types.h>
 #include <linux/linkage.h>
 #include "misc.h"
+#ifdef CONFIG_ARCH_EP93XX
 #include "misc-ep93xx.h"
+#endif
 
 static void putstr(const char *ptr);
 
-- 
cgit v1.2.3


From 2ccd19b3ffac07cc7e75a2bd1ed779728bb67197 Mon Sep 17 00:00:00 2001
From: Wang Kefeng <wangkefeng.wang@huawei.com>
Date: Tue, 13 Sep 2022 05:25:51 +0100
Subject: ARM: 9244/1: dump: Fix wrong pg_level in walk_pmd()

After ARM supports p4d page tables, the pg_level for note_page()
in walk_pmd() should be 4, not 3, fix it.

Fixes: 84e6ffb2c49c ("arm: add support for folded p4d page tables")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 arch/arm/mm/dump.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c
index fb688003d156..712da6a81b23 100644
--- a/arch/arm/mm/dump.c
+++ b/arch/arm/mm/dump.c
@@ -346,7 +346,7 @@ static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
 		addr = start + i * PMD_SIZE;
 		domain = get_domain_name(pmd);
 		if (pmd_none(*pmd) || pmd_large(*pmd) || !pmd_present(*pmd))
-			note_page(st, addr, 3, pmd_val(*pmd), domain);
+			note_page(st, addr, 4, pmd_val(*pmd), domain);
 		else
 			walk_pte(st, pmd, addr, domain);
 
-- 
cgit v1.2.3


From 14ca1a4690750bb54e1049e49f3140ef48958a6e Mon Sep 17 00:00:00 2001
From: Wang Kefeng <wangkefeng.wang@huawei.com>
Date: Fri, 16 Sep 2022 12:10:49 +0100
Subject: ARM: 9247/1: mm: set readonly for MT_MEMORY_RO with ARM_LPAE

MT_MEMORY_RO is introduced by commit 598f0a99fa8a ("ARM: 9210/1:
Mark the FDT_FIXED sections as shareable"), which is a readonly
memory type for FDT area, but there are some different between
ARM_LPAE and non-ARM_LPAE, we need to setup PMD_SECT_AP2 and
L_PMD_SECT_RDONLY for MT_MEMORY_RO when ARM_LAPE enabled.

non-ARM_LPAE	0xff800000-0xffa00000           2M PGD KERNEL      ro NX SHD
ARM_LPAE	0xff800000-0xffc00000           4M PMD RW NX SHD
ARM_LPAE+fix	0xff800000-0xffc00000           4M PMD ro NX SHD

Fixes: 598f0a99fa8a ("ARM: 9210/1: Mark the FDT_FIXED sections as shareable")
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 arch/arm/mm/mmu.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index a49f0b9c0f75..463fc2a8448f 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -300,7 +300,11 @@ static struct mem_type mem_types[] __ro_after_init = {
 		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 			     L_PTE_XN | L_PTE_RDONLY,
 		.prot_l1   = PMD_TYPE_TABLE,
+#ifdef CONFIG_ARM_LPAE
+		.prot_sect = PMD_TYPE_SECT | L_PMD_SECT_RDONLY | PMD_SECT_AP2,
+#else
 		.prot_sect = PMD_TYPE_SECT,
+#endif
 		.domain    = DOMAIN_KERNEL,
 	},
 	[MT_ROM] = {
-- 
cgit v1.2.3


From 31f87f705b3c1635345d8e8a493697099b43e508 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 09:54:32 -0500
Subject: thunderbolt: Explicitly reset plug events delay back to USB4 spec
 value

If any software has interacted with the USB4 registers before the Linux
USB4 CM runs, it may have modified the plug events delay. It has been
observed that if this value too large, it's possible that hotplugged
devices will negotiate a fallback mode instead in Linux.

To prevent this, explicitly align the plug events delay with the USB4
spec value of 10ms.

Cc: stable@vger.kernel.org
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/thunderbolt/switch.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
index c63c1f4ff9dc..77d7f07ca075 100644
--- a/drivers/thunderbolt/switch.c
+++ b/drivers/thunderbolt/switch.c
@@ -2413,6 +2413,7 @@ int tb_switch_configure(struct tb_switch *sw)
 		 * additional capabilities.
 		 */
 		sw->config.cmuv = USB4_VERSION_1_0;
+		sw->config.plug_events_delay = 0xa;
 
 		/* Enumerate the switch */
 		ret = tb_sw_write(sw, (u32 *)&sw->config + 1, TB_CFG_SWITCH,
-- 
cgit v1.2.3


From 4d269ed485298e8a09485a664e7b35b370ab4ada Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:09 +0000
Subject: x86/resctrl: Kill off alloc_enabled

rdt_resources_all[] used to have extra entries for L2CODE/L2DATA.
These were hidden from resctrl by the alloc_enabled value.

Now that the L2/L2CODE/L2DATA resources have been merged together,
alloc_enabled doesn't mean anything, it always has the same value as
alloc_capable which indicates allocation is supported by this resource.

Remove alloc_enabled and its helpers.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-2-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c        | 4 ----
 arch/x86/kernel/cpu/resctrl/internal.h    | 4 ----
 arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c    | 6 +++---
 include/linux/resctrl.h                   | 2 --
 5 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index bb1c3f5f60c8..2f87177f1f69 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -147,7 +147,6 @@ static inline void cache_alloc_hsw_probe(void)
 	r->cache.shareable_bits = 0xc0000;
 	r->cache.min_cbm_bits = 2;
 	r->alloc_capable = true;
-	r->alloc_enabled = true;
 
 	rdt_alloc_capable = true;
 }
@@ -211,7 +210,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
 	thread_throttle_mode_init();
 
 	r->alloc_capable = true;
-	r->alloc_enabled = true;
 
 	return true;
 }
@@ -242,7 +240,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
 	r->data_width = 4;
 
 	r->alloc_capable = true;
-	r->alloc_enabled = true;
 
 	return true;
 }
@@ -261,7 +258,6 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
 	r->cache.shareable_bits = ebx & r->default_ctrl;
 	r->data_width = (r->cache.cbm_len + 3) / 4;
 	r->alloc_capable = true;
-	r->alloc_enabled = true;
 }
 
 static void rdt_get_cdp_config(int level)
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1d647188a43b..53f3d275a98f 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -459,10 +459,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
 	for_each_rdt_resource(r)					      \
 		if (r->mon_capable)
 
-#define for_each_alloc_enabled_rdt_resource(r)				      \
-	for_each_rdt_resource(r)					      \
-		if (r->alloc_enabled)
-
 #define for_each_mon_enabled_rdt_resource(r)				      \
 	for_each_rdt_resource(r)					      \
 		if (r->mon_enabled)
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 4d8398986f78..d961ae3ed96e 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -837,7 +837,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
 	 * First determine which cpus have pseudo-locked regions
 	 * associated with them.
 	 */
-	for_each_alloc_enabled_rdt_resource(r) {
+	for_each_alloc_capable_rdt_resource(r) {
 		list_for_each_entry(d_i, &r->domains, list) {
 			if (d_i->plr)
 				cpumask_or(cpu_with_psl, cpu_with_psl,
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f276aff521e8..526eb933333b 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1756,7 +1756,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 	if (ret)
 		goto out_destroy;
 
-	/* loop over enabled controls, these are all alloc_enabled */
+	/* loop over enabled controls, these are all alloc_capable */
 	list_for_each_entry(s, &resctrl_schema_all, list) {
 		r = s->res;
 		fflags =  r->fflags | RF_CTRL_INFO;
@@ -2106,7 +2106,7 @@ static int schemata_list_create(void)
 	struct rdt_resource *r;
 	int ret = 0;
 
-	for_each_alloc_enabled_rdt_resource(r) {
+	for_each_alloc_capable_rdt_resource(r) {
 		if (resctrl_arch_get_cdp_enabled(r->rid)) {
 			ret = schemata_list_add(r, CDP_CODE);
 			if (ret)
@@ -2452,7 +2452,7 @@ static void rdt_kill_sb(struct super_block *sb)
 	set_mba_sc(false);
 
 	/*Put everything back to default values. */
-	for_each_alloc_enabled_rdt_resource(r)
+	for_each_alloc_capable_rdt_resource(r)
 		reset_all_ctrls(r);
 	cdp_disable_all();
 	rmdir_all_sub();
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 21deb5212bbd..386ab3a41500 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -130,7 +130,6 @@ struct resctrl_schema;
 /**
  * struct rdt_resource - attributes of a resctrl resource
  * @rid:		The index of the resource
- * @alloc_enabled:	Is allocation enabled on this machine
  * @mon_enabled:	Is monitoring enabled for this feature
  * @alloc_capable:	Is allocation available on this machine
  * @mon_capable:	Is monitor feature available on this machine
@@ -150,7 +149,6 @@ struct resctrl_schema;
  */
 struct rdt_resource {
 	int			rid;
-	bool			alloc_enabled;
 	bool			mon_enabled;
 	bool			alloc_capable;
 	bool			mon_capable;
-- 
cgit v1.2.3


From bab6ee736873becc0216ba5fd159394e272d01b2 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:10 +0000
Subject: x86/resctrl: Merge mon_capable and mon_enabled

mon_enabled and mon_capable are always set as a pair by
rdt_get_mon_l3_config().

There is no point having two values.

Merge them together.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-3-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/internal.h | 4 ----
 arch/x86/kernel/cpu/resctrl/monitor.c  | 1 -
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 8 ++++----
 include/linux/resctrl.h                | 2 --
 4 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 53f3d275a98f..8828b5c1b6d2 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -459,10 +459,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
 	for_each_rdt_resource(r)					      \
 		if (r->mon_capable)
 
-#define for_each_mon_enabled_rdt_resource(r)				      \
-	for_each_rdt_resource(r)					      \
-		if (r->mon_enabled)
-
 /* CPUID.(EAX=10H, ECX=ResID=1).EAX */
 union cpuid_0x10_1_eax {
 	struct {
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index eaf25a234ff5..497cadf3285d 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -717,7 +717,6 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
 	l3_mon_evt_init(r);
 
 	r->mon_capable = true;
-	r->mon_enabled = true;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 526eb933333b..def7c6681f8b 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1765,7 +1765,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
 			goto out_destroy;
 	}
 
-	for_each_mon_enabled_rdt_resource(r) {
+	for_each_mon_capable_rdt_resource(r) {
 		fflags =  r->fflags | RF_MON_INFO;
 		sprintf(name, "%s_MON", r->name);
 		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
@@ -2504,7 +2504,7 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
 	struct rdtgroup *prgrp, *crgrp;
 	char name[32];
 
-	if (!r->mon_enabled)
+	if (!r->mon_capable)
 		return;
 
 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
@@ -2572,7 +2572,7 @@ void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
 	struct rdtgroup *prgrp, *crgrp;
 	struct list_head *head;
 
-	if (!r->mon_enabled)
+	if (!r->mon_capable)
 		return;
 
 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
@@ -2642,7 +2642,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
 	 * Create the subdirectories for each domain. Note that all events
 	 * in a domain like L3 are grouped into a resource whose domain is L3
 	 */
-	for_each_mon_enabled_rdt_resource(r) {
+	for_each_mon_capable_rdt_resource(r) {
 		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
 		if (ret)
 			goto out_destroy;
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 386ab3a41500..8180c539800d 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -130,7 +130,6 @@ struct resctrl_schema;
 /**
  * struct rdt_resource - attributes of a resctrl resource
  * @rid:		The index of the resource
- * @mon_enabled:	Is monitoring enabled for this feature
  * @alloc_capable:	Is allocation available on this machine
  * @mon_capable:	Is monitor feature available on this machine
  * @num_rmid:		Number of RMIDs available
@@ -149,7 +148,6 @@ struct resctrl_schema;
  */
 struct rdt_resource {
 	int			rid;
-	bool			mon_enabled;
 	bool			alloc_capable;
 	bool			mon_capable;
 	int			num_rmid;
-- 
cgit v1.2.3


From 3a7232cdf19e39e7f24c493117b373788b348af2 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:11 +0000
Subject: x86/resctrl: Add domain online callback for resctrl work

Because domains are exposed to user-space via resctrl, the filesystem
must update its state when CPU hotplug callbacks are triggered.

Some of this work is common to any architecture that would support
resctrl, but the work is tied up with the architecture code to
allocate the memory.

Move domain_setup_mon_state(), the monitor subdir creation call and the
mbm/limbo workers into a new resctrl_online_domain() call. These bits
are not specific to the architecture. Grouping them in one function
allows that code to be moved to /fs/ and re-used by another architecture.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-4-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c     | 53 ++++-----------------------
 arch/x86/kernel/cpu/resctrl/internal.h |  2 --
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 65 +++++++++++++++++++++++++++++++---
 include/linux/resctrl.h                |  1 +
 4 files changed, 67 insertions(+), 54 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 2f87177f1f69..25f30148478b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -443,42 +443,6 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 	return 0;
 }
 
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
-{
-	size_t tsize;
-
-	if (is_llc_occupancy_enabled()) {
-		d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
-		if (!d->rmid_busy_llc)
-			return -ENOMEM;
-		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
-	}
-	if (is_mbm_total_enabled()) {
-		tsize = sizeof(*d->mbm_total);
-		d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
-		if (!d->mbm_total) {
-			bitmap_free(d->rmid_busy_llc);
-			return -ENOMEM;
-		}
-	}
-	if (is_mbm_local_enabled()) {
-		tsize = sizeof(*d->mbm_local);
-		d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
-		if (!d->mbm_local) {
-			bitmap_free(d->rmid_busy_llc);
-			kfree(d->mbm_total);
-			return -ENOMEM;
-		}
-	}
-
-	if (is_mbm_enabled()) {
-		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
-		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
-	}
-
-	return 0;
-}
-
 /*
  * domain_add_cpu - Add a cpu to a resource's domain list.
  *
@@ -498,6 +462,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 	struct list_head *add_pos = NULL;
 	struct rdt_hw_domain *hw_dom;
 	struct rdt_domain *d;
+	int err;
 
 	d = rdt_find_domain(r, id, &add_pos);
 	if (IS_ERR(d)) {
@@ -527,21 +492,15 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		return;
 	}
 
-	if (r->mon_capable && domain_setup_mon_state(r, d)) {
+	list_add_tail(&d->list, add_pos);
+
+	err = resctrl_online_domain(r, d);
+	if (err) {
+		list_del(&d->list);
 		kfree(hw_dom->ctrl_val);
 		kfree(hw_dom->mbps_val);
 		kfree(hw_dom);
-		return;
 	}
-
-	list_add_tail(&d->list, add_pos);
-
-	/*
-	 * If resctrl is mounted, add
-	 * per domain monitor data directories.
-	 */
-	if (static_branch_unlikely(&rdt_mon_enable_key))
-		mkdir_mondata_subdir_allrdtgrp(r, d);
 }
 
 static void domain_remove_cpu(int cpu, struct rdt_resource *r)
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 8828b5c1b6d2..be48a682dbdb 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -524,8 +524,6 @@ void mon_event_count(void *info);
 int rdtgroup_mondata_show(struct seq_file *m, void *arg);
 void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
 				    unsigned int dom_id);
-void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
-				    struct rdt_domain *d);
 void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 		    struct rdt_domain *d, struct rdtgroup *rdtgrp,
 		    int evtid, int first);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index def7c6681f8b..030a70326ccc 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2565,16 +2565,13 @@ out_destroy:
  * Add all subdirectories of mon_data for "ctrl_mon" groups
  * and "monitor" groups with given domain id.
  */
-void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
-				    struct rdt_domain *d)
+static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+					   struct rdt_domain *d)
 {
 	struct kernfs_node *parent_kn;
 	struct rdtgroup *prgrp, *crgrp;
 	struct list_head *head;
 
-	if (!r->mon_capable)
-		return;
-
 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
 		parent_kn = prgrp->mon.mon_data_kn;
 		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
@@ -3236,6 +3233,64 @@ out:
 	return ret;
 }
 
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+	size_t tsize;
+
+	if (is_llc_occupancy_enabled()) {
+		d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
+		if (!d->rmid_busy_llc)
+			return -ENOMEM;
+	}
+	if (is_mbm_total_enabled()) {
+		tsize = sizeof(*d->mbm_total);
+		d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+		if (!d->mbm_total) {
+			bitmap_free(d->rmid_busy_llc);
+			return -ENOMEM;
+		}
+	}
+	if (is_mbm_local_enabled()) {
+		tsize = sizeof(*d->mbm_local);
+		d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+		if (!d->mbm_local) {
+			bitmap_free(d->rmid_busy_llc);
+			kfree(d->mbm_total);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
+{
+	int err;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	if (!r->mon_capable)
+		return 0;
+
+	err = domain_setup_mon_state(r, d);
+	if (err)
+		return err;
+
+	if (is_mbm_enabled()) {
+		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+	}
+
+	if (is_llc_occupancy_enabled())
+		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+
+	/* If resctrl is mounted, add per domain monitor data directories. */
+	if (static_branch_unlikely(&rdt_mon_enable_key))
+		mkdir_mondata_subdir_allrdtgrp(r, d);
+
+	return 0;
+}
+
 /*
  * rdtgroup_init - rdtgroup initialization
  *
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 8180c539800d..d512455b4c3a 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -192,5 +192,6 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
 u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 			    u32 closid, enum resctrl_conf_type type);
+int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
 
 #endif /* _RESCTRL_H */
-- 
cgit v1.2.3


From 7add3af4178d9e25afc8d990a7d1000ccb22b6a0 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:12 +0000
Subject: x86/resctrl: Group struct rdt_hw_domain cleanup

domain_add_cpu() and domain_remove_cpu() need to kfree() the child
arrays that were allocated by domain_setup_ctrlval().

As this memory is moved around, and new arrays are created, adjusting
the error handling cleanup code becomes noisier.

To simplify this, move all the kfree() calls into a domain_free() helper.
This depends on struct rdt_hw_domain being kzalloc()d, allowing it to
unconditionally kfree() all the child arrays.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-5-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 25f30148478b..e37889f7a1a5 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -414,6 +414,13 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
 	}
 }
 
+static void domain_free(struct rdt_hw_domain *hw_dom)
+{
+	kfree(hw_dom->ctrl_val);
+	kfree(hw_dom->mbps_val);
+	kfree(hw_dom);
+}
+
 static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
@@ -488,7 +495,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 	rdt_domain_reconfigure_cdp(r);
 
 	if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
-		kfree(hw_dom);
+		domain_free(hw_dom);
 		return;
 	}
 
@@ -497,9 +504,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 	err = resctrl_online_domain(r, d);
 	if (err) {
 		list_del(&d->list);
-		kfree(hw_dom->ctrl_val);
-		kfree(hw_dom->mbps_val);
-		kfree(hw_dom);
+		domain_free(hw_dom);
 	}
 }
 
@@ -547,12 +552,10 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 		if (d->plr)
 			d->plr->d = NULL;
 
-		kfree(hw_dom->ctrl_val);
-		kfree(hw_dom->mbps_val);
 		bitmap_free(d->rmid_busy_llc);
 		kfree(d->mbm_total);
 		kfree(d->mbm_local);
-		kfree(hw_dom);
+		domain_free(hw_dom);
 		return;
 	}
 
-- 
cgit v1.2.3


From 798fd4b9ac37fec571f55fb8592497b0dd5f7a73 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:13 +0000
Subject: x86/resctrl: Add domain offline callback for resctrl work

Because domains are exposed to user-space via resctrl, the filesystem
must update its state when CPU hotplug callbacks are triggered.

Some of this work is common to any architecture that would support
resctrl, but the work is tied up with the architecture code to
free the memory.

Move the monitor subdir removal and the cancelling of the mbm/limbo
works into a new resctrl_offline_domain() call. These bits are not
specific to the architecture. Grouping them in one function allows
that code to be moved to /fs/ and re-used by another architecture.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-6-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c     | 26 ++------------------
 arch/x86/kernel/cpu/resctrl/internal.h |  2 --
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 45 +++++++++++++++++++++++++++++++---
 include/linux/resctrl.h                |  1 +
 4 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index e37889f7a1a5..f69182973175 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -523,27 +523,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 
 	cpumask_clear_cpu(cpu, &d->cpu_mask);
 	if (cpumask_empty(&d->cpu_mask)) {
-		/*
-		 * If resctrl is mounted, remove all the
-		 * per domain monitor data directories.
-		 */
-		if (static_branch_unlikely(&rdt_mon_enable_key))
-			rmdir_mondata_subdir_allrdtgrp(r, d->id);
+		resctrl_offline_domain(r, d);
 		list_del(&d->list);
-		if (r->mon_capable && is_mbm_enabled())
-			cancel_delayed_work(&d->mbm_over);
-		if (is_llc_occupancy_enabled() &&  has_busy_rmid(r, d)) {
-			/*
-			 * When a package is going down, forcefully
-			 * decrement rmid->ebusy. There is no way to know
-			 * that the L3 was flushed and hence may lead to
-			 * incorrect counts in rare scenarios, but leaving
-			 * the RMID as busy creates RMID leaks if the
-			 * package never comes back.
-			 */
-			__check_limbo(d, true);
-			cancel_delayed_work(&d->cqm_limbo);
-		}
 
 		/*
 		 * rdt_domain "d" is going to be freed below, so clear
@@ -551,11 +532,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 		 */
 		if (d->plr)
 			d->plr->d = NULL;
-
-		bitmap_free(d->rmid_busy_llc);
-		kfree(d->mbm_total);
-		kfree(d->mbm_local);
 		domain_free(hw_dom);
+
 		return;
 	}
 
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index be48a682dbdb..e12b55f815bf 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -522,8 +522,6 @@ void free_rmid(u32 rmid);
 int rdt_get_mon_l3_config(struct rdt_resource *r);
 void mon_event_count(void *info);
 int rdtgroup_mondata_show(struct seq_file *m, void *arg);
-void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
-				    unsigned int dom_id);
 void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 		    struct rdt_domain *d, struct rdtgroup *rdtgrp,
 		    int evtid, int first);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 030a70326ccc..5830905a92d2 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2499,14 +2499,12 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
  * Remove all subdirectories of mon_data of ctrl_mon groups
  * and monitor groups with given domain id.
  */
-void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+					   unsigned int dom_id)
 {
 	struct rdtgroup *prgrp, *crgrp;
 	char name[32];
 
-	if (!r->mon_capable)
-		return;
-
 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
 		sprintf(name, "mon_%s_%02d", r->name, dom_id);
 		kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
@@ -3233,6 +3231,45 @@ out:
 	return ret;
 }
 
+static void domain_destroy_mon_state(struct rdt_domain *d)
+{
+	bitmap_free(d->rmid_busy_llc);
+	kfree(d->mbm_total);
+	kfree(d->mbm_local);
+}
+
+void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
+{
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	if (!r->mon_capable)
+		return;
+
+	/*
+	 * If resctrl is mounted, remove all the
+	 * per domain monitor data directories.
+	 */
+	if (static_branch_unlikely(&rdt_mon_enable_key))
+		rmdir_mondata_subdir_allrdtgrp(r, d->id);
+
+	if (is_mbm_enabled())
+		cancel_delayed_work(&d->mbm_over);
+	if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+		/*
+		 * When a package is going down, forcefully
+		 * decrement rmid->ebusy. There is no way to know
+		 * that the L3 was flushed and hence may lead to
+		 * incorrect counts in rare scenarios, but leaving
+		 * the RMID as busy creates RMID leaks if the
+		 * package never comes back.
+		 */
+		__check_limbo(d, true);
+		cancel_delayed_work(&d->cqm_limbo);
+	}
+
+	domain_destroy_mon_state(d);
+}
+
 static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
 {
 	size_t tsize;
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index d512455b4c3a..5d283bdd6162 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -193,5 +193,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
 u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 			    u32 closid, enum resctrl_conf_type type);
 int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
+void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
 
 #endif /* _RESCTRL_H */
-- 
cgit v1.2.3


From 1644dfe727cb042ef7f2e773015747954fd0e746 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:14 +0000
Subject: x86/resctrl: Remove set_mba_sc()s control array re-initialisation

set_mba_sc() enables the 'software controller' to regulate the bandwidth
based on the byte counters. This can be managed entirely in the parts
of resctrl that move to /fs/, without any extra support from the
architecture specific code. set_mba_sc() is called by rdt_enable_ctx()
during mount and unmount. It currently resets the arch code's ctrl_val[]
and mbps_val[] arrays.

The ctrl_val[] was already reset when the domain was created, and by
reset_all_ctrls() when the filesystem was last unmounted. Doing the work
in set_mba_sc() is not necessary as the values are already at their
defaults due to the creation of the domain, or were previously reset
during umount(), or are about to reset during umount().

Add a reset of the mbps_val[] in reset_all_ctrls(), allowing the code in
set_mba_sc() that reaches in to the architecture specific structures to
be removed.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-7-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 5830905a92d2..b32ceff8325a 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1898,18 +1898,12 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
 static int set_mba_sc(bool mba_sc)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
-	struct rdt_hw_domain *hw_dom;
-	struct rdt_domain *d;
 
 	if (!is_mbm_enabled() || !is_mba_linear() ||
 	    mba_sc == is_mba_sc(r))
 		return -EINVAL;
 
 	r->membw.mba_sc = mba_sc;
-	list_for_each_entry(d, &r->domains, list) {
-		hw_dom = resctrl_to_arch_dom(d);
-		setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val);
-	}
 
 	return 0;
 }
@@ -2327,8 +2321,10 @@ static int reset_all_ctrls(struct rdt_resource *r)
 		hw_dom = resctrl_to_arch_dom(d);
 		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
 
-		for (i = 0; i < hw_res->num_closid; i++)
+		for (i = 0; i < hw_res->num_closid; i++) {
 			hw_dom->ctrl_val[i] = r->default_ctrl;
+			hw_dom->mbps_val[i] = MBA_MAX_MBPS;
+		}
 	}
 	cpu = get_cpu();
 	/* Update CBM on this cpu if it's in cpu_mask. */
-- 
cgit v1.2.3


From b045c215866393419fb960432ed6be69a0113ee1 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:15 +0000
Subject: x86/resctrl: Abstract and use supports_mba_mbps()

To determine whether the mba_MBps option to resctrl should be supported,
resctrl tests the boot CPUs' x86_vendor.

This isn't portable, and needs abstracting behind a helper so this check
can be part of the filesystem code that moves to /fs/.

Re-use the tests set_mba_sc() does to determine if the mba_sc is supported
on this system. An 'alloc_capable' test is added so that support for the
controls isn't implied by the 'delay_linear' property, which is always
true for MPAM. Because mbm_update() only update mba_sc if the mbm_local
counters are enabled, supports_mba_mbps() checks is_mbm_local_enabled().
(instead of using is_mbm_enabled(), which checks both).

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-8-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index b32ceff8325a..4ee26264ecfc 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1890,17 +1890,26 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
 }
 
 /*
- * Enable or disable the MBA software controller
- * which helps user specify bandwidth in MBps.
  * MBA software controller is supported only if
  * MBM is supported and MBA is in linear scale.
  */
+static bool supports_mba_mbps(void)
+{
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+
+	return (is_mbm_local_enabled() &&
+		r->alloc_capable && is_mba_linear());
+}
+
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ */
 static int set_mba_sc(bool mba_sc)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
 
-	if (!is_mbm_enabled() || !is_mba_linear() ||
-	    mba_sc == is_mba_sc(r))
+	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
 		return -EINVAL;
 
 	r->membw.mba_sc = mba_sc;
@@ -2255,7 +2264,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		ctx->enable_cdpl2 = true;
 		return 0;
 	case Opt_mba_mbps:
-		if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		if (!supports_mba_mbps())
 			return -EINVAL;
 		ctx->enable_mba_mbps = true;
 		return 0;
-- 
cgit v1.2.3


From 60240bc26114543fcbfcd8a28466e67e77b20388 Mon Sep 17 00:00:00 2001
From: Jalal Mostafa <jalal.a.mostapha@gmail.com>
Date: Wed, 21 Sep 2022 13:57:01 +0000
Subject: xsk: Inherit need_wakeup flag for shared sockets

The flag for need_wakeup is not set for xsks with `XDP_SHARED_UMEM`
flag and of different queue ids and/or devices. They should inherit
the flag from the first socket buffer pool since no flags can be
specified once `XDP_SHARED_UMEM` is specified.

Fixes: b5aea28dca134 ("xsk: Add shared umem support between queue ids")
Signed-off-by: Jalal Mostafa <jalal.a.mostapha@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Link: https://lore.kernel.org/bpf/20220921135701.10199-1-jalal.a.mostapha@gmail.com
---
 include/net/xsk_buff_pool.h | 2 +-
 net/xdp/xsk.c               | 4 ++--
 net/xdp/xsk_buff_pool.c     | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 647722e847b4..f787c3f524b0 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -95,7 +95,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 						struct xdp_umem *umem);
 int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev,
 		  u16 queue_id, u16 flags);
-int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
 			 struct net_device *dev, u16 queue_id);
 int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs);
 void xp_destroy(struct xsk_buff_pool *pool);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 639b2c3beb69..9f0561b67c12 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -954,8 +954,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 				goto out_unlock;
 			}
 
-			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
-						   dev, qid);
+			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
+						   qid);
 			if (err) {
 				xp_destroy(xs->pool);
 				xs->pool = NULL;
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index a71a8c6edf55..ed6c71826d31 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -212,17 +212,18 @@ err_unreg_pool:
 	return err;
 }
 
-int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem,
+int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
 			 struct net_device *dev, u16 queue_id)
 {
 	u16 flags;
+	struct xdp_umem *umem = umem_xs->umem;
 
 	/* One fill and completion ring required for each queue id. */
 	if (!pool->fq || !pool->cq)
 		return -EINVAL;
 
 	flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
-	if (pool->uses_need_wakeup)
+	if (umem_xs->pool->uses_need_wakeup)
 		flags |= XDP_USE_NEED_WAKEUP;
 
 	return xp_assign_dev(pool, dev, queue_id, flags);
-- 
cgit v1.2.3


From 781096d971dfe3c5f9401a300bdb0b148a600584 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:16 +0000
Subject: x86/resctrl: Create mba_sc configuration in the rdt_domain

To support resctrl's MBA software controller, the architecture must provide
a second configuration array to hold the mbps_val[] from user-space.

This complicates the interface between the architecture specific code and
the filesystem portions of resctrl that will move to /fs/, to allow
multiple architectures to support resctrl.

Make the filesystem parts of resctrl create an array for the mba_sc
values. The software controller can be changed to use this, allowing
the architecture code to only consider the values configured in hardware.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-9-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/internal.h |  1 -
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 39 ++++++++++++++++++++++++++++++++++
 include/linux/resctrl.h                |  7 ++++++
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index e12b55f815bf..a7e2cbce29d5 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -36,7 +36,6 @@
 #define MBM_OVERFLOW_INTERVAL		1000
 #define MAX_MBA_BW			100u
 #define MBA_IS_LINEAR			0x4
-#define MBA_MAX_MBPS			U32_MAX
 #define MAX_MBA_BW_AMD			0x800
 #define MBM_CNTR_WIDTH_OFFSET_AMD	20
 
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 4ee26264ecfc..f7ebd019e7a5 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1889,6 +1889,30 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
 		l3_qos_cfg_update(&hw_res->cdp_enabled);
 }
 
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
+{
+	u32 num_closid = resctrl_arch_get_num_closid(r);
+	int cpu = cpumask_any(&d->cpu_mask);
+	int i;
+
+	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
+				   GFP_KERNEL, cpu_to_node(cpu));
+	if (!d->mbps_val)
+		return -ENOMEM;
+
+	for (i = 0; i < num_closid; i++)
+		d->mbps_val[i] = MBA_MAX_MBPS;
+
+	return 0;
+}
+
+static void mba_sc_domain_destroy(struct rdt_resource *r,
+				  struct rdt_domain *d)
+{
+	kfree(d->mbps_val);
+	d->mbps_val = NULL;
+}
+
 /*
  * MBA software controller is supported only if
  * MBM is supported and MBA is in linear scale.
@@ -1908,12 +1932,20 @@ static bool supports_mba_mbps(void)
 static int set_mba_sc(bool mba_sc)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+	u32 num_closid = resctrl_arch_get_num_closid(r);
+	struct rdt_domain *d;
+	int i;
 
 	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
 		return -EINVAL;
 
 	r->membw.mba_sc = mba_sc;
 
+	list_for_each_entry(d, &r->domains, list) {
+		for (i = 0; i < num_closid; i++)
+			d->mbps_val[i] = MBA_MAX_MBPS;
+	}
+
 	return 0;
 }
 
@@ -3247,6 +3279,9 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
 {
 	lockdep_assert_held(&rdtgroup_mutex);
 
+	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+		mba_sc_domain_destroy(r, d);
+
 	if (!r->mon_capable)
 		return;
 
@@ -3311,6 +3346,10 @@ int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
 
 	lockdep_assert_held(&rdtgroup_mutex);
 
+	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+		/* RDT_RESOURCE_MBA is never mon_capable */
+		return mba_sc_domain_allocate(r, d);
+
 	if (!r->mon_capable)
 		return 0;
 
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 5d283bdd6162..93dfe553b364 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -15,6 +15,9 @@ int proc_resctrl_show(struct seq_file *m,
 
 #endif
 
+/* max value for struct rdt_domain's mbps_val */
+#define MBA_MAX_MBPS   U32_MAX
+
 /**
  * enum resctrl_conf_type - The type of configuration.
  * @CDP_NONE:	No prioritisation, both code and data are controlled or monitored.
@@ -53,6 +56,9 @@ struct resctrl_staged_config {
  * @cqm_work_cpu:	worker CPU for CQM h/w counters
  * @plr:		pseudo-locked region (if any) associated with domain
  * @staged_config:	parsed configuration to be applied
+ * @mbps_val:		When mba_sc is enabled, this holds the array of user
+ *			specified control values for mba_sc in MBps, indexed
+ *			by closid
  */
 struct rdt_domain {
 	struct list_head		list;
@@ -67,6 +73,7 @@ struct rdt_domain {
 	int				cqm_work_cpu;
 	struct pseudo_lock_region	*plr;
 	struct resctrl_staged_config	staged_config[CDP_NUM_TYPES];
+	u32				*mbps_val;
 };
 
 /**
-- 
cgit v1.2.3


From 6ce1560d35f63a458fead11ac865bc39cea9bc46 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:17 +0000
Subject: x86/resctrl: Switch over to the resctrl mbps_val list

Updates to resctrl's software controller follow the same path as
other configuration updates, but they don't modify the hardware state.
rdtgroup_schemata_write() uses parse_line() and the resource's
parse_ctrlval() function to stage the configuration.
resctrl_arch_update_domains() then updates the mbps_val[] array
instead, and resctrl_arch_update_domains() skips the rdt_ctrl_update()
call that would update hardware.

This complicates the interface between resctrl's filesystem parts
and architecture specific code. It should be possible for mba_sc
to be completely implemented by the filesystem parts of resctrl. This
would allow it to work on a second architecture with no additional code.
resctrl_arch_update_domains() using the mbps_val[] array prevents this.

Change parse_bw() to write the configuration value directly to the
mbps_val[] array in the domain structure. Change rdtgroup_schemata_write()
to skip the call to resctrl_arch_update_domains(), meaning all the
mba_sc specific code in resctrl_arch_update_domains() can be removed.
On the read-side, show_doms() and update_mba_bw() are changed to read
the mbps_val[] array from the domain structure. With this,
resctrl_arch_get_config() no longer needs to consider mba_sc resources.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-10-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 44 +++++++++++++++++++------------
 arch/x86/kernel/cpu/resctrl/monitor.c     | 10 +++----
 arch/x86/kernel/cpu/resctrl/rdtgroup.c    | 27 ++++++++++++++-----
 3 files changed, 52 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 87666275eed9..bf9d73c5be14 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -61,6 +61,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
 	     struct rdt_domain *d)
 {
 	struct resctrl_staged_config *cfg;
+	u32 closid = data->rdtgrp->closid;
 	struct rdt_resource *r = s->res;
 	unsigned long bw_val;
 
@@ -72,6 +73,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
 
 	if (!bw_validate(data->buf, &bw_val, r))
 		return -EINVAL;
+
+	if (is_mba_sc(r)) {
+		d->mbps_val[closid] = bw_val;
+		return 0;
+	}
+
 	cfg->new_ctrl = bw_val;
 	cfg->have_new_ctrl = true;
 
@@ -261,14 +268,13 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
 
 static bool apply_config(struct rdt_hw_domain *hw_dom,
 			 struct resctrl_staged_config *cfg, u32 idx,
-			 cpumask_var_t cpu_mask, bool mba_sc)
+			 cpumask_var_t cpu_mask)
 {
 	struct rdt_domain *dom = &hw_dom->d_resctrl;
-	u32 *dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val;
 
-	if (cfg->new_ctrl != dc[idx]) {
+	if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
 		cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
-		dc[idx] = cfg->new_ctrl;
+		hw_dom->ctrl_val[idx] = cfg->new_ctrl;
 
 		return true;
 	}
@@ -284,14 +290,12 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
 	enum resctrl_conf_type t;
 	cpumask_var_t cpu_mask;
 	struct rdt_domain *d;
-	bool mba_sc;
 	int cpu;
 	u32 idx;
 
 	if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
 		return -ENOMEM;
 
-	mba_sc = is_mba_sc(r);
 	msr_param.res = NULL;
 	list_for_each_entry(d, &r->domains, list) {
 		hw_dom = resctrl_to_arch_dom(d);
@@ -301,7 +305,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
 				continue;
 
 			idx = get_config_index(closid, t);
-			if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc))
+			if (!apply_config(hw_dom, cfg, idx, cpu_mask))
 				continue;
 
 			if (!msr_param.res) {
@@ -315,11 +319,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
 		}
 	}
 
-	/*
-	 * Avoid writing the control msr with control values when
-	 * MBA software controller is enabled
-	 */
-	if (cpumask_empty(cpu_mask) || mba_sc)
+	if (cpumask_empty(cpu_mask))
 		goto done;
 	cpu = get_cpu();
 	/* Update resource control msr on this CPU if it's in cpu_mask. */
@@ -406,6 +406,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
 	list_for_each_entry(s, &resctrl_schema_all, list) {
 		r = s->res;
+
+		/*
+		 * Writes to mba_sc resources update the software controller,
+		 * not the control MSR.
+		 */
+		if (is_mba_sc(r))
+			continue;
+
 		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
 		if (ret)
 			goto out;
@@ -433,9 +441,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	u32 idx = get_config_index(closid, type);
 
-	if (!is_mba_sc(r))
-		return hw_dom->ctrl_val[idx];
-	return hw_dom->mbps_val[idx];
+	return hw_dom->ctrl_val[idx];
 }
 
 static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
@@ -450,8 +456,12 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
 		if (sep)
 			seq_puts(s, ";");
 
-		ctrl_val = resctrl_arch_get_config(r, dom, closid,
-						   schema->conf_type);
+		if (is_mba_sc(r))
+			ctrl_val = dom->mbps_val[closid];
+		else
+			ctrl_val = resctrl_arch_get_config(r, dom, closid,
+							   schema->conf_type);
+
 		seq_printf(s, r->format_str, dom->id, max_data_width,
 			   ctrl_val);
 		sep = true;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 497cadf3285d..16028b2f756a 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -447,13 +447,11 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	hw_dom_mba = resctrl_to_arch_dom(dom_mba);
 
 	cur_bw = pmbm_data->prev_bw;
-	user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
+	user_bw = dom_mba->mbps_val[closid];
 	delta_bw = pmbm_data->delta_bw;
-	/*
-	 * resctrl_arch_get_config() chooses the mbps/ctrl value to return
-	 * based on is_mba_sc(). For now, reach into the hw_dom.
-	 */
-	cur_msr_val = hw_dom_mba->ctrl_val[closid];
+
+	/* MBA resource doesn't support CDP */
+	cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
 
 	/*
 	 * For Ctrl groups read data from child monitor groups.
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f7ebd019e7a5..55d8a12287c3 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1356,11 +1356,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
 			      struct seq_file *s, void *v)
 {
 	struct resctrl_schema *schema;
+	enum resctrl_conf_type type;
 	struct rdtgroup *rdtgrp;
 	struct rdt_resource *r;
 	struct rdt_domain *d;
 	unsigned int size;
 	int ret = 0;
+	u32 closid;
 	bool sep;
 	u32 ctrl;
 
@@ -1386,8 +1388,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
 		goto out;
 	}
 
+	closid = rdtgrp->closid;
+
 	list_for_each_entry(schema, &resctrl_schema_all, list) {
 		r = schema->res;
+		type = schema->conf_type;
 		sep = false;
 		seq_printf(s, "%*s:", max_name_width, schema->name);
 		list_for_each_entry(d, &r->domains, list) {
@@ -1396,9 +1401,12 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
 			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
 				size = 0;
 			} else {
-				ctrl = resctrl_arch_get_config(r, d,
-							       rdtgrp->closid,
-							       schema->conf_type);
+				if (is_mba_sc(r))
+					ctrl = d->mbps_val[closid];
+				else
+					ctrl = resctrl_arch_get_config(r, d,
+								       closid,
+								       type);
 				if (r->rid == RDT_RESOURCE_MBA)
 					size = ctrl;
 				else
@@ -2818,14 +2826,19 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
 }
 
 /* Initialize MBA resource with default values. */
-static void rdtgroup_init_mba(struct rdt_resource *r)
+static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
 {
 	struct resctrl_staged_config *cfg;
 	struct rdt_domain *d;
 
 	list_for_each_entry(d, &r->domains, list) {
+		if (is_mba_sc(r)) {
+			d->mbps_val[closid] = MBA_MAX_MBPS;
+			continue;
+		}
+
 		cfg = &d->staged_config[CDP_NONE];
-		cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
+		cfg->new_ctrl = r->default_ctrl;
 		cfg->have_new_ctrl = true;
 	}
 }
@@ -2840,7 +2853,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
 	list_for_each_entry(s, &resctrl_schema_all, list) {
 		r = s->res;
 		if (r->rid == RDT_RESOURCE_MBA) {
-			rdtgroup_init_mba(r);
+			rdtgroup_init_mba(r, rdtgrp->closid);
+			if (is_mba_sc(r))
+				continue;
 		} else {
 			ret = rdtgroup_init_cat(s, rdtgrp->closid);
 			if (ret < 0)
-- 
cgit v1.2.3


From b58d4eb1f199f5a26d8c756d8e74a31c48b90428 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:18 +0000
Subject: x86/resctrl: Remove architecture copy of mbps_val

The resctrl arch code provides a second configuration array mbps_val[]
for the MBA software controller.

Since resctrl switched over to allocating and freeing its own array
when needed, nothing uses the arch code version.

Remove it.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-11-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c     | 20 ++++----------------
 arch/x86/kernel/cpu/resctrl/internal.h |  3 ---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  4 +---
 3 files changed, 5 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index f69182973175..f0e2820af475 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -397,7 +397,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
 	return NULL;
 }
 
-void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	int i;
@@ -406,18 +406,14 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
 	 * Initialize the Control MSRs to having no control.
 	 * For Cache Allocation: Set all bits in cbm
 	 * For Memory Allocation: Set b/w requested to 100%
-	 * and the bandwidth in MBps to U32_MAX
 	 */
-	for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) {
+	for (i = 0; i < hw_res->num_closid; i++, dc++)
 		*dc = r->default_ctrl;
-		*dm = MBA_MAX_MBPS;
-	}
 }
 
 static void domain_free(struct rdt_hw_domain *hw_dom)
 {
 	kfree(hw_dom->ctrl_val);
-	kfree(hw_dom->mbps_val);
 	kfree(hw_dom);
 }
 
@@ -426,23 +422,15 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	struct msr_param m;
-	u32 *dc, *dm;
+	u32 *dc;
 
 	dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
 			   GFP_KERNEL);
 	if (!dc)
 		return -ENOMEM;
 
-	dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val),
-			   GFP_KERNEL);
-	if (!dm) {
-		kfree(dc);
-		return -ENOMEM;
-	}
-
 	hw_dom->ctrl_val = dc;
-	hw_dom->mbps_val = dm;
-	setup_default_ctrlval(r, dc, dm);
+	setup_default_ctrlval(r, dc);
 
 	m.low = 0;
 	m.high = hw_res->num_closid;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a7e2cbce29d5..373aaba53ecd 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -308,14 +308,12 @@ struct mbm_state {
  *			  a resource
  * @d_resctrl:	Properties exposed to the resctrl file system
  * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
- * @mbps_val:	When mba_sc is enabled, this holds the bandwidth in MBps
  *
  * Members of this structure are accessed via helpers that provide abstraction.
  */
 struct rdt_hw_domain {
 	struct rdt_domain		d_resctrl;
 	u32				*ctrl_val;
-	u32				*mbps_val;
 };
 
 static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
@@ -529,7 +527,6 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 void __init intel_rdt_mbm_apply_quirk(void);
 bool is_mba_sc(struct rdt_resource *r);
-void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
 u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 55d8a12287c3..6c33dfe7ea53 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -2370,10 +2370,8 @@ static int reset_all_ctrls(struct rdt_resource *r)
 		hw_dom = resctrl_to_arch_dom(d);
 		cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
 
-		for (i = 0; i < hw_res->num_closid; i++) {
+		for (i = 0; i < hw_res->num_closid; i++)
 			hw_dom->ctrl_val[i] = r->default_ctrl;
-			hw_dom->mbps_val[i] = MBA_MAX_MBPS;
-		}
 	}
 	cpu = get_cpu();
 	/* Update CBM on this cpu if it's in cpu_mask. */
-- 
cgit v1.2.3


From ff6357bb50023af2a1dc8f113930082c5252c753 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:19 +0000
Subject: x86/resctrl: Allow update_mba_bw() to update controls directly

update_mba_bw() calculates a new control value for the MBA resource
based on the user provided mbps_val and the current measured
bandwidth. Some control values need remapping by delay_bw_map().

It does this by calling wrmsrl() directly. This needs splitting
up to be done by an architecture specific helper, so that the
remainder can eventually be moved to /fs/.

Add resctrl_arch_update_one() to apply one configuration value
to the provided resource and domain. This avoids the staging
and cross-calling that is only needed with changes made by
user-space. delay_bw_map() moves to be part of the arch code,
to maintain the 'percentage control' view of MBA resources
in resctrl.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-12-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c        |  2 +-
 arch/x86/kernel/cpu/resctrl/ctrlmondata.c | 21 +++++++++++++++++++++
 arch/x86/kernel/cpu/resctrl/internal.h    |  1 -
 arch/x86/kernel/cpu/resctrl/monitor.c     | 13 ++++---------
 include/linux/resctrl.h                   |  8 ++++++++
 5 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index f0e2820af475..90ebb7d71af2 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -296,7 +296,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index bf9d73c5be14..0ab92320de71 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -282,6 +282,27 @@ static bool apply_config(struct rdt_hw_domain *hw_dom,
 	return false;
 }
 
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
+			    u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	u32 idx = get_config_index(closid, t);
+	struct msr_param msr_param;
+
+	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+		return -EINVAL;
+
+	hw_dom->ctrl_val[idx] = cfg_val;
+
+	msr_param.res = r;
+	msr_param.low = idx;
+	msr_param.high = idx + 1;
+	hw_res->msr_update(d, &msr_param, r);
+
+	return 0;
+}
+
 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
 {
 	struct resctrl_staged_config *cfg;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 373aaba53ecd..3b9e43ba7590 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -527,7 +527,6 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 void __init intel_rdt_mbm_apply_quirk(void);
 bool is_mba_sc(struct rdt_resource *r);
-u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 16028b2f756a..3e69386cfe00 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -420,10 +420,8 @@ void mon_event_count(void *info)
  */
 static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 {
-	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	u32 closid, rmid, cur_msr_val, new_msr_val;
 	struct mbm_state *pmbm_data, *cmbm_data;
-	struct rdt_hw_resource *hw_r_mba;
-	struct rdt_hw_domain *hw_dom_mba;
 	u32 cur_bw, delta_bw, user_bw;
 	struct rdt_resource *r_mba;
 	struct rdt_domain *dom_mba;
@@ -433,8 +431,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 	if (!is_mbm_local_enabled())
 		return;
 
-	hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
-	r_mba = &hw_r_mba->r_resctrl;
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+
 	closid = rgrp->closid;
 	rmid = rgrp->mon.rmid;
 	pmbm_data = &dom_mbm->mbm_local[rmid];
@@ -444,7 +442,6 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 		pr_warn_once("Failure to get domain for MBA update\n");
 		return;
 	}
-	hw_dom_mba = resctrl_to_arch_dom(dom_mba);
 
 	cur_bw = pmbm_data->prev_bw;
 	user_bw = dom_mba->mbps_val[closid];
@@ -486,9 +483,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
 		return;
 	}
 
-	cur_msr = hw_r_mba->msr_base + closid;
-	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
-	hw_dom_mba->ctrl_val[closid] = new_msr_val;
+	resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
 
 	/*
 	 * Delta values are updated dynamically package wise for each
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 93dfe553b364..f4c9101df461 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -197,6 +197,14 @@ struct resctrl_schema {
 /* The number of closid supported by this resource regardless of CDP */
 u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
+
+/*
+ * Update the ctrl_val and apply this config right now.
+ * Must be called on one of the domain's CPUs.
+ */
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
+			    u32 closid, enum resctrl_conf_type t, u32 cfg_val);
+
 u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 			    u32 closid, enum resctrl_conf_type type);
 int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
-- 
cgit v1.2.3


From 30442571ec81fb33f7bd8cea5a14afb10b8f442a Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:20 +0000
Subject: x86/resctrl: Calculate bandwidth from the previous
 __mon_event_count() chunks

mbm_bw_count() is only called by the mbm_handle_overflow() worker once a
second. It reads the hardware register, calculates the bandwidth and
updates m->prev_bw_msr which is used to hold the previous hardware register
value.

Operating directly on hardware register values makes it difficult to make
this code architecture independent, so that it can be moved to /fs/,
making the mba_sc feature something resctrl supports with no additional
support from the architecture.
Prior to calling mbm_bw_count(), mbm_update() reads from the same hardware
register using __mon_event_count().

Change mbm_bw_count() to use the current chunks value most recently saved
by __mon_event_count(). This removes an extra call to __rmid_read().
Instead of using m->prev_msr to calculate the number of chunks seen,
use the rr->val that was updated by __mon_event_count(). This removes an
extra call to mbm_overflow_count() and get_corrected_mbm_count().
Calculating bandwidth like this means mbm_bw_count() no longer operates
on hardware register values directly.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-13-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/internal.h |  4 ++--
 arch/x86/kernel/cpu/resctrl/monitor.c  | 25 ++++++++++++++++---------
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 3b9e43ba7590..46062099d69e 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -289,7 +289,7 @@ struct rftype {
  * struct mbm_state - status for each MBM counter in each domain
  * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
  * @prev_msr:	Value of IA32_QM_CTR for this RMID last time we read it
- * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw_chunks: Previous chunks value read for bandwidth calculation
  * @prev_bw:	The most recent bandwidth in MBps
  * @delta_bw:	Difference between the current and previous bandwidth
  * @delta_comp:	Indicates whether to compute the delta_bw
@@ -297,7 +297,7 @@ struct rftype {
 struct mbm_state {
 	u64	chunks;
 	u64	prev_msr;
-	u64	prev_bw_msr;
+	u64	prev_bw_chunks;
 	u32	prev_bw;
 	u32	delta_bw;
 	bool	delta_comp;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 3e69386cfe00..2d81b6cd9632 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -315,7 +315,7 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
 
 	if (rr->first) {
 		memset(m, 0, sizeof(struct mbm_state));
-		m->prev_bw_msr = m->prev_msr = tval;
+		m->prev_msr = tval;
 		return 0;
 	}
 
@@ -329,27 +329,32 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
 }
 
 /*
+ * mbm_bw_count() - Update bw count from values previously read by
+ *		    __mon_event_count().
+ * @rmid:	The rmid used to identify the cached mbm_state.
+ * @rr:		The struct rmid_read populated by __mon_event_count().
+ *
  * Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps.
+ * and delta bandwidth in MBps. The chunks value previously read by
+ * __mon_event_count() is compared with the chunks value from the previous
+ * invocation. This must be called once per second to maintain values in MBps.
  */
 static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
 	struct mbm_state *m = &rr->d->mbm_local[rmid];
-	u64 tval, cur_bw, chunks;
+	u64 cur_bw, chunks, cur_chunks;
 
-	tval = __rmid_read(rmid, rr->evtid);
-	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return;
+	cur_chunks = rr->val;
+	chunks = cur_chunks - m->prev_bw_chunks;
+	m->prev_bw_chunks = cur_chunks;
 
-	chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width);
-	cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20;
+	cur_bw = (chunks * hw_res->mon_scale) >> 20;
 
 	if (m->delta_comp)
 		m->delta_bw = abs(cur_bw - m->prev_bw);
 	m->delta_comp = false;
 	m->prev_bw = cur_bw;
-	m->prev_bw_msr = tval;
 }
 
 /*
@@ -516,10 +521,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
 	 */
 	if (is_mbm_total_enabled()) {
 		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+		rr.val = 0;
 		__mon_event_count(rmid, &rr);
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+		rr.val = 0;
 		__mon_event_count(rmid, &rr);
 
 		/*
-- 
cgit v1.2.3


From 48dbe31a243d5fc7c07b7f03b48e95ec4696b118 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:21 +0000
Subject: x86/resctrl: Add per-rmid arch private storage for overflow and
 chunks

A renamed __rmid_read() is intended as the function that an
architecture agnostic resctrl filesystem driver can use to
read a value in bytes from a counter. Currently the function returns
the MBM values in chunks directly from hardware. For bandwidth
counters the resctrl filesystem uses this to calculate the number of
bytes ever seen.

MPAM's scaling of counters can be changed at runtime, reducing the
resolution but increasing the range. When this is changed the prev_msr
values need to be converted by the architecture code.

Add an array for per-rmid private storage. The prev_msr and chunks
values will move here to allow resctrl_arch_rmid_read() to always
return the number of bytes read by this counter without assistance
from the filesystem. The values are moved in later patches when
the overflow and correction calls are moved into __rmid_read().

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-14-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/core.c     | 35 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/resctrl/internal.h | 14 ++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 90ebb7d71af2..de62b0b87ced 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -413,6 +413,8 @@ static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
 
 static void domain_free(struct rdt_hw_domain *hw_dom)
 {
+	kfree(hw_dom->arch_mbm_total);
+	kfree(hw_dom->arch_mbm_local);
 	kfree(hw_dom->ctrl_val);
 	kfree(hw_dom);
 }
@@ -438,6 +440,34 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 	return 0;
 }
 
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid:	The size of the MBM counter array
+ * @hw_dom:	The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
+{
+	size_t tsize;
+
+	if (is_mbm_total_enabled()) {
+		tsize = sizeof(*hw_dom->arch_mbm_total);
+		hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
+		if (!hw_dom->arch_mbm_total)
+			return -ENOMEM;
+	}
+	if (is_mbm_local_enabled()) {
+		tsize = sizeof(*hw_dom->arch_mbm_local);
+		hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
+		if (!hw_dom->arch_mbm_local) {
+			kfree(hw_dom->arch_mbm_total);
+			hw_dom->arch_mbm_total = NULL;
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
 /*
  * domain_add_cpu - Add a cpu to a resource's domain list.
  *
@@ -487,6 +517,11 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 		return;
 	}
 
+	if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+		domain_free(hw_dom);
+		return;
+	}
+
 	list_add_tail(&d->list, add_pos);
 
 	err = resctrl_online_domain(r, d);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 46062099d69e..4de8e5bb93e1 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -303,17 +303,31 @@ struct mbm_state {
 	bool	delta_comp;
 };
 
+/**
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
+ *			   return value.
+ * @prev_msr:	Value of IA32_QM_CTR last time it was read for the RMID used to
+ *		find this struct.
+ */
+struct arch_mbm_state {
+	u64	prev_msr;
+};
+
 /**
  * struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
  *			  a resource
  * @d_resctrl:	Properties exposed to the resctrl file system
  * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
+ * @arch_mbm_total:	arch private state for MBM total bandwidth
+ * @arch_mbm_local:	arch private state for MBM local bandwidth
  *
  * Members of this structure are accessed via helpers that provide abstraction.
  */
 struct rdt_hw_domain {
 	struct rdt_domain		d_resctrl;
 	u32				*ctrl_val;
+	struct arch_mbm_state		*arch_mbm_total;
+	struct arch_mbm_state		*arch_mbm_local;
 };
 
 static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
-- 
cgit v1.2.3


From 562163595a912cfe99f9ed79a67ccd23a11a5cc3 Mon Sep 17 00:00:00 2001
From: Liu Shixin <liushixin2@huawei.com>
Date: Fri, 9 Sep 2022 17:56:24 +0800
Subject: ACPI: HMAT: Drop unused dev_fmt() and redundant 'HMAT' prefix

Remove unused macro dev_pmt() and redundant 'HMAT' prefix from
pr_*() calls.

Signed-off-by: Liu Shixin <liushixin2@huawei.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/numa/hmat.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c
index c3d783aca196..23f49a2f4d14 100644
--- a/drivers/acpi/numa/hmat.c
+++ b/drivers/acpi/numa/hmat.c
@@ -9,7 +9,6 @@
  */
 
 #define pr_fmt(fmt) "acpi/hmat: " fmt
-#define dev_fmt(fmt) "acpi/hmat: " fmt
 
 #include <linux/acpi.h>
 #include <linux/bitops.h>
@@ -302,7 +301,7 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header,
 	u8 type, mem_hier;
 
 	if (hmat_loc->header.length < sizeof(*hmat_loc)) {
-		pr_notice("HMAT: Unexpected locality header length: %u\n",
+		pr_notice("Unexpected locality header length: %u\n",
 			 hmat_loc->header.length);
 		return -EINVAL;
 	}
@@ -314,12 +313,12 @@ static __init int hmat_parse_locality(union acpi_subtable_headers *header,
 	total_size = sizeof(*hmat_loc) + sizeof(*entries) * ipds * tpds +
 		     sizeof(*inits) * ipds + sizeof(*targs) * tpds;
 	if (hmat_loc->header.length < total_size) {
-		pr_notice("HMAT: Unexpected locality header length:%u, minimum required:%u\n",
+		pr_notice("Unexpected locality header length:%u, minimum required:%u\n",
 			 hmat_loc->header.length, total_size);
 		return -EINVAL;
 	}
 
-	pr_info("HMAT: Locality: Flags:%02x Type:%s Initiator Domains:%u Target Domains:%u Base:%lld\n",
+	pr_info("Locality: Flags:%02x Type:%s Initiator Domains:%u Target Domains:%u Base:%lld\n",
 		hmat_loc->flags, hmat_data_type(type), ipds, tpds,
 		hmat_loc->entry_base_unit);
 
@@ -363,13 +362,13 @@ static __init int hmat_parse_cache(union acpi_subtable_headers *header,
 	u32 attrs;
 
 	if (cache->header.length < sizeof(*cache)) {
-		pr_notice("HMAT: Unexpected cache header length: %u\n",
+		pr_notice("Unexpected cache header length: %u\n",
 			 cache->header.length);
 		return -EINVAL;
 	}
 
 	attrs = cache->cache_attributes;
-	pr_info("HMAT: Cache: Domain:%u Size:%llu Attrs:%08x SMBIOS Handles:%d\n",
+	pr_info("Cache: Domain:%u Size:%llu Attrs:%08x SMBIOS Handles:%d\n",
 		cache->memory_PD, cache->cache_size, attrs,
 		cache->number_of_SMBIOShandles);
 
@@ -424,24 +423,24 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade
 	struct memory_target *target = NULL;
 
 	if (p->header.length != sizeof(*p)) {
-		pr_notice("HMAT: Unexpected address range header length: %u\n",
+		pr_notice("Unexpected address range header length: %u\n",
 			 p->header.length);
 		return -EINVAL;
 	}
 
 	if (hmat_revision == 1)
-		pr_info("HMAT: Memory (%#llx length %#llx) Flags:%04x Processor Domain:%u Memory Domain:%u\n",
+		pr_info("Memory (%#llx length %#llx) Flags:%04x Processor Domain:%u Memory Domain:%u\n",
 			p->reserved3, p->reserved4, p->flags, p->processor_PD,
 			p->memory_PD);
 	else
-		pr_info("HMAT: Memory Flags:%04x Processor Domain:%u Memory Domain:%u\n",
+		pr_info("Memory Flags:%04x Processor Domain:%u Memory Domain:%u\n",
 			p->flags, p->processor_PD, p->memory_PD);
 
 	if ((hmat_revision == 1 && p->flags & ACPI_HMAT_MEMORY_PD_VALID) ||
 	    hmat_revision > 1) {
 		target = find_mem_target(p->memory_PD);
 		if (!target) {
-			pr_debug("HMAT: Memory Domain missing from SRAT\n");
+			pr_debug("Memory Domain missing from SRAT\n");
 			return -EINVAL;
 		}
 	}
@@ -449,7 +448,7 @@ static int __init hmat_parse_proximity_domain(union acpi_subtable_headers *heade
 		int p_node = pxm_to_node(p->processor_PD);
 
 		if (p_node == NUMA_NO_NODE) {
-			pr_debug("HMAT: Invalid Processor Domain\n");
+			pr_debug("Invalid Processor Domain\n");
 			return -EINVAL;
 		}
 		target->processor_pxm = p->processor_PD;
@@ -840,7 +839,7 @@ static __init int hmat_init(void)
 	case 2:
 		break;
 	default:
-		pr_notice("Ignoring HMAT: Unknown revision:%d\n", hmat_revision);
+		pr_notice("Ignoring: Unknown revision:%d\n", hmat_revision);
 		goto out_put;
 	}
 
@@ -848,7 +847,7 @@ static __init int hmat_init(void)
 		if (acpi_table_parse_entries(ACPI_SIG_HMAT,
 					     sizeof(struct acpi_table_hmat), i,
 					     hmat_parse_subtable, 0) < 0) {
-			pr_notice("Ignoring HMAT: Invalid table");
+			pr_notice("Ignoring: Invalid table");
 			goto out_put;
 		}
 	}
-- 
cgit v1.2.3


From f890157e61b85ce8ae01a41ffa375e3b99853698 Mon Sep 17 00:00:00 2001
From: Rafael Mendonca <rafaelmendsr@gmail.com>
Date: Fri, 9 Sep 2022 12:33:19 -0300
Subject: ACPI: PCC: Release resources on address space setup failure path

The allocated memory for the pcc_data struct doesn't get freed under an
error path in pcc_mbox_request_channel() or acpi_os_ioremap(). Also, the
PCC mailbox channel doesn't get freed under an error path in
acpi_os_ioremap().

Fixes: 77e2a04745ff8 ("ACPI: PCC: Implement OperationRegion handler for the PCC Type 3 subtype")
Signed-off-by: Rafael Mendonca <rafaelmendsr@gmail.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_pcc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/acpi/acpi_pcc.c b/drivers/acpi/acpi_pcc.c
index a12b55d81209..84f1ac416b57 100644
--- a/drivers/acpi/acpi_pcc.c
+++ b/drivers/acpi/acpi_pcc.c
@@ -63,6 +63,7 @@ acpi_pcc_address_space_setup(acpi_handle region_handle, u32 function,
 	if (IS_ERR(data->pcc_chan)) {
 		pr_err("Failed to find PCC channel for subspace %d\n",
 		       ctx->subspace_id);
+		kfree(data);
 		return AE_NOT_FOUND;
 	}
 
@@ -72,6 +73,8 @@ acpi_pcc_address_space_setup(acpi_handle region_handle, u32 function,
 	if (!data->pcc_comm_addr) {
 		pr_err("Failed to ioremap PCC comm region mem for %d\n",
 		       ctx->subspace_id);
+		pcc_mbox_free_channel(data->pcc_chan);
+		kfree(data);
 		return AE_NO_MEMORY;
 	}
 
-- 
cgit v1.2.3


From 91cefefb699120efd0a5ba345d12626b688f86ce Mon Sep 17 00:00:00 2001
From: Huisong Li <lihuisong@huawei.com>
Date: Tue, 20 Sep 2022 17:44:59 +0800
Subject: ACPI: PCC: replace wait_for_completion()

Currently, the function waiting for completion of mailbox operation is
'wait_for_completion()'.  The PCC method will be permanently blocked if
this mailbox message fails to execute. So this patch replaces it with
'wait_for_completion_timeout()'. And set the timeout interval to an
arbitrary retries on top of nominal to prevent the remote processor is
slow to respond to PCC commands.

Fixes: 77e2a04745ff ("ACPI: PCC: Implement OperationRegion handler for the PCC Type 3 subtype")
Signed-off-by: Huisong Li <lihuisong@huawei.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_pcc.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/acpi_pcc.c b/drivers/acpi/acpi_pcc.c
index 84f1ac416b57..16ba875e3293 100644
--- a/drivers/acpi/acpi_pcc.c
+++ b/drivers/acpi/acpi_pcc.c
@@ -23,6 +23,12 @@
 
 #include <acpi/pcc.h>
 
+/*
+ * Arbitrary retries in case the remote processor is slow to respond
+ * to PCC commands
+ */
+#define PCC_CMD_WAIT_RETRIES_NUM	500
+
 struct pcc_data {
 	struct pcc_mbox_chan *pcc_chan;
 	void __iomem *pcc_comm_addr;
@@ -89,6 +95,7 @@ acpi_pcc_address_space_handler(u32 function, acpi_physical_address addr,
 {
 	int ret;
 	struct pcc_data *data = region_context;
+	u64 usecs_lat;
 
 	reinit_completion(&data->done);
 
@@ -99,8 +106,20 @@ acpi_pcc_address_space_handler(u32 function, acpi_physical_address addr,
 	if (ret < 0)
 		return AE_ERROR;
 
-	if (data->pcc_chan->mchan->mbox->txdone_irq)
-		wait_for_completion(&data->done);
+	if (data->pcc_chan->mchan->mbox->txdone_irq) {
+		/*
+		 * pcc_chan->latency is just a Nominal value. In reality the remote
+		 * processor could be much slower to reply. So add an arbitrary
+		 * amount of wait on top of Nominal.
+		 */
+		usecs_lat = PCC_CMD_WAIT_RETRIES_NUM * data->pcc_chan->latency;
+		ret = wait_for_completion_timeout(&data->done,
+						  usecs_to_jiffies(usecs_lat));
+		if (ret == 0) {
+			pr_err("PCC command executed timeout!\n");
+			return AE_TIME;
+		}
+	}
 
 	mbox_client_txdone(data->pcc_chan->mchan, ret);
 
-- 
cgit v1.2.3


From 18729106c26fb97d4c9ae63ba7aba9889a058dc4 Mon Sep 17 00:00:00 2001
From: Huisong Li <lihuisong@huawei.com>
Date: Tue, 20 Sep 2022 17:45:00 +0800
Subject: ACPI: PCC: Fix Tx acknowledge in the PCC address space handler

Currently, mbox_client_txdone() is called from the PCC address space
handler and that expects the user the Tx state machine to be controlled
by the client which is not the case and the below warning is thrown:

  | PCCT: Client can't run the TX ticker

Let the controller run the state machine and the end of Tx can be
acknowledge by calling mbox_chan_txdone() instead.

Fixes: 77e2a04745ff ("ACPI: PCC: Implement OperationRegion handler for the PCC Type 3 subtype")
Signed-off-by: Huisong Li <lihuisong@huawei.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_pcc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/acpi_pcc.c b/drivers/acpi/acpi_pcc.c
index 16ba875e3293..ee4ce5ba1fb2 100644
--- a/drivers/acpi/acpi_pcc.c
+++ b/drivers/acpi/acpi_pcc.c
@@ -121,7 +121,7 @@ acpi_pcc_address_space_handler(u32 function, acpi_physical_address addr,
 		}
 	}
 
-	mbox_client_txdone(data->pcc_chan->mchan, ret);
+	mbox_chan_txdone(data->pcc_chan->mchan, ret);
 
 	memcpy_fromio(value, data->pcc_comm_addr, data->ctx.length);
 
-- 
cgit v1.2.3


From 63f534b8bad91d46d2d1fbc76b97be521d9b5e03 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 10 Sep 2022 15:23:46 +0200
Subject: ACPI: PCI: Rework acpi_get_pci_dev()

The PCI device returned by acpi_get_pci_dev() needs to be registered,
so if it corresponds to an ACPI device object, the struct acpi_device
representing that object must be registered too and, moreover, it
should be the ACPI companion of the given PCI device.  Thus it should
be sufficient to look for it in the ACPI device object's list of
physical nodes associated with it.

Modify the code accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/acpi/pci_root.c | 75 ++++++++-----------------------------------------
 1 file changed, 12 insertions(+), 63 deletions(-)

diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index d57cf8454b93..c8385ef54c37 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -312,76 +312,25 @@ struct acpi_handle_node {
  */
 struct pci_dev *acpi_get_pci_dev(acpi_handle handle)
 {
-	int dev, fn;
-	unsigned long long adr;
-	acpi_status status;
-	acpi_handle phandle;
-	struct pci_bus *pbus;
-	struct pci_dev *pdev = NULL;
-	struct acpi_handle_node *node, *tmp;
-	struct acpi_pci_root *root;
-	LIST_HEAD(device_list);
-
-	/*
-	 * Walk up the ACPI CA namespace until we reach a PCI root bridge.
-	 */
-	phandle = handle;
-	while (!acpi_is_root_bridge(phandle)) {
-		node = kzalloc(sizeof(struct acpi_handle_node), GFP_KERNEL);
-		if (!node)
-			goto out;
-
-		INIT_LIST_HEAD(&node->node);
-		node->handle = phandle;
-		list_add(&node->node, &device_list);
-
-		status = acpi_get_parent(phandle, &phandle);
-		if (ACPI_FAILURE(status))
-			goto out;
-	}
-
-	root = acpi_pci_find_root(phandle);
-	if (!root)
-		goto out;
+	struct acpi_device *adev = acpi_fetch_acpi_dev(handle);
+	struct acpi_device_physical_node *pn;
+	struct pci_dev *pci_dev = NULL;
 
-	pbus = root->bus;
-
-	/*
-	 * Now, walk back down the PCI device tree until we return to our
-	 * original handle. Assumes that everything between the PCI root
-	 * bridge and the device we're looking for must be a P2P bridge.
-	 */
-	list_for_each_entry(node, &device_list, node) {
-		acpi_handle hnd = node->handle;
-		status = acpi_evaluate_integer(hnd, "_ADR", NULL, &adr);
-		if (ACPI_FAILURE(status))
-			goto out;
-		dev = (adr >> 16) & 0xffff;
-		fn  = adr & 0xffff;
-
-		pdev = pci_get_slot(pbus, PCI_DEVFN(dev, fn));
-		if (!pdev || hnd == handle)
-			break;
+	if (!adev)
+		return NULL;
 
-		pbus = pdev->subordinate;
-		pci_dev_put(pdev);
+	mutex_lock(&adev->physical_node_lock);
 
-		/*
-		 * This function may be called for a non-PCI device that has a
-		 * PCI parent (eg. a disk under a PCI SATA controller).  In that
-		 * case pdev->subordinate will be NULL for the parent.
-		 */
-		if (!pbus) {
-			dev_dbg(&pdev->dev, "Not a PCI-to-PCI bridge\n");
-			pdev = NULL;
+	list_for_each_entry(pn, &adev->physical_node_list, node) {
+		if (dev_is_pci(pn->dev)) {
+			pci_dev = to_pci_dev(pn->dev);
 			break;
 		}
 	}
-out:
-	list_for_each_entry_safe(node, tmp, &device_list, node)
-		kfree(node);
 
-	return pdev;
+	mutex_unlock(&adev->physical_node_lock);
+
+	return pci_dev;
 }
 EXPORT_SYMBOL_GPL(acpi_get_pci_dev);
 
-- 
cgit v1.2.3


From b780d1671cf933caa3f67160f73261f10750f1a9 Mon Sep 17 00:00:00 2001
From: Yauheni Kaliuta <ykaliuta@redhat.com>
Date: Tue, 20 Sep 2022 19:14:09 +0300
Subject: selftests/bpf: Add liburandom_read.so to TEST_GEN_FILES

Added urandom_read shared lib is missing from the list of installed
files what makes urandom_read test after `make install` or `make
gen_tar` broken.

Add the library to TEST_GEN_FILES. The names in the list do not
contain $(OUTPUT) since it's added by lib.mk code.

Fixes: 00a0fa2d7d49 ("selftests/bpf: Add urandom_read shared lib and USDTs")
Signed-off-by: Yauheni Kaliuta <ykaliuta@redhat.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220920161409.129953-1-ykaliuta@redhat.com
---
 tools/testing/selftests/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 5898d3828b82..e6cf21fad69f 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -86,6 +86,7 @@ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	xskxceiver xdp_redirect_multi xdp_synproxy veristat
 
 TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/sign-file
+TEST_GEN_FILES += liburandom_read.so
 
 # Emit succinct information message describing current building step
 # $1 - generic step name (e.g., CC, LINK, etc);
-- 
cgit v1.2.3


From f5eb23b91c41a7ffc7ca7fe14f3c512360f02937 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 21 Sep 2022 15:00:34 +0800
Subject: selftests/bpf: Destroy the skeleton when CONFIG_PREEMPT is off

Destroy the created skeleton when CONFIG_PREEMPT is off, else will be
resource leak.

Fixes: 73b97bc78b32 ("selftests/bpf: Test concurrent updates on bpf_task_storage_busy")
Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20220921070035.2016413-2-houtao@huaweicloud.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/testing/selftests/bpf/map_tests/task_storage_map.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/map_tests/task_storage_map.c b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
index aac08c85240b..7d050364efca 100644
--- a/tools/testing/selftests/bpf/map_tests/task_storage_map.c
+++ b/tools/testing/selftests/bpf/map_tests/task_storage_map.c
@@ -79,6 +79,7 @@ void test_task_storage_map_stress_lookup(void)
 	/* Only for a fully preemptible kernel */
 	if (!skel->kconfig->CONFIG_PREEMPT) {
 		printf("%s SKIP (no CONFIG_PREEMPT)\n", __func__);
+		read_bpf_task_storage_busy__destroy(skel);
 		skips++;
 		return;
 	}
-- 
cgit v1.2.3


From 103d002fb7d548fb1187e350f2b73788558128b9 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Wed, 21 Sep 2022 15:00:35 +0800
Subject: selftests/bpf: Free the allocated resources after test case succeeds

Free the created fd or allocated bpf_object after test case succeeds,
else there will be resource leaks.

Spotted by using address sanitizer and checking the content of
/proc/$pid/fd directory.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Link: https://lore.kernel.org/r/20220921070035.2016413-3-houtao@huaweicloud.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 .../selftests/bpf/map_tests/array_map_batch_ops.c  |  2 ++
 .../selftests/bpf/map_tests/htab_map_batch_ops.c   |  2 ++
 .../bpf/map_tests/lpm_trie_map_batch_ops.c         |  2 ++
 tools/testing/selftests/bpf/test_maps.c            | 24 ++++++++++++++--------
 4 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
index 78c76496b14a..b595556315bc 100644
--- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
+#include <unistd.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
@@ -137,6 +138,7 @@ static void __test_map_lookup_and_update_batch(bool is_pcpu)
 	free(keys);
 	free(values);
 	free(visited);
+	close(map_fd);
 }
 
 static void array_map_batch_ops(void)
diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
index f807d53fd8dd..1230ccf90128 100644
--- a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 #include <errno.h>
 #include <string.h>
+#include <unistd.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
@@ -255,6 +256,7 @@ void __test_map_lookup_and_delete_batch(bool is_pcpu)
 	free(visited);
 	if (!is_pcpu)
 		free(values);
+	close(map_fd);
 }
 
 void htab_map_batch_ops(void)
diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
index 87d07b596e17..b66d56ddb7ef 100644
--- a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
+++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c
@@ -7,6 +7,7 @@
 #include <errno.h>
 #include <string.h>
 #include <stdlib.h>
+#include <unistd.h>
 
 #include <bpf/bpf.h>
 #include <bpf/libbpf.h>
@@ -150,4 +151,5 @@ void test_lpm_trie_map_batch_ops(void)
 	free(keys);
 	free(values);
 	free(visited);
+	close(map_fd);
 }
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
index 289ff310e283..b73152822aa2 100644
--- a/tools/testing/selftests/bpf/test_maps.c
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -659,13 +659,13 @@ static void test_sockmap(unsigned int tasks, void *data)
 {
 	struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break;
 	int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break;
+	struct bpf_object *parse_obj, *verdict_obj, *msg_obj;
 	int ports[] = {50200, 50201, 50202, 50204};
 	int err, i, fd, udp, sfd[6] = {0xdeadbeef};
 	u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0};
 	int parse_prog, verdict_prog, msg_prog;
 	struct sockaddr_in addr;
 	int one = 1, s, sc, rc;
-	struct bpf_object *obj;
 	struct timeval to;
 	__u32 key, value;
 	pid_t pid[tasks];
@@ -761,6 +761,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 		       i, udp);
 		goto out_sockmap;
 	}
+	close(udp);
 
 	/* Test update without programs */
 	for (i = 0; i < 6; i++) {
@@ -823,27 +824,27 @@ static void test_sockmap(unsigned int tasks, void *data)
 
 	/* Load SK_SKB program and Attach */
 	err = bpf_prog_test_load(SOCKMAP_PARSE_PROG,
-			    BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog);
+			    BPF_PROG_TYPE_SK_SKB, &parse_obj, &parse_prog);
 	if (err) {
 		printf("Failed to load SK_SKB parse prog\n");
 		goto out_sockmap;
 	}
 
 	err = bpf_prog_test_load(SOCKMAP_TCP_MSG_PROG,
-			    BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog);
+			    BPF_PROG_TYPE_SK_MSG, &msg_obj, &msg_prog);
 	if (err) {
 		printf("Failed to load SK_SKB msg prog\n");
 		goto out_sockmap;
 	}
 
 	err = bpf_prog_test_load(SOCKMAP_VERDICT_PROG,
-			    BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog);
+			    BPF_PROG_TYPE_SK_SKB, &verdict_obj, &verdict_prog);
 	if (err) {
 		printf("Failed to load SK_SKB verdict prog\n");
 		goto out_sockmap;
 	}
 
-	bpf_map_rx = bpf_object__find_map_by_name(obj, "sock_map_rx");
+	bpf_map_rx = bpf_object__find_map_by_name(verdict_obj, "sock_map_rx");
 	if (!bpf_map_rx) {
 		printf("Failed to load map rx from verdict prog\n");
 		goto out_sockmap;
@@ -855,7 +856,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 		goto out_sockmap;
 	}
 
-	bpf_map_tx = bpf_object__find_map_by_name(obj, "sock_map_tx");
+	bpf_map_tx = bpf_object__find_map_by_name(verdict_obj, "sock_map_tx");
 	if (!bpf_map_tx) {
 		printf("Failed to load map tx from verdict prog\n");
 		goto out_sockmap;
@@ -867,7 +868,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 		goto out_sockmap;
 	}
 
-	bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg");
+	bpf_map_msg = bpf_object__find_map_by_name(verdict_obj, "sock_map_msg");
 	if (!bpf_map_msg) {
 		printf("Failed to load map msg from msg_verdict prog\n");
 		goto out_sockmap;
@@ -879,7 +880,7 @@ static void test_sockmap(unsigned int tasks, void *data)
 		goto out_sockmap;
 	}
 
-	bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break");
+	bpf_map_break = bpf_object__find_map_by_name(verdict_obj, "sock_map_break");
 	if (!bpf_map_break) {
 		printf("Failed to load map tx from verdict prog\n");
 		goto out_sockmap;
@@ -1125,7 +1126,9 @@ static void test_sockmap(unsigned int tasks, void *data)
 	}
 	close(fd);
 	close(map_fd_rx);
-	bpf_object__close(obj);
+	bpf_object__close(parse_obj);
+	bpf_object__close(msg_obj);
+	bpf_object__close(verdict_obj);
 	return;
 out:
 	for (i = 0; i < 6; i++)
@@ -1283,8 +1286,11 @@ static void test_map_in_map(void)
 			printf("Inner map mim.inner was not destroyed\n");
 			goto out_map_in_map;
 		}
+
+		close(fd);
 	}
 
+	bpf_object__close(obj);
 	return;
 
 out_map_in_map:
-- 
cgit v1.2.3


From 1ae9470c3e14624b0f4d8741c22b5a94062c0e33 Mon Sep 17 00:00:00 2001
From: Jingbo Xu <jefflexu@linux.alibaba.com>
Date: Thu, 22 Sep 2022 14:24:14 +0800
Subject: erofs: clean up .read_folio() and .readahead() in fscache mode

The implementation of these two functions in fscache mode is almost the
same. Extract the same part as a generic helper to remove the code
duplication.

Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Reviewed-by: Jia Zhu <zhujia.zj@bytedance.com>
Link: https://lore.kernel.org/r/20220922062414.20437-1-jefflexu@linux.alibaba.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/fscache.c | 213 +++++++++++++++++++++--------------------------------
 1 file changed, 83 insertions(+), 130 deletions(-)

diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index bc3556e77b93..998cd26a1b3b 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -240,113 +240,111 @@ out:
 	return ret;
 }
 
-static int erofs_fscache_read_folio_inline(struct folio *folio,
-					 struct erofs_map_blocks *map)
-{
-	struct super_block *sb = folio_mapping(folio)->host->i_sb;
-	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
-	erofs_blk_t blknr;
-	size_t offset, len;
-	void *src, *dst;
-
-	/* For tail packing layout, the offset may be non-zero. */
-	offset = erofs_blkoff(map->m_pa);
-	blknr = erofs_blknr(map->m_pa);
-	len = map->m_llen;
-
-	src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
-	if (IS_ERR(src))
-		return PTR_ERR(src);
-
-	dst = kmap_local_folio(folio, 0);
-	memcpy(dst, src + offset, len);
-	memset(dst + len, 0, PAGE_SIZE - len);
-	kunmap_local(dst);
-
-	erofs_put_metabuf(&buf);
-	return 0;
-}
-
-static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+/*
+ * Read into page cache in the range described by (@pos, @len).
+ *
+ * On return, the caller is responsible for page unlocking if the output @unlock
+ * is true, or the callee will take this responsibility through netfs_io_request
+ * interface.
+ *
+ * The return value is the number of bytes successfully handled, or negative
+ * error code on failure. The only exception is that, the length of the range
+ * instead of the error code is returned on failure after netfs_io_request is
+ * allocated, so that .readahead() could advance rac accordingly.
+ */
+static int erofs_fscache_data_read(struct address_space *mapping,
+				   loff_t pos, size_t len, bool *unlock)
 {
-	struct inode *inode = folio_mapping(folio)->host;
+	struct inode *inode = mapping->host;
 	struct super_block *sb = inode->i_sb;
+	struct netfs_io_request *rreq;
 	struct erofs_map_blocks map;
 	struct erofs_map_dev mdev;
-	struct netfs_io_request *rreq;
-	erofs_off_t pos;
-	loff_t pstart;
+	struct iov_iter iter;
+	size_t count;
 	int ret;
 
-	DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+	*unlock = true;
 
-	pos = folio_pos(folio);
 	map.m_la = pos;
-
 	ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
-	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
-		folio_zero_range(folio, 0, folio_size(folio));
-		goto out_uptodate;
+	if (map.m_flags & EROFS_MAP_META) {
+		struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+		erofs_blk_t blknr;
+		size_t offset, size;
+		void *src;
+
+		/* For tail packing layout, the offset may be non-zero. */
+		offset = erofs_blkoff(map.m_pa);
+		blknr = erofs_blknr(map.m_pa);
+		size = map.m_llen;
+
+		src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+		if (IS_ERR(src))
+			return PTR_ERR(src);
+
+		iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE);
+		if (copy_to_iter(src + offset, size, &iter) != size)
+			return -EFAULT;
+		iov_iter_zero(PAGE_SIZE - size, &iter);
+		erofs_put_metabuf(&buf);
+		return PAGE_SIZE;
 	}
 
-	if (map.m_flags & EROFS_MAP_META) {
-		ret = erofs_fscache_read_folio_inline(folio, &map);
-		goto out_uptodate;
+	count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
+	DBG_BUGON(!count || count % PAGE_SIZE);
+
+	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+		iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
+		iov_iter_zero(count, &iter);
+		return count;
 	}
 
 	mdev = (struct erofs_map_dev) {
 		.m_deviceid = map.m_deviceid,
 		.m_pa = map.m_pa,
 	};
-
 	ret = erofs_map_dev(sb, &mdev);
 	if (ret)
-		goto out_unlock;
-
-
-	rreq = erofs_fscache_alloc_request(folio_mapping(folio),
-				folio_pos(folio), folio_size(folio));
-	if (IS_ERR(rreq)) {
-		ret = PTR_ERR(rreq);
-		goto out_unlock;
-	}
+		return ret;
 
-	pstart = mdev.m_pa + (pos - map.m_la);
-	return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
-				rreq, pstart);
+	rreq = erofs_fscache_alloc_request(mapping, pos, count);
+	if (IS_ERR(rreq))
+		return PTR_ERR(rreq);
 
-out_uptodate:
-	if (!ret)
-		folio_mark_uptodate(folio);
-out_unlock:
-	folio_unlock(folio);
-	return ret;
+	*unlock = false;
+	erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+			rreq, mdev.m_pa + (pos - map.m_la));
+	return count;
 }
 
-static void erofs_fscache_advance_folios(struct readahead_control *rac,
-					 size_t len, bool unlock)
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
 {
-	while (len) {
-		struct folio *folio = readahead_folio(rac);
-		len -= folio_size(folio);
-		if (unlock) {
+	bool unlock;
+	int ret;
+
+	DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+
+	ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
+				      folio_size(folio), &unlock);
+	if (unlock) {
+		if (ret > 0)
 			folio_mark_uptodate(folio);
-			folio_unlock(folio);
-		}
+		folio_unlock(folio);
 	}
+	return ret < 0 ? ret : 0;
 }
 
 static void erofs_fscache_readahead(struct readahead_control *rac)
 {
-	struct inode *inode = rac->mapping->host;
-	struct super_block *sb = inode->i_sb;
-	size_t len, count, done = 0;
-	erofs_off_t pos;
-	loff_t start, offset;
-	int ret;
+	struct folio *folio;
+	size_t len, done = 0;
+	loff_t start, pos;
+	bool unlock;
+	int ret, size;
 
 	if (!readahead_count(rac))
 		return;
@@ -355,67 +353,22 @@ static void erofs_fscache_readahead(struct readahead_control *rac)
 	len = readahead_length(rac);
 
 	do {
-		struct erofs_map_blocks map;
-		struct erofs_map_dev mdev;
-		struct netfs_io_request *rreq;
-
 		pos = start + done;
-		map.m_la = pos;
-
-		ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
-		if (ret)
+		ret = erofs_fscache_data_read(rac->mapping, pos,
+					      len - done, &unlock);
+		if (ret <= 0)
 			return;
 
-		offset = start + done;
-		count = min_t(size_t, map.m_llen - (pos - map.m_la),
-			      len - done);
-
-		if (!(map.m_flags & EROFS_MAP_MAPPED)) {
-			struct iov_iter iter;
-
-			iov_iter_xarray(&iter, READ, &rac->mapping->i_pages,
-					offset, count);
-			iov_iter_zero(count, &iter);
-
-			erofs_fscache_advance_folios(rac, count, true);
-			ret = count;
-			continue;
-		}
-
-		if (map.m_flags & EROFS_MAP_META) {
-			struct folio *folio = readahead_folio(rac);
-
-			ret = erofs_fscache_read_folio_inline(folio, &map);
-			if (!ret) {
+		size = ret;
+		while (size) {
+			folio = readahead_folio(rac);
+			size -= folio_size(folio);
+			if (unlock) {
 				folio_mark_uptodate(folio);
-				ret = folio_size(folio);
+				folio_unlock(folio);
 			}
-
-			folio_unlock(folio);
-			continue;
 		}
-
-		mdev = (struct erofs_map_dev) {
-			.m_deviceid = map.m_deviceid,
-			.m_pa = map.m_pa,
-		};
-		ret = erofs_map_dev(sb, &mdev);
-		if (ret)
-			return;
-
-		rreq = erofs_fscache_alloc_request(rac->mapping, offset, count);
-		if (IS_ERR(rreq))
-			return;
-		/*
-		 * Drop the ref of folios here. Unlock them in
-		 * rreq_unlock_folios() when rreq complete.
-		 */
-		erofs_fscache_advance_folios(rac, count, false);
-		ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
-					rreq, mdev.m_pa + (pos - map.m_la));
-		if (!ret)
-			ret = count;
-	} while (ret > 0 && ((done += ret) < len));
+	} while ((done += ret) < len);
 }
 
 static const struct address_space_operations erofs_fscache_meta_aops = {
-- 
cgit v1.2.3


From fdffc091e6f94602558bba712b51bc16f79fd6d5 Mon Sep 17 00:00:00 2001
From: Yue Hu <huyue2@coolpad.com>
Date: Fri, 23 Sep 2022 10:11:21 +0800
Subject: erofs: support interlaced uncompressed data for compressed files

Currently, uncompressed data is all handled in the shifted way, which
means we have to shift the whole on-disk plain pcluster to get the
logical data.   However, since we are also using in-place I/O for
uncompressed data, data copy will be reduced a lot if pcluster is
recorded in the interlaced way as illustrated below:
 _______________________________________________________________
|               |    |               |_ tail part |_ head part _|
|<-   blk0    ->| .. |<-   blkn-2  ->|<-         blkn-1       ->|

The logical data then becomes:
 ________________________________________________________
|_ head part _|_  blk0  _| .. |_  blkn-2  _|_ tail part _|

In addition, non-4k plain pclusters are also survived by the
interlaced way, which can be used for non-4k lclusters as well.

However, it's almost impossible to de-duplicate uncompressed data
in the interlaced way, therefore shifted uncompressed data is still
useful.

Signed-off-by: Yue Hu <huyue2@coolpad.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/8369112678604fdf4ef796626d59b1fdd0745a53.1663898962.git.huyue2@coolpad.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/decompressor.c | 47 ++++++++++++++++++++++++++++-------------------
 fs/erofs/erofs_fs.h     |  2 ++
 fs/erofs/internal.h     |  1 +
 fs/erofs/zmap.c         | 14 ++++++++++----
 4 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2d55569f96ac..51b7ac7166d9 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -317,52 +317,61 @@ dstmap_out:
 	return ret;
 }
 
-static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq,
-				     struct page **pagepool)
+static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+				   struct page **pagepool)
 {
-	const unsigned int nrpages_out =
+	const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+	const unsigned int outpages =
 		PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
 	const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
 					     PAGE_SIZE - rq->pageofs_out);
 	const unsigned int lefthalf = rq->outputsize - righthalf;
+	const unsigned int interlaced_offset =
+		rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
 	unsigned char *src, *dst;
 
-	if (nrpages_out > 2) {
+	if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
 		DBG_BUGON(1);
-		return -EIO;
+		return -EFSCORRUPTED;
 	}
 
 	if (rq->out[0] == *rq->in) {
-		DBG_BUGON(nrpages_out != 1);
+		DBG_BUGON(rq->pageofs_out);
 		return 0;
 	}
 
-	src = kmap_atomic(*rq->in) + rq->pageofs_in;
+	src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
 	if (rq->out[0]) {
-		dst = kmap_atomic(rq->out[0]);
-		memcpy(dst + rq->pageofs_out, src, righthalf);
-		kunmap_atomic(dst);
+		dst = kmap_local_page(rq->out[0]);
+		memcpy(dst + rq->pageofs_out, src + interlaced_offset,
+		       righthalf);
+		kunmap_local(dst);
 	}
 
-	if (nrpages_out == 2) {
-		DBG_BUGON(!rq->out[1]);
-		if (rq->out[1] == *rq->in) {
+	if (outpages > inpages) {
+		DBG_BUGON(!rq->out[outpages - 1]);
+		if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
+			dst = kmap_local_page(rq->out[outpages - 1]);
+			memcpy(dst, interlaced_offset ? src :
+					(src + righthalf), lefthalf);
+			kunmap_local(dst);
+		} else if (!interlaced_offset) {
 			memmove(src, src + righthalf, lefthalf);
-		} else {
-			dst = kmap_atomic(rq->out[1]);
-			memcpy(dst, src + righthalf, lefthalf);
-			kunmap_atomic(dst);
 		}
 	}
-	kunmap_atomic(src);
+	kunmap_local(src);
 	return 0;
 }
 
 static struct z_erofs_decompressor decompressors[] = {
 	[Z_EROFS_COMPRESSION_SHIFTED] = {
-		.decompress = z_erofs_shifted_transform,
+		.decompress = z_erofs_transform_plain,
 		.name = "shifted"
 	},
+	[Z_EROFS_COMPRESSION_INTERLACED] = {
+		.decompress = z_erofs_transform_plain,
+		.name = "interlaced"
+	},
 	[Z_EROFS_COMPRESSION_LZ4] = {
 		.decompress = z_erofs_lz4_decompress,
 		.name = "lz4"
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 2b48373f690b..5c1de6d7ad71 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -295,11 +295,13 @@ struct z_erofs_lzma_cfgs {
  * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
  * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
  * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
+ * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
  */
 #define Z_EROFS_ADVISE_COMPACTED_2B		0x0001
 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1		0x0002
 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2		0x0004
 #define Z_EROFS_ADVISE_INLINE_PCLUSTER		0x0008
+#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER	0x0010
 
 struct z_erofs_map_header {
 	__le16	h_reserved1;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index ef3f7982b92d..8dbfeb5f4f84 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -420,6 +420,7 @@ struct erofs_map_blocks {
 
 enum {
 	Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+	Z_EROFS_COMPRESSION_INTERLACED,
 	Z_EROFS_COMPRESSION_RUNTIME_MAX
 };
 
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index d58549ca1df9..7196235a441c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -679,12 +679,18 @@ static int z_erofs_do_map_blocks(struct inode *inode,
 			goto out;
 	}
 
-	if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN)
-		map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-	else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2)
+	if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+		if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+			map->m_algorithmformat =
+				Z_EROFS_COMPRESSION_INTERLACED;
+		else
+			map->m_algorithmformat =
+				Z_EROFS_COMPRESSION_SHIFTED;
+	} else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
 		map->m_algorithmformat = vi->z_algorithmtype[1];
-	else
+	} else {
 		map->m_algorithmformat = vi->z_algorithmtype[0];
+	}
 
 	if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
 	    ((flags & EROFS_GET_BLOCKS_READMORE) &&
-- 
cgit v1.2.3


From da73a94fa282f78d485bd0aab36c8ac15b6f792c Mon Sep 17 00:00:00 2001
From: Philippe Schenker <philippe.schenker@toradex.com>
Date: Thu, 22 Sep 2022 14:43:03 +0200
Subject: drm/bridge: lt8912b: add vsync hsync

Currently the bridge driver does not take care whether or not the display
needs positive/negative vertical/horizontal syncs. Pass these two flags
to the bridge from the EDID that was read out from the display.

Fixes: 30e2ae943c26 ("drm/bridge: Introduce LT8912B DSI to HDMI bridge")
Signed-off-by: Philippe Schenker <philippe.schenker@toradex.com>
Acked-by: Adrien Grassein <adrien.grassein@gmail.com>
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20220922124306.34729-2-dev@pschenker.ch
---
 drivers/gpu/drm/bridge/lontium-lt8912b.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/bridge/lontium-lt8912b.c b/drivers/gpu/drm/bridge/lontium-lt8912b.c
index 28bad30dc4e5..0fd3472e767c 100644
--- a/drivers/gpu/drm/bridge/lontium-lt8912b.c
+++ b/drivers/gpu/drm/bridge/lontium-lt8912b.c
@@ -268,7 +268,7 @@ static int lt8912_video_setup(struct lt8912 *lt)
 	u32 hactive, h_total, hpw, hfp, hbp;
 	u32 vactive, v_total, vpw, vfp, vbp;
 	u8 settle = 0x08;
-	int ret;
+	int ret, hsync_activehigh, vsync_activehigh;
 
 	if (!lt)
 		return -EINVAL;
@@ -278,12 +278,14 @@ static int lt8912_video_setup(struct lt8912 *lt)
 	hpw = lt->mode.hsync_len;
 	hbp = lt->mode.hback_porch;
 	h_total = hactive + hfp + hpw + hbp;
+	hsync_activehigh = lt->mode.flags & DISPLAY_FLAGS_HSYNC_HIGH;
 
 	vactive = lt->mode.vactive;
 	vfp = lt->mode.vfront_porch;
 	vpw = lt->mode.vsync_len;
 	vbp = lt->mode.vback_porch;
 	v_total = vactive + vfp + vpw + vbp;
+	vsync_activehigh = lt->mode.flags & DISPLAY_FLAGS_VSYNC_HIGH;
 
 	if (vactive <= 600)
 		settle = 0x04;
@@ -317,6 +319,11 @@ static int lt8912_video_setup(struct lt8912 *lt)
 	ret |= regmap_write(lt->regmap[I2C_CEC_DSI], 0x3e, hfp & 0xff);
 	ret |= regmap_write(lt->regmap[I2C_CEC_DSI], 0x3f, hfp >> 8);
 
+	ret |= regmap_update_bits(lt->regmap[I2C_MAIN], 0xab, BIT(0),
+				  vsync_activehigh ? BIT(0) : 0);
+	ret |= regmap_update_bits(lt->regmap[I2C_MAIN], 0xab, BIT(1),
+				  hsync_activehigh ? BIT(1) : 0);
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6dd1de12e1243f2013e4fabf31e99e63b1a860d0 Mon Sep 17 00:00:00 2001
From: Philippe Schenker <philippe.schenker@toradex.com>
Date: Thu, 22 Sep 2022 14:43:04 +0200
Subject: drm/bridge: lt8912b: set hdmi or dvi mode

The Lontium LT8912 does have a setting for DVI or HDMI. This patch reads
from EDID what the display needs and sets it accordingly.

Fixes: 30e2ae943c26 ("drm/bridge: Introduce LT8912B DSI to HDMI bridge")
Signed-off-by: Philippe Schenker <philippe.schenker@toradex.com>
Acked-by: Adrien Grassein <adrien.grassein@gmail.com>
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20220922124306.34729-3-dev@pschenker.ch
---
 drivers/gpu/drm/bridge/lontium-lt8912b.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/bridge/lontium-lt8912b.c b/drivers/gpu/drm/bridge/lontium-lt8912b.c
index 0fd3472e767c..6a4bb7422176 100644
--- a/drivers/gpu/drm/bridge/lontium-lt8912b.c
+++ b/drivers/gpu/drm/bridge/lontium-lt8912b.c
@@ -323,6 +323,8 @@ static int lt8912_video_setup(struct lt8912 *lt)
 				  vsync_activehigh ? BIT(0) : 0);
 	ret |= regmap_update_bits(lt->regmap[I2C_MAIN], 0xab, BIT(1),
 				  hsync_activehigh ? BIT(1) : 0);
+	ret |= regmap_update_bits(lt->regmap[I2C_MAIN], 0xb2, BIT(0),
+				  lt->connector.display_info.is_hdmi ? BIT(0) : 0);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 051ad2788d35ca07aec8402542e5d38429f2426a Mon Sep 17 00:00:00 2001
From: Francesco Dolcini <francesco.dolcini@toradex.com>
Date: Thu, 22 Sep 2022 14:43:05 +0200
Subject: drm/bridge: lt8912b: fix corrupted image output

Correct I2C address for the register list in lt8912_write_lvds_config(),
these registers are on the first I2C address (0x48), the current
function is just writing garbage to the wrong registers and this creates
multiple issues (artifacts and output completely corrupted) on some HDMI
displays.

Correct I2C address comes from Lontium documentation and it is the one
used on other out-of-tree LT8912B drivers [1].

[1] https://github.com/boundarydevices/linux/blob/boundary-imx_5.10.x_2.0.0/drivers/video/lt8912.c#L296

Fixes: 30e2ae943c26 ("drm/bridge: Introduce LT8912B DSI to HDMI bridge")
Signed-off-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Signed-off-by: Philippe Schenker <philippe.schenker@toradex.com>
Acked-by: Adrien Grassein <adrien.grassein@gmail.com>
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20220922124306.34729-4-dev@pschenker.ch
---
 drivers/gpu/drm/bridge/lontium-lt8912b.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/bridge/lontium-lt8912b.c b/drivers/gpu/drm/bridge/lontium-lt8912b.c
index 6a4bb7422176..5968f4af190b 100644
--- a/drivers/gpu/drm/bridge/lontium-lt8912b.c
+++ b/drivers/gpu/drm/bridge/lontium-lt8912b.c
@@ -188,7 +188,7 @@ static int lt8912_write_lvds_config(struct lt8912 *lt)
 		{0x03, 0xff},
 	};
 
-	return regmap_multi_reg_write(lt->regmap[I2C_CEC_DSI], seq, ARRAY_SIZE(seq));
+	return regmap_multi_reg_write(lt->regmap[I2C_MAIN], seq, ARRAY_SIZE(seq));
 };
 
 static inline struct lt8912 *bridge_to_lt8912(struct drm_bridge *b)
-- 
cgit v1.2.3


From fea62d370d7a1ba288d71d0cae7ad47c2a02b839 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:22 +0000
Subject: x86/resctrl: Allow per-rmid arch private storage to be reset

To abstract the rmid counters into a helper that returns the number
of bytes counted, architecture specific per-rmid state is needed.

It needs to be possible to reset this hidden state, as the values
may outlive the life of an rmid, or the mount time of the filesystem.

mon_event_read() is called with first = true when an rmid is first
allocated in mkdir_mondata_subdir(). Add resctrl_arch_reset_rmid()
and call it from __mon_event_count()'s rr->first check.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-15-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/internal.h | 18 +++++------------
 arch/x86/kernel/cpu/resctrl/monitor.c  | 35 +++++++++++++++++++++++++++++++++-
 include/linux/resctrl.h                | 23 ++++++++++++++++++++++
 3 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 4de8e5bb93e1..b34a1403f033 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -22,14 +22,6 @@
 
 #define L2_QOS_CDP_ENABLE		0x01ULL
 
-/*
- * Event IDs are used to program IA32_QM_EVTSEL before reading event
- * counter from IA32_QM_CTR
- */
-#define QOS_L3_OCCUP_EVENT_ID		0x01
-#define QOS_L3_MBM_TOTAL_EVENT_ID	0x02
-#define QOS_L3_MBM_LOCAL_EVENT_ID	0x03
-
 #define CQM_LIMBOCHECK_INTERVAL	1000
 
 #define MBM_CNTR_WIDTH_BASE		24
@@ -73,7 +65,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  * @list:		entry in &rdt_resource->evt_list
  */
 struct mon_evt {
-	u32			evtid;
+	enum resctrl_event_id	evtid;
 	char			*name;
 	struct list_head	list;
 };
@@ -90,9 +82,9 @@ struct mon_evt {
 union mon_data_bits {
 	void *priv;
 	struct {
-		unsigned int rid	: 10;
-		unsigned int evtid	: 8;
-		unsigned int domid	: 14;
+		unsigned int rid		: 10;
+		enum resctrl_event_id evtid	: 8;
+		unsigned int domid		: 14;
 	} u;
 };
 
@@ -100,7 +92,7 @@ struct rmid_read {
 	struct rdtgroup		*rgrp;
 	struct rdt_resource	*r;
 	struct rdt_domain	*d;
-	int			evtid;
+	enum resctrl_event_id	evtid;
 	bool			first;
 	u64			val;
 };
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 2d81b6cd9632..e9755143492b 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -137,7 +137,37 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
 	return entry;
 }
 
-static u64 __rmid_read(u32 rmid, u32 eventid)
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+						 u32 rmid,
+						 enum resctrl_event_id eventid)
+{
+	switch (eventid) {
+	case QOS_L3_OCCUP_EVENT_ID:
+		return NULL;
+	case QOS_L3_MBM_TOTAL_EVENT_ID:
+		return &hw_dom->arch_mbm_total[rmid];
+	case QOS_L3_MBM_LOCAL_EVENT_ID:
+		return &hw_dom->arch_mbm_local[rmid];
+	}
+
+	/* Never expect to get here */
+	WARN_ON_ONCE(1);
+
+	return NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+			     u32 rmid, enum resctrl_event_id eventid)
+{
+	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	struct arch_mbm_state *am;
+
+	am = get_arch_mbm_state(hw_dom, rmid, eventid);
+	if (am)
+		memset(am, 0, sizeof(*am));
+}
+
+static u64 __rmid_read(u32 rmid, enum resctrl_event_id eventid)
 {
 	u64 val;
 
@@ -291,6 +321,9 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
 	struct mbm_state *m;
 	u64 chunks, tval;
 
+	if (rr->first)
+		resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
+
 	tval = __rmid_read(rmid, rr->evtid);
 	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
 		return tval;
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index f4c9101df461..818456770176 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -32,6 +32,16 @@ enum resctrl_conf_type {
 
 #define CDP_NUM_TYPES	(CDP_DATA + 1)
 
+/*
+ * Event IDs, the values match those used to program IA32_QM_EVTSEL before
+ * reading IA32_QM_CTR on RDT systems.
+ */
+enum resctrl_event_id {
+	QOS_L3_OCCUP_EVENT_ID		= 0x01,
+	QOS_L3_MBM_TOTAL_EVENT_ID	= 0x02,
+	QOS_L3_MBM_LOCAL_EVENT_ID	= 0x03,
+};
+
 /**
  * struct resctrl_staged_config - parsed configuration to be applied
  * @new_ctrl:		new ctrl value to be loaded
@@ -210,4 +220,17 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
 void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
 
+/**
+ * resctrl_arch_reset_rmid() - Reset any private state associated with rmid
+ *			       and eventid.
+ * @r:		The domain's resource.
+ * @d:		The rmid's domain.
+ * @rmid:	The rmid whose counter values should be reset.
+ * @eventid:	The eventid whose counter values should be reset.
+ *
+ * This can be called from any CPU.
+ */
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+			     u32 rmid, enum resctrl_event_id eventid);
+
 #endif /* _RESCTRL_H */
-- 
cgit v1.2.3


From 712f210a457d9c32414df246a72781550bc23ef6 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 21 Sep 2022 20:10:10 -0700
Subject: x86/microcode/AMD: Track patch allocation size explicitly

In preparation for reducing the use of ksize(), record the actual
allocation size for later memcpy(). This avoids copying extra
(uninitialized!) bytes into the patch buffer when the requested
allocation size isn't exactly the size of a kmalloc bucket.
Additionally, fix potential future issues where runtime bounds checking
will notice that the buffer was allocated to a smaller value than
returned by ksize().

Fixes: 757885e94a22 ("x86, microcode, amd: Early microcode patch loading support for AMD")
Suggested-by: Daniel Micay <danielmicay@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/lkml/CA+DvKQ+bp7Y7gmaVhacjv9uF6Ar-o4tet872h4Q8RPYPJjcJQA@mail.gmail.com/
---
 arch/x86/include/asm/microcode.h    | 1 +
 arch/x86/kernel/cpu/microcode/amd.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 7f7800e15ed0..74ecc2bd6cd0 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,6 +9,7 @@
 struct ucode_patch {
 	struct list_head plist;
 	void *data;		/* Intel uses only this one */
+	unsigned int size;
 	u32 patch_id;
 	u16 equiv_cpu;
 };
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 5f38dd75cbc5..e7410e98fc1f 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -788,6 +788,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
 		kfree(patch);
 		return -EINVAL;
 	}
+	patch->size = *patch_size;
 
 	mc_hdr      = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
 	proc_id     = mc_hdr->processor_rev_id;
@@ -869,7 +870,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
 		return ret;
 
 	memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
-	memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+	memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
 
 	return ret;
 }
-- 
cgit v1.2.3


From 4d044c521a63b2cd394ea6e3547012032145e47e Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:23 +0000
Subject: x86/resctrl: Abstract __rmid_read()

__rmid_read() selects the specified eventid and returns the counter
value from the MSR. The error handling is architecture specific, and
handled by the callers, rdtgroup_mondata_show() and __mon_event_count().

Error handling should be handled by architecture specific code, as
a different architecture may have different requirements. MPAM's
counters can report that they are 'not ready', requiring a second
read after a short delay. This should be hidden from resctrl.

Make __rmid_read() the architecture specific function for reading
a counter. Rename it resctrl_arch_rmid_read() and move the error
handling into it.

A read from a counter that hardware supports but resctrl does not
now returns -EINVAL instead of -EIO from the default case in
__mon_event_count(). It isn't possible for user-space to see this
change as resctrl doesn't expose counters it doesn't support.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-16-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/ctrlmondata.c |  4 +-
 arch/x86/kernel/cpu/resctrl/internal.h    |  1 +
 arch/x86/kernel/cpu/resctrl/monitor.c     | 62 +++++++++++++++++++------------
 include/linux/resctrl.h                   |  1 +
 4 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 0ab92320de71..42a1abb378f0 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -579,9 +579,9 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 
 	mon_event_read(&rr, r, d, rdtgrp, evtid, false);
 
-	if (rr.val & RMID_VAL_ERROR)
+	if (rr.err == -EIO)
 		seq_puts(m, "Error\n");
-	else if (rr.val & RMID_VAL_UNAVAIL)
+	else if (rr.err == -EINVAL)
 		seq_puts(m, "Unavailable\n");
 	else
 		seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index b34a1403f033..1d2e7bd6305f 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -94,6 +94,7 @@ struct rmid_read {
 	struct rdt_domain	*d;
 	enum resctrl_event_id	evtid;
 	bool			first;
+	int			err;
 	u64			val;
 };
 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e9755143492b..51ab76f2dfbc 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -167,9 +167,9 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 		memset(am, 0, sizeof(*am));
 }
 
-static u64 __rmid_read(u32 rmid, enum resctrl_event_id eventid)
+int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 {
-	u64 val;
+	u64 msr_val;
 
 	/*
 	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -180,14 +180,24 @@ static u64 __rmid_read(u32 rmid, enum resctrl_event_id eventid)
 	 * are error bits.
 	 */
 	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
-	rdmsrl(MSR_IA32_QM_CTR, val);
+	rdmsrl(MSR_IA32_QM_CTR, msr_val);
 
-	return val;
+	if (msr_val & RMID_VAL_ERROR)
+		return -EIO;
+	if (msr_val & RMID_VAL_UNAVAIL)
+		return -EINVAL;
+
+	*val = msr_val;
+
+	return 0;
 }
 
 static bool rmid_dirty(struct rmid_entry *entry)
 {
-	u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+	u64 val = 0;
+
+	if (resctrl_arch_rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val))
+		return true;
 
 	return val >= resctrl_cqm_threshold;
 }
@@ -259,8 +269,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 {
 	struct rdt_resource *r;
 	struct rdt_domain *d;
-	int cpu;
-	u64 val;
+	int cpu, err;
+	u64 val = 0;
 
 	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
 
@@ -268,8 +278,10 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 	cpu = get_cpu();
 	list_for_each_entry(d, &r->domains, list) {
 		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
-			val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
-			if (val <= resctrl_cqm_threshold)
+			err = resctrl_arch_rmid_read(entry->rmid,
+						     QOS_L3_OCCUP_EVENT_ID,
+						     &val);
+			if (err || val <= resctrl_cqm_threshold)
 				continue;
 		}
 
@@ -315,19 +327,19 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
 	return chunks >> shift;
 }
 
-static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
 	struct mbm_state *m;
-	u64 chunks, tval;
+	u64 chunks, tval = 0;
 
 	if (rr->first)
 		resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
 
-	tval = __rmid_read(rmid, rr->evtid);
-	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
-		return tval;
-	}
+	rr->err = resctrl_arch_rmid_read(rmid, rr->evtid, &tval);
+	if (rr->err)
+		return rr->err;
+
 	switch (rr->evtid) {
 	case QOS_L3_OCCUP_EVENT_ID:
 		rr->val += tval;
@@ -341,9 +353,9 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
 	default:
 		/*
 		 * Code would never reach here because an invalid
-		 * event id would fail the __rmid_read.
+		 * event id would fail in resctrl_arch_rmid_read().
 		 */
-		return RMID_VAL_ERROR;
+		return -EINVAL;
 	}
 
 	if (rr->first) {
@@ -399,11 +411,11 @@ void mon_event_count(void *info)
 	struct rdtgroup *rdtgrp, *entry;
 	struct rmid_read *rr = info;
 	struct list_head *head;
-	u64 ret_val;
+	int ret;
 
 	rdtgrp = rr->rgrp;
 
-	ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
+	ret = __mon_event_count(rdtgrp->mon.rmid, rr);
 
 	/*
 	 * For Ctrl groups read data from child monitor groups and
@@ -415,13 +427,17 @@ void mon_event_count(void *info)
 	if (rdtgrp->type == RDTCTRL_GROUP) {
 		list_for_each_entry(entry, head, mon.crdtgrp_list) {
 			if (__mon_event_count(entry->mon.rmid, rr) == 0)
-				ret_val = 0;
+				ret = 0;
 		}
 	}
 
-	/* Report error if none of rmid_reads are successful */
-	if (ret_val)
-		rr->val = ret_val;
+	/*
+	 * __mon_event_count() calls for newly created monitor groups may
+	 * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
+	 * Discard error if any of the monitor event reads succeeded.
+	 */
+	if (ret == 0)
+		rr->err = 0;
 }
 
 /*
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 818456770176..efe60dd7fd21 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -219,6 +219,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 			    u32 closid, enum resctrl_conf_type type);
 int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
 void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
+int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *res);
 
 /**
  * resctrl_arch_reset_rmid() - Reset any private state associated with rmid
-- 
cgit v1.2.3


From 8286618aca331bf17323ff3023ca831ac6e4b86f Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:24 +0000
Subject: x86/resctrl: Pass the required parameters into
 resctrl_arch_rmid_read()

resctrl_arch_rmid_read() is intended as the function that an
architecture agnostic resctrl filesystem driver can use to
read a value in bytes from a hardware register. Currently the function
returns the MBM values in chunks directly from hardware.

To convert this to bytes, some correction and overflow calculations
are needed. These depend on the resource and domain structures.
Overflow detection requires the old chunks value. None of this
is available to resctrl_arch_rmid_read(). MPAM requires the
resource and domain structures to find the MMIO device that holds
the registers.

Pass the resource and domain to resctrl_arch_rmid_read(). This makes
rmid_dirty() too big. Instead merge it with its only caller, and the
name is kept as a local variable.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-17-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/monitor.c | 31 +++++++++++++++++--------------
 include/linux/resctrl.h               | 18 +++++++++++++++++-
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 51ab76f2dfbc..262141bf4264 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -167,10 +167,14 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 		memset(am, 0, sizeof(*am));
 }
 
-int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+			   u32 rmid, enum resctrl_event_id eventid, u64 *val)
 {
 	u64 msr_val;
 
+	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+		return -EINVAL;
+
 	/*
 	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
 	 * with a valid event code for supported resource type and the bits
@@ -192,16 +196,6 @@ int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	return 0;
 }
 
-static bool rmid_dirty(struct rmid_entry *entry)
-{
-	u64 val = 0;
-
-	if (resctrl_arch_rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID, &val))
-		return true;
-
-	return val >= resctrl_cqm_threshold;
-}
-
 /*
  * Check the RMIDs that are marked as busy for this domain. If the
  * reported LLC occupancy is below the threshold clear the busy bit and
@@ -213,6 +207,8 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
 	struct rmid_entry *entry;
 	struct rdt_resource *r;
 	u32 crmid = 1, nrmid;
+	bool rmid_dirty;
+	u64 val = 0;
 
 	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
 
@@ -228,7 +224,14 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
 			break;
 
 		entry = __rmid_entry(nrmid);
-		if (force_free || !rmid_dirty(entry)) {
+
+		if (resctrl_arch_rmid_read(r, d, entry->rmid,
+					   QOS_L3_OCCUP_EVENT_ID, &val))
+			rmid_dirty = true;
+		else
+			rmid_dirty = (val >= resctrl_cqm_threshold);
+
+		if (force_free || !rmid_dirty) {
 			clear_bit(entry->rmid, d->rmid_busy_llc);
 			if (!--entry->busy) {
 				rmid_limbo_count--;
@@ -278,7 +281,7 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 	cpu = get_cpu();
 	list_for_each_entry(d, &r->domains, list) {
 		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
-			err = resctrl_arch_rmid_read(entry->rmid,
+			err = resctrl_arch_rmid_read(r, d, entry->rmid,
 						     QOS_L3_OCCUP_EVENT_ID,
 						     &val);
 			if (err || val <= resctrl_cqm_threshold)
@@ -336,7 +339,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 	if (rr->first)
 		resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
 
-	rr->err = resctrl_arch_rmid_read(rmid, rr->evtid, &tval);
+	rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
 	if (rr->err)
 		return rr->err;
 
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index efe60dd7fd21..7ccfa0d1bb34 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -219,7 +219,23 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
 			    u32 closid, enum resctrl_conf_type type);
 int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d);
 void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
-int resctrl_arch_rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *res);
+
+/**
+ * resctrl_arch_rmid_read() - Read the eventid counter corresponding to rmid
+ *			      for this resource and domain.
+ * @r:			resource that the counter should be read from.
+ * @d:			domain that the counter should be read from.
+ * @rmid:		rmid of the counter to read.
+ * @eventid:		eventid to read, e.g. L3 occupancy.
+ * @val:		result of the counter read in chunks.
+ *
+ * Call from process context on a CPU that belongs to domain @d.
+ *
+ * Return:
+ * 0 on success, or -EIO, -EINVAL etc on error.
+ */
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+			   u32 rmid, enum resctrl_event_id eventid, u64 *val);
 
 /**
  * resctrl_arch_reset_rmid() - Reset any private state associated with rmid
-- 
cgit v1.2.3


From 1d81d15db39c2b517bc58f63008c6255dd08aafe Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:25 +0000
Subject: x86/resctrl: Move mbm_overflow_count() into resctrl_arch_rmid_read()

resctrl_arch_rmid_read() is intended as the function that an
architecture agnostic resctrl filesystem driver can use to
read a value in bytes from a counter. Currently the function returns
the MBM values in chunks directly from hardware. When reading a bandwidth
counter, mbm_overflow_count() must be used to correct for any possible
overflow.

mbm_overflow_count() is architecture specific, its behaviour should
be part of resctrl_arch_rmid_read().

Move the mbm_overflow_count() calls into resctrl_arch_rmid_read().
This allows the resctrl filesystems's prev_msr to be removed in
favour of the architecture private version.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-18-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/internal.h |  2 --
 arch/x86/kernel/cpu/resctrl/monitor.c  | 35 +++++++++++++++++++---------------
 2 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1d2e7bd6305f..8039e8aba7de 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -281,7 +281,6 @@ struct rftype {
 /**
  * struct mbm_state - status for each MBM counter in each domain
  * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
- * @prev_msr:	Value of IA32_QM_CTR for this RMID last time we read it
  * @prev_bw_chunks: Previous chunks value read for bandwidth calculation
  * @prev_bw:	The most recent bandwidth in MBps
  * @delta_bw:	Difference between the current and previous bandwidth
@@ -289,7 +288,6 @@ struct rftype {
  */
 struct mbm_state {
 	u64	chunks;
-	u64	prev_msr;
 	u64	prev_bw_chunks;
 	u32	prev_bw;
 	u32	delta_bw;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 262141bf4264..862a4462ed60 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -167,9 +167,20 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 		memset(am, 0, sizeof(*am));
 }
 
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+{
+	u64 shift = 64 - width, chunks;
+
+	chunks = (cur_msr << shift) - (prev_msr << shift);
+	return chunks >> shift;
+}
+
 int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 			   u32 rmid, enum resctrl_event_id eventid, u64 *val)
 {
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	struct arch_mbm_state *am;
 	u64 msr_val;
 
 	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
@@ -191,7 +202,13 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 	if (msr_val & RMID_VAL_UNAVAIL)
 		return -EINVAL;
 
-	*val = msr_val;
+	am = get_arch_mbm_state(hw_dom, rmid, eventid);
+	if (am) {
+		*val = mbm_overflow_count(am->prev_msr, msr_val, hw_res->mbm_width);
+		am->prev_msr = msr_val;
+	} else {
+		*val = msr_val;
+	}
 
 	return 0;
 }
@@ -322,19 +339,10 @@ void free_rmid(u32 rmid)
 		list_add_tail(&entry->list, &rmid_free_lru);
 }
 
-static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
-{
-	u64 shift = 64 - width, chunks;
-
-	chunks = (cur_msr << shift) - (prev_msr << shift);
-	return chunks >> shift;
-}
-
 static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 {
-	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
 	struct mbm_state *m;
-	u64 chunks, tval = 0;
+	u64 tval = 0;
 
 	if (rr->first)
 		resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
@@ -363,13 +371,10 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 
 	if (rr->first) {
 		memset(m, 0, sizeof(struct mbm_state));
-		m->prev_msr = tval;
 		return 0;
 	}
 
-	chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width);
-	m->chunks += chunks;
-	m->prev_msr = tval;
+	m->chunks += tval;
 
 	rr->val += get_corrected_mbm_count(rmid, m->chunks);
 
-- 
cgit v1.2.3


From 38f72f50d6498ee60ac89deff3686e34ce0c2a32 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:26 +0000
Subject: x86/resctrl: Move get_corrected_mbm_count() into
 resctrl_arch_rmid_read()

resctrl_arch_rmid_read() is intended as the function that an
architecture agnostic resctrl filesystem driver can use to
read a value in bytes from a counter. Currently the function returns
the MBM values in chunks directly from hardware. When reading a bandwidth
counter, get_corrected_mbm_count() must be used to correct the
value read.

get_corrected_mbm_count() is architecture specific, this work should be
done in resctrl_arch_rmid_read().

Move the function calls. This allows the resctrl filesystems's chunks
value to be removed in favour of the architecture private version.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-19-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/internal.h | 4 ++--
 arch/x86/kernel/cpu/resctrl/monitor.c  | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 8039e8aba7de..bdb55c2fbdd3 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -280,14 +280,12 @@ struct rftype {
 
 /**
  * struct mbm_state - status for each MBM counter in each domain
- * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
  * @prev_bw_chunks: Previous chunks value read for bandwidth calculation
  * @prev_bw:	The most recent bandwidth in MBps
  * @delta_bw:	Difference between the current and previous bandwidth
  * @delta_comp:	Indicates whether to compute the delta_bw
  */
 struct mbm_state {
-	u64	chunks;
 	u64	prev_bw_chunks;
 	u32	prev_bw;
 	u32	delta_bw;
@@ -297,10 +295,12 @@ struct mbm_state {
 /**
  * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
  *			   return value.
+ * @chunks:	Total data moved (multiply by rdt_group.mon_scale to get bytes)
  * @prev_msr:	Value of IA32_QM_CTR last time it was read for the RMID used to
  *		find this struct.
  */
 struct arch_mbm_state {
+	u64	chunks;
 	u64	prev_msr;
 };
 
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 862a4462ed60..27bb4947a176 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -204,7 +204,9 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
-		*val = mbm_overflow_count(am->prev_msr, msr_val, hw_res->mbm_width);
+		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+						 hw_res->mbm_width);
+		*val = get_corrected_mbm_count(rmid, am->chunks);
 		am->prev_msr = msr_val;
 	} else {
 		*val = msr_val;
@@ -374,9 +376,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
 		return 0;
 	}
 
-	m->chunks += tval;
-
-	rr->val += get_corrected_mbm_count(rmid, m->chunks);
+	rr->val += tval;
 
 	return 0;
 }
-- 
cgit v1.2.3


From ae2328b52962531c2d7c6b531022a3eb2d680f17 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:27 +0000
Subject: x86/resctrl: Rename and change the units of resctrl_cqm_threshold

resctrl_cqm_threshold is stored in a hardware specific chunk size,
but exposed to user-space as bytes.

This means the filesystem parts of resctrl need to know how the hardware
counts, to convert the user provided byte value to chunks. The interface
between the architecture's resctrl code and the filesystem ought to
treat everything as bytes.

Change the unit of resctrl_cqm_threshold to bytes. resctrl_arch_rmid_read()
still returns its value in chunks, so this needs converting to bytes.
As all the users have been touched, rename the variable to
resctrl_rmid_realloc_threshold, which describes what the value is for.

Neither r->num_rmid nor hw_res->mon_scale are guaranteed to be a power
of 2, so the existing code introduces a rounding error from resctrl's
theoretical fraction of the cache usage. This behaviour is kept as it
ensures the user visible value matches the value read from hardware
when the rmid will be reallocated.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-20-james.morse@arm.com
---
 arch/x86/include/asm/resctrl.h         |  9 +++++++
 arch/x86/kernel/cpu/resctrl/internal.h |  1 -
 arch/x86/kernel/cpu/resctrl/monitor.c  | 43 ++++++++++++++++++++--------------
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  9 ++-----
 include/linux/resctrl.h                |  2 ++
 5 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index d60ed0668a59..d24b04ebf950 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -81,6 +81,15 @@ static void __resctrl_sched_in(void)
 	}
 }
 
+static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
+{
+	unsigned int scale = boot_cpu_data.x86_cache_occ_scale;
+
+	/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+	val /= scale;
+	return val * scale;
+}
+
 static inline void resctrl_sched_in(void)
 {
 	if (static_branch_likely(&rdt_enable_key))
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index bdb55c2fbdd3..c05e9b7cf77a 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -98,7 +98,6 @@ struct rmid_read {
 	u64			val;
 };
 
-extern unsigned int resctrl_cqm_threshold;
 extern bool rdt_alloc_capable;
 extern bool rdt_mon_capable;
 extern unsigned int rdt_mon_features;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 27bb4947a176..e91afe99b763 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -17,7 +17,10 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
+
 #include <asm/cpu_device_id.h>
+#include <asm/resctrl.h>
+
 #include "internal.h"
 
 struct rmid_entry {
@@ -37,8 +40,8 @@ static LIST_HEAD(rmid_free_lru);
  * @rmid_limbo_count     count of currently unused but (potentially)
  *     dirty RMIDs.
  *     This counts RMIDs that no one is currently using but that
- *     may have a occupancy value > intel_cqm_threshold. User can change
- *     the threshold occupancy value.
+ *     may have a occupancy value > resctrl_rmid_realloc_threshold. User can
+ *     change the threshold occupancy value.
  */
 static unsigned int rmid_limbo_count;
 
@@ -59,10 +62,10 @@ bool rdt_mon_capable;
 unsigned int rdt_mon_features;
 
 /*
- * This is the threshold cache occupancy at which we will consider an
+ * This is the threshold cache occupancy in bytes at which we will consider an
  * RMID available for re-allocation.
  */
-unsigned int resctrl_cqm_threshold;
+unsigned int resctrl_rmid_realloc_threshold;
 
 #define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))
 
@@ -223,14 +226,13 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
  */
 void __check_limbo(struct rdt_domain *d, bool force_free)
 {
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rmid_entry *entry;
-	struct rdt_resource *r;
 	u32 crmid = 1, nrmid;
 	bool rmid_dirty;
 	u64 val = 0;
 
-	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
 	/*
 	 * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
 	 * are marked as busy for occupancy < threshold. If the occupancy
@@ -245,10 +247,12 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
 		entry = __rmid_entry(nrmid);
 
 		if (resctrl_arch_rmid_read(r, d, entry->rmid,
-					   QOS_L3_OCCUP_EVENT_ID, &val))
+					   QOS_L3_OCCUP_EVENT_ID, &val)) {
 			rmid_dirty = true;
-		else
-			rmid_dirty = (val >= resctrl_cqm_threshold);
+		} else {
+			val *= hw_res->mon_scale;
+			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+		}
 
 		if (force_free || !rmid_dirty) {
 			clear_bit(entry->rmid, d->rmid_busy_llc);
@@ -289,13 +293,12 @@ int alloc_rmid(void)
 
 static void add_rmid_to_limbo(struct rmid_entry *entry)
 {
-	struct rdt_resource *r;
+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_domain *d;
 	int cpu, err;
 	u64 val = 0;
 
-	r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-
 	entry->busy = 0;
 	cpu = get_cpu();
 	list_for_each_entry(d, &r->domains, list) {
@@ -303,7 +306,8 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 			err = resctrl_arch_rmid_read(r, d, entry->rmid,
 						     QOS_L3_OCCUP_EVENT_ID,
 						     &val);
-			if (err || val <= resctrl_cqm_threshold)
+			val *= hw_res->mon_scale;
+			if (err || val <= resctrl_rmid_realloc_threshold)
 				continue;
 		}
 
@@ -744,6 +748,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
 	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	unsigned int cl_size = boot_cpu_data.x86_cache_size;
+	unsigned int threshold;
 	int ret;
 
 	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
@@ -762,10 +767,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
 	 *
 	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
 	 */
-	resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
+	threshold = cl_size * 1024 / r->num_rmid;
 
-	/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
-	resctrl_cqm_threshold /= hw_res->mon_scale;
+	/*
+	 * Because num_rmid may not be a power of two, round the value
+	 * to the nearest multiple of hw_res->mon_scale so it matches a
+	 * value the hardware will measure. mon_scale may not be a power of 2.
+	 */
+	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
 
 	ret = dom_data_init(r);
 	if (ret)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6c33dfe7ea53..849bdec37217 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
 static int max_threshold_occ_show(struct kernfs_open_file *of,
 				  struct seq_file *seq, void *v)
 {
-	struct rdt_resource *r = of->kn->parent->priv;
-	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-
-	seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
+	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
 
 	return 0;
 }
@@ -1055,7 +1052,6 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 				       char *buf, size_t nbytes, loff_t off)
 {
-	struct rdt_hw_resource *hw_res;
 	unsigned int bytes;
 	int ret;
 
@@ -1066,8 +1062,7 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 	if (bytes > (boot_cpu_data.x86_cache_size * 1024))
 		return -EINVAL;
 
-	hw_res = resctrl_to_arch_res(of->kn->parent->priv);
-	resctrl_cqm_threshold = bytes / hw_res->mon_scale;
+	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
 
 	return nbytes;
 }
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 7ccfa0d1bb34..9995d043650a 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -250,4 +250,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 			     u32 rmid, enum resctrl_event_id eventid);
 
+extern unsigned int resctrl_rmid_realloc_threshold;
+
 #endif /* _RESCTRL_H */
-- 
cgit v1.2.3


From d80975e264c8f01518890f3d91ab5bada8fa7f5e Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:28 +0000
Subject: x86/resctrl: Add resctrl_rmid_realloc_limit to abstract x86's
 boot_cpu_data

resctrl_rmid_realloc_threshold can be set by user-space. The maximum
value is specified by the architecture.

Currently max_threshold_occ_write() reads the maximum value from
boot_cpu_data.x86_cache_size, which is not portable to another
architecture.

Add resctrl_rmid_realloc_limit to describe the maximum size in bytes
that user-space can set the threshold to.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-21-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/monitor.c  | 9 +++++++--
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +-
 include/linux/resctrl.h                | 1 +
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index e91afe99b763..8d15568d7121 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -67,6 +67,11 @@ unsigned int rdt_mon_features;
  */
 unsigned int resctrl_rmid_realloc_threshold;
 
+/*
+ * This is the maximum value for the reallocation threshold, in bytes.
+ */
+unsigned int resctrl_rmid_realloc_limit;
+
 #define CF(cf)	((unsigned long)(1048576 * (cf) + 0.5))
 
 /*
@@ -747,10 +752,10 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
 {
 	unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-	unsigned int cl_size = boot_cpu_data.x86_cache_size;
 	unsigned int threshold;
 	int ret;
 
+	resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
 	hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
 	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
 	hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
@@ -767,7 +772,7 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
 	 *
 	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
 	 */
-	threshold = cl_size * 1024 / r->num_rmid;
+	threshold = resctrl_rmid_realloc_limit / r->num_rmid;
 
 	/*
 	 * Because num_rmid may not be a power of two, round the value
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 849bdec37217..e5a48f05e787 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1059,7 +1059,7 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 	if (ret)
 		return ret;
 
-	if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+	if (bytes > resctrl_rmid_realloc_limit)
 		return -EINVAL;
 
 	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 9995d043650a..cb857f753322 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -251,5 +251,6 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 			     u32 rmid, enum resctrl_event_id eventid);
 
 extern unsigned int resctrl_rmid_realloc_threshold;
+extern unsigned int resctrl_rmid_realloc_limit;
 
 #endif /* _RESCTRL_H */
-- 
cgit v1.2.3


From f7b1843eca6fe295ba0c71fc02a3291954078f2b Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Fri, 2 Sep 2022 15:48:29 +0000
Subject: x86/resctrl: Make resctrl_arch_rmid_read() return values in bytes

resctrl_arch_rmid_read() returns a value in chunks, as read from the
hardware. This needs scaling to bytes by mon_scale, as provided by
the architecture code.

Now that resctrl_arch_rmid_read() performs the overflow and corrections
itself, it may as well return a value in bytes directly. This allows
the accesses to the architecture specific 'hw' structure to be removed.

Move the mon_scale conversion into resctrl_arch_rmid_read().
mbm_bw_count() is updated to calculate bandwidth from bytes.

Signed-off-by: James Morse <james.morse@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Jamie Iles <quic_jiles@quicinc.com>
Reviewed-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Xin Hao <xhao@linux.alibaba.com>
Tested-by: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Tested-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20220902154829.30399-22-james.morse@arm.com
---
 arch/x86/kernel/cpu/resctrl/ctrlmondata.c |  6 ++----
 arch/x86/kernel/cpu/resctrl/internal.h    |  4 ++--
 arch/x86/kernel/cpu/resctrl/monitor.c     | 24 +++++++++++-------------
 include/linux/resctrl.h                   |  2 +-
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 42a1abb378f0..1dafbdc5ac31 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -549,7 +549,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 {
 	struct kernfs_open_file *of = m->private;
-	struct rdt_hw_resource *hw_res;
 	u32 resid, evtid, domid;
 	struct rdtgroup *rdtgrp;
 	struct rdt_resource *r;
@@ -569,8 +568,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 	domid = md.u.domid;
 	evtid = md.u.evtid;
 
-	hw_res = &rdt_resources_all[resid];
-	r = &hw_res->r_resctrl;
+	r = &rdt_resources_all[resid].r_resctrl;
 	d = rdt_find_domain(r, domid, NULL);
 	if (IS_ERR_OR_NULL(d)) {
 		ret = -ENOENT;
@@ -584,7 +582,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 	else if (rr.err == -EINVAL)
 		seq_puts(m, "Unavailable\n");
 	else
-		seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale);
+		seq_printf(m, "%llu\n", rr.val);
 
 out:
 	rdtgroup_kn_unlock(of->kn);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index c05e9b7cf77a..5f7128686cfd 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -279,13 +279,13 @@ struct rftype {
 
 /**
  * struct mbm_state - status for each MBM counter in each domain
- * @prev_bw_chunks: Previous chunks value read for bandwidth calculation
+ * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
  * @prev_bw:	The most recent bandwidth in MBps
  * @delta_bw:	Difference between the current and previous bandwidth
  * @delta_comp:	Indicates whether to compute the delta_bw
  */
 struct mbm_state {
-	u64	prev_bw_chunks;
+	u64	prev_bw_bytes;
 	u32	prev_bw;
 	u32	delta_bw;
 	bool	delta_comp;
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 8d15568d7121..efe0c30d3a12 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/sizes.h>
 #include <linux/slab.h>
 
 #include <asm/cpu_device_id.h>
@@ -189,7 +190,7 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	struct arch_mbm_state *am;
-	u64 msr_val;
+	u64 msr_val, chunks;
 
 	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
 		return -EINVAL;
@@ -214,12 +215,14 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 	if (am) {
 		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
 						 hw_res->mbm_width);
-		*val = get_corrected_mbm_count(rmid, am->chunks);
+		chunks = get_corrected_mbm_count(rmid, am->chunks);
 		am->prev_msr = msr_val;
 	} else {
-		*val = msr_val;
+		chunks = msr_val;
 	}
 
+	*val = chunks * hw_res->mon_scale;
+
 	return 0;
 }
 
@@ -232,7 +235,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 void __check_limbo(struct rdt_domain *d, bool force_free)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rmid_entry *entry;
 	u32 crmid = 1, nrmid;
 	bool rmid_dirty;
@@ -255,7 +257,6 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
 					   QOS_L3_OCCUP_EVENT_ID, &val)) {
 			rmid_dirty = true;
 		} else {
-			val *= hw_res->mon_scale;
 			rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
 		}
 
@@ -299,7 +300,6 @@ int alloc_rmid(void)
 static void add_rmid_to_limbo(struct rmid_entry *entry)
 {
 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
-	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_domain *d;
 	int cpu, err;
 	u64 val = 0;
@@ -311,7 +311,6 @@ static void add_rmid_to_limbo(struct rmid_entry *entry)
 			err = resctrl_arch_rmid_read(r, d, entry->rmid,
 						     QOS_L3_OCCUP_EVENT_ID,
 						     &val);
-			val *= hw_res->mon_scale;
 			if (err || val <= resctrl_rmid_realloc_threshold)
 				continue;
 		}
@@ -403,15 +402,14 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
  */
 static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
 {
-	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
 	struct mbm_state *m = &rr->d->mbm_local[rmid];
-	u64 cur_bw, chunks, cur_chunks;
+	u64 cur_bw, bytes, cur_bytes;
 
-	cur_chunks = rr->val;
-	chunks = cur_chunks - m->prev_bw_chunks;
-	m->prev_bw_chunks = cur_chunks;
+	cur_bytes = rr->val;
+	bytes = cur_bytes - m->prev_bw_bytes;
+	m->prev_bw_bytes = cur_bytes;
 
-	cur_bw = (chunks * hw_res->mon_scale) >> 20;
+	cur_bw = bytes / SZ_1M;
 
 	if (m->delta_comp)
 		m->delta_bw = abs(cur_bw - m->prev_bw);
diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index cb857f753322..0cf5b20c6ddf 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -227,7 +227,7 @@ void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d);
  * @d:			domain that the counter should be read from.
  * @rmid:		rmid of the counter to read.
  * @eventid:		eventid to read, e.g. L3 occupancy.
- * @val:		result of the counter read in chunks.
+ * @val:		result of the counter read in bytes.
  *
  * Call from process context on a CPU that belongs to domain @d.
  *
-- 
cgit v1.2.3


From cc62d98bd56d45de4531844ca23913a15136c05b Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Mon, 22 Aug 2022 18:08:04 -0700
Subject: Revert "drm: bridge: analogix/dp: add panel prepare/unprepare in
 suspend/resume time"

This reverts commit 211f276ed3d96e964d2d1106a198c7f4a4b3f4c0.

For quite some time, core DRM helpers already ensure that any relevant
connectors/CRTCs/etc. are disabled, as well as their associated
components (e.g., bridges) when suspending the system. Thus,
analogix_dp_bridge_{enable,disable}() already get called, which in turn
call drm_panel_{prepare,unprepare}(). This makes these drm_panel_*()
calls redundant.

Besides redundancy, there are a few problems with this handling:

(1) drm_panel_{prepare,unprepare}() are *not* reference-counted APIs and
are not in general designed to be handled by multiple callers --
although some panel drivers have a coarse 'prepared' flag that mitigates
some damage, at least. So at a minimum this is redundant and confusing,
but in some cases, this could be actively harmful.

(2) The error-handling is a bit non-standard. We ignored errors in
suspend(), but handled errors in resume(). And recently, people noticed
that the clk handling is unbalanced in error paths, and getting *that*
right is not actually trivial, given the current way errors are mostly
ignored.

(3) In the particular way analogix_dp_{suspend,resume}() get used (e.g.,
in rockchip_dp_*(), as a late/early callback), we don't necessarily have
a proper PM relationship between the DP/bridge device and the panel
device. So while the DP bridge gets resumed, the panel's parent device
(e.g., platform_device) may still be suspended, and so any prepare()
calls may fail.

So remove the superfluous, possibly-harmful suspend()/resume() handling
of panel state.

Fixes: 211f276ed3d9 ("drm: bridge: analogix/dp: add panel prepare/unprepare in suspend/resume time")
Link: https://lore.kernel.org/all/Yv2CPBD3Picg%2FgVe@google.com/
Signed-off-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20220822180729.1.I8ac5abe3a4c1c6fd5c061686c6e883c22f69022c@changeid
---
 drivers/gpu/drm/bridge/analogix/analogix_dp_core.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
index 8aadcc0aa90b..df9370e0ff23 100644
--- a/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
+++ b/drivers/gpu/drm/bridge/analogix/analogix_dp_core.c
@@ -1864,12 +1864,6 @@ EXPORT_SYMBOL_GPL(analogix_dp_remove);
 int analogix_dp_suspend(struct analogix_dp_device *dp)
 {
 	clk_disable_unprepare(dp->clock);
-
-	if (dp->plat_data->panel) {
-		if (drm_panel_unprepare(dp->plat_data->panel))
-			DRM_ERROR("failed to turnoff the panel\n");
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(analogix_dp_suspend);
@@ -1884,13 +1878,6 @@ int analogix_dp_resume(struct analogix_dp_device *dp)
 		return ret;
 	}
 
-	if (dp->plat_data->panel) {
-		if (drm_panel_prepare(dp->plat_data->panel)) {
-			DRM_ERROR("failed to setup the panel\n");
-			return -EBUSY;
-		}
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(analogix_dp_resume);
-- 
cgit v1.2.3


From 14646de48bd77947cd6a325b42eecddcec5a35c7 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri, 22 Jul 2022 16:33:35 -0700
Subject: EDAC/skx_common: Add ChipSelect ADXL component

Each pseudo channel of HBM has its own retry_rd_err_log registers.
The bit 0 of ChipSelect ADXL component encodes the pseudo channel
number of HBM memory. So add ChipSelect ADXL component to get HBM
pseudo channel number.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com
---
 drivers/edac/skx_common.c | 5 +++++
 drivers/edac/skx_common.h | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 7276ce3a33e1..f0f8e98f6efb 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -27,9 +27,11 @@ static const char * const component_names[] = {
 	[INDEX_MEMCTRL]		= "MemoryControllerId",
 	[INDEX_CHANNEL]		= "ChannelId",
 	[INDEX_DIMM]		= "DimmSlotId",
+	[INDEX_CS]		= "ChipSelect",
 	[INDEX_NM_MEMCTRL]	= "NmMemoryControllerId",
 	[INDEX_NM_CHANNEL]	= "NmChannelId",
 	[INDEX_NM_DIMM]		= "NmDimmSlotId",
+	[INDEX_NM_CS]		= "NmChipSelect",
 };
 
 static int component_indices[ARRAY_SIZE(component_names)];
@@ -139,10 +141,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_me
 			       (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1;
 		res->dimm    = (adxl_nm_bitmap & BIT_NM_DIMM) ?
 			       (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1;
+		res->cs      = (adxl_nm_bitmap & BIT_NM_CS) ?
+			       (int)adxl_values[component_indices[INDEX_NM_CS]] : -1;
 	} else {
 		res->imc     = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
 		res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
 		res->dimm    = (int)adxl_values[component_indices[INDEX_DIMM]];
+		res->cs      = (int)adxl_values[component_indices[INDEX_CS]];
 	}
 
 	if (res->imc > NUM_IMC - 1 || res->imc < 0) {
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index c542f1562825..167760fd75ba 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -112,16 +112,19 @@ enum {
 	INDEX_MEMCTRL,
 	INDEX_CHANNEL,
 	INDEX_DIMM,
+	INDEX_CS,
 	INDEX_NM_FIRST,
 	INDEX_NM_MEMCTRL = INDEX_NM_FIRST,
 	INDEX_NM_CHANNEL,
 	INDEX_NM_DIMM,
+	INDEX_NM_CS,
 	INDEX_MAX
 };
 
 #define BIT_NM_MEMCTRL	BIT_ULL(INDEX_NM_MEMCTRL)
 #define BIT_NM_CHANNEL	BIT_ULL(INDEX_NM_CHANNEL)
 #define BIT_NM_DIMM	BIT_ULL(INDEX_NM_DIMM)
+#define BIT_NM_CS	BIT_ULL(INDEX_NM_CS)
 
 struct decoded_addr {
 	struct mce *mce;
@@ -134,6 +137,7 @@ struct decoded_addr {
 	int	sktways;
 	int	chanways;
 	int	dimm;
+	int	cs;
 	int	rank;
 	int	channel_rank;
 	u64	rank_address;
-- 
cgit v1.2.3


From acd4cf68fefe70138926056e3137c9ea99ef7ebf Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri, 22 Jul 2022 16:33:36 -0700
Subject: EDAC/i10nm: Retrieve and print retry_rd_err_log registers for HBM

An HBM memory channel is divided into two pseudo channels. Each
pseudo channel has its own retry_rd_err_log registers. Retrieve and
print retry_rd_err_log registers of the HBM pseudo channel if the
memory error is from HBM.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com
---
 drivers/edac/i10nm_base.c | 84 +++++++++++++++++++++++++++++++++++++----------
 drivers/edac/skx_common.h |  4 +++
 2 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 817f618fcff0..b5e9db162915 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -79,18 +79,20 @@ static bool mem_cfg_2lm;
 
 static u32 offsets_scrub_icx[]  = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
 static u32 offsets_scrub_spr[]  = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8};
+static u32 offsets_scrub_spr_hbm0[]  = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828, 0x0ed8};
+static u32 offsets_scrub_spr_hbm1[]  = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8};
 static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
 static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0};
+static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0};
+static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0};
 
-static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable)
+static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable,
+				      u32 *offsets_scrub, u32 *offsets_demand)
 {
 	u32 s, d;
 
-	if (!imc->mbase)
-		return;
-
-	s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]);
-	d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]);
+	s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]);
+	d = I10NM_GET_REG32(imc, chan, offsets_demand[0]);
 
 	if (enable) {
 		/* Save default configurations */
@@ -117,21 +119,39 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable
 			d &= ~RETRY_RD_ERR_LOG_EN;
 	}
 
-	I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s);
-	I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d);
+	I10NM_SET_REG32(imc, chan, offsets_scrub[0], s);
+	I10NM_SET_REG32(imc, chan, offsets_demand[0], d);
 }
 
 static void enable_retry_rd_err_log(bool enable)
 {
+	struct skx_imc *imc;
 	struct skx_dev *d;
 	int i, j;
 
 	edac_dbg(2, "\n");
 
 	list_for_each_entry(d, i10nm_edac_list, list)
-		for (i = 0; i < I10NM_NUM_IMC; i++)
-			for (j = 0; j < I10NM_NUM_CHANNELS; j++)
-				__enable_retry_rd_err_log(&d->imc[i], j, enable);
+		for (i = 0; i < I10NM_NUM_IMC; i++) {
+			imc = &d->imc[i];
+			if (!imc->mbase)
+				continue;
+
+			for (j = 0; j < I10NM_NUM_CHANNELS; j++) {
+				if (imc->hbm_mc) {
+					__enable_retry_rd_err_log(imc, j, enable,
+								  res_cfg->offsets_scrub_hbm0,
+								  res_cfg->offsets_demand_hbm0);
+					__enable_retry_rd_err_log(imc, j, enable,
+								  res_cfg->offsets_scrub_hbm1,
+								  res_cfg->offsets_demand_hbm1);
+				} else {
+					__enable_retry_rd_err_log(imc, j, enable,
+								  res_cfg->offsets_scrub,
+								  res_cfg->offsets_demand);
+				}
+			}
+	}
 }
 
 static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
@@ -142,12 +162,24 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
 	u32 corr0, corr1, corr2, corr3;
 	u64 log2a, log5;
 	u32 *offsets;
-	int n;
+	int n, pch;
 
 	if (!imc->mbase)
 		return;
 
-	offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand;
+	if (imc->hbm_mc) {
+		pch = res->cs & 1;
+
+		if (pch)
+			offsets = scrub_err ? res_cfg->offsets_scrub_hbm1 :
+					      res_cfg->offsets_demand_hbm1;
+		else
+			offsets = scrub_err ? res_cfg->offsets_scrub_hbm0 :
+					      res_cfg->offsets_demand_hbm0;
+	} else {
+		offsets = scrub_err ? res_cfg->offsets_scrub :
+				      res_cfg->offsets_demand;
+	}
 
 	log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
 	log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]);
@@ -165,10 +197,24 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
 			     log0, log1, log2, log3, log4, log5);
 	}
 
-	corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
-	corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
-	corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
-	corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
+	if (imc->hbm_mc) {
+		if (pch) {
+			corr0 = I10NM_GET_REG32(imc, res->channel, 0x2c18);
+			corr1 = I10NM_GET_REG32(imc, res->channel, 0x2c1c);
+			corr2 = I10NM_GET_REG32(imc, res->channel, 0x2c20);
+			corr3 = I10NM_GET_REG32(imc, res->channel, 0x2c24);
+		} else {
+			corr0 = I10NM_GET_REG32(imc, res->channel, 0x2818);
+			corr1 = I10NM_GET_REG32(imc, res->channel, 0x281c);
+			corr2 = I10NM_GET_REG32(imc, res->channel, 0x2820);
+			corr3 = I10NM_GET_REG32(imc, res->channel, 0x2824);
+		}
+	} else {
+		corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
+		corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
+		corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
+		corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
+	}
 
 	if (len - n > 0)
 		snprintf(msg + n, len - n,
@@ -519,7 +565,11 @@ static struct res_config spr_cfg = {
 	.sad_all_devfn		= PCI_DEVFN(10, 0),
 	.sad_all_offset		= 0x300,
 	.offsets_scrub		= offsets_scrub_spr,
+	.offsets_scrub_hbm0	= offsets_scrub_spr_hbm0,
+	.offsets_scrub_hbm1	= offsets_scrub_spr_hbm1,
 	.offsets_demand		= offsets_demand_spr,
+	.offsets_demand_hbm0	= offsets_demand_spr_hbm0,
+	.offsets_demand_hbm1	= offsets_demand_spr_hbm1,
 };
 
 static const struct x86_cpu_id i10nm_cpuids[] = {
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 167760fd75ba..455e652c0e46 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -164,7 +164,11 @@ struct res_config {
 	int sad_all_offset;
 	/* Offsets of retry_rd_err_log registers */
 	u32 *offsets_scrub;
+	u32 *offsets_scrub_hbm0;
+	u32 *offsets_scrub_hbm1;
 	u32 *offsets_demand;
+	u32 *offsets_demand_hbm0;
+	u32 *offsets_demand_hbm1;
 };
 
 typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci,
-- 
cgit v1.2.3


From d5f5e49953f68bb7b15afd6e32ad176b987c6525 Mon Sep 17 00:00:00 2001
From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Date: Fri, 22 Jul 2022 16:33:37 -0700
Subject: EDAC/i10nm: Print an extra register set of retry_rd_err_log

Sapphire Rapids server adds an extra register set for logging more
retry_rd_err_log data. So add code to print the extra register set.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com
---
 drivers/edac/i10nm_base.c | 81 ++++++++++++++++++++++++++++++++++++++++-------
 drivers/edac/skx_common.h |  2 ++
 2 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index b5e9db162915..a22ea053f8e1 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -83,26 +83,38 @@ static u32 offsets_scrub_spr_hbm0[]  = {0x2860, 0x2854, 0x2b08, 0x2858, 0x2828,
 static u32 offsets_scrub_spr_hbm1[]  = {0x2c60, 0x2c54, 0x2f08, 0x2c58, 0x2c28, 0x0fa8};
 static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
 static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0};
+static u32 offsets_demand2_spr[] = {0x22c70, 0x22d80, 0x22f18, 0x22d58, 0x22c64, 0x20f10};
 static u32 offsets_demand_spr_hbm0[] = {0x2a54, 0x2a60, 0x2b10, 0x2a58, 0x2a5c, 0x0ee0};
 static u32 offsets_demand_spr_hbm1[] = {0x2e54, 0x2e60, 0x2f10, 0x2e58, 0x2e5c, 0x0fb0};
 
 static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable,
-				      u32 *offsets_scrub, u32 *offsets_demand)
+				      u32 *offsets_scrub, u32 *offsets_demand,
+				      u32 *offsets_demand2)
 {
-	u32 s, d;
+	u32 s, d, d2;
 
 	s = I10NM_GET_REG32(imc, chan, offsets_scrub[0]);
 	d = I10NM_GET_REG32(imc, chan, offsets_demand[0]);
+	if (offsets_demand2)
+		d2 = I10NM_GET_REG32(imc, chan, offsets_demand2[0]);
 
 	if (enable) {
 		/* Save default configurations */
 		imc->chan[chan].retry_rd_err_log_s = s;
 		imc->chan[chan].retry_rd_err_log_d = d;
+		if (offsets_demand2)
+			imc->chan[chan].retry_rd_err_log_d2 = d2;
 
 		s &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
 		s |=  RETRY_RD_ERR_LOG_EN;
 		d &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
 		d |=  RETRY_RD_ERR_LOG_EN;
+
+		if (offsets_demand2) {
+			d2 &= ~RETRY_RD_ERR_LOG_UC;
+			d2 |=  RETRY_RD_ERR_LOG_NOOVER;
+			d2 |=  RETRY_RD_ERR_LOG_EN;
+		}
 	} else {
 		/* Restore default configurations */
 		if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC)
@@ -117,10 +129,21 @@ static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable
 			d |=  RETRY_RD_ERR_LOG_NOOVER;
 		if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN))
 			d &= ~RETRY_RD_ERR_LOG_EN;
+
+		if (offsets_demand2) {
+			if (imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_UC)
+				d2 |=  RETRY_RD_ERR_LOG_UC;
+			if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_NOOVER))
+				d2 &=  ~RETRY_RD_ERR_LOG_NOOVER;
+			if (!(imc->chan[chan].retry_rd_err_log_d2 & RETRY_RD_ERR_LOG_EN))
+				d2 &= ~RETRY_RD_ERR_LOG_EN;
+		}
 	}
 
 	I10NM_SET_REG32(imc, chan, offsets_scrub[0], s);
 	I10NM_SET_REG32(imc, chan, offsets_demand[0], d);
+	if (offsets_demand2)
+		I10NM_SET_REG32(imc, chan, offsets_demand2[0], d2);
 }
 
 static void enable_retry_rd_err_log(bool enable)
@@ -141,14 +164,17 @@ static void enable_retry_rd_err_log(bool enable)
 				if (imc->hbm_mc) {
 					__enable_retry_rd_err_log(imc, j, enable,
 								  res_cfg->offsets_scrub_hbm0,
-								  res_cfg->offsets_demand_hbm0);
+								  res_cfg->offsets_demand_hbm0,
+								  NULL);
 					__enable_retry_rd_err_log(imc, j, enable,
 								  res_cfg->offsets_scrub_hbm1,
-								  res_cfg->offsets_demand_hbm1);
+								  res_cfg->offsets_demand_hbm1,
+								  NULL);
 				} else {
 					__enable_retry_rd_err_log(imc, j, enable,
 								  res_cfg->offsets_scrub,
-								  res_cfg->offsets_demand);
+								  res_cfg->offsets_demand,
+								  res_cfg->offsets_demand2);
 				}
 			}
 	}
@@ -160,7 +186,10 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
 	struct skx_imc *imc = &res->dev->imc[res->imc];
 	u32 log0, log1, log2, log3, log4;
 	u32 corr0, corr1, corr2, corr3;
+	u32 lxg0, lxg1, lxg3, lxg4;
+	u32 *xffsets = NULL;
 	u64 log2a, log5;
+	u64 lxg2a, lxg5;
 	u32 *offsets;
 	int n, pch;
 
@@ -177,8 +206,12 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
 			offsets = scrub_err ? res_cfg->offsets_scrub_hbm0 :
 					      res_cfg->offsets_demand_hbm0;
 	} else {
-		offsets = scrub_err ? res_cfg->offsets_scrub :
-				      res_cfg->offsets_demand;
+		if (scrub_err) {
+			offsets = res_cfg->offsets_scrub;
+		} else {
+			offsets = res_cfg->offsets_demand;
+			xffsets = res_cfg->offsets_demand2;
+		}
 	}
 
 	log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
@@ -187,10 +220,28 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
 	log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]);
 	log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]);
 
+	if (xffsets) {
+		lxg0 = I10NM_GET_REG32(imc, res->channel, xffsets[0]);
+		lxg1 = I10NM_GET_REG32(imc, res->channel, xffsets[1]);
+		lxg3 = I10NM_GET_REG32(imc, res->channel, xffsets[3]);
+		lxg4 = I10NM_GET_REG32(imc, res->channel, xffsets[4]);
+		lxg5 = I10NM_GET_REG64(imc, res->channel, xffsets[5]);
+	}
+
 	if (res_cfg->type == SPR) {
 		log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]);
-		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]",
+		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx",
 			     log0, log1, log2a, log3, log4, log5);
+
+		if (len - n > 0) {
+			if (xffsets) {
+				lxg2a = I10NM_GET_REG64(imc, res->channel, xffsets[2]);
+				n += snprintf(msg + n, len - n, " %.8x %.8x %.16llx %.8x %.8x %.16llx]",
+					     lxg0, lxg1, lxg2a, lxg3, lxg4, lxg5);
+			} else {
+				n += snprintf(msg + n, len - n, "]");
+			}
+		}
 	} else {
 		log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]);
 		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]",
@@ -225,9 +276,16 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
 			 corr3 & 0xffff, corr3 >> 16);
 
 	/* Clear status bits */
-	if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
-		log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
-		I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
+	if (retry_rd_err_log == 2) {
+		if (log0 & RETRY_RD_ERR_LOG_OVER_UC_V) {
+			log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
+			I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
+		}
+
+		if (xffsets && (lxg0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
+			lxg0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
+			I10NM_SET_REG32(imc, res->channel, xffsets[0], lxg0);
+		}
 	}
 }
 
@@ -568,6 +626,7 @@ static struct res_config spr_cfg = {
 	.offsets_scrub_hbm0	= offsets_scrub_spr_hbm0,
 	.offsets_scrub_hbm1	= offsets_scrub_spr_hbm1,
 	.offsets_demand		= offsets_demand_spr,
+	.offsets_demand2	= offsets_demand2_spr,
 	.offsets_demand_hbm0	= offsets_demand_spr_hbm0,
 	.offsets_demand_hbm1	= offsets_demand_spr_hbm1,
 };
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 455e652c0e46..0cbadd3d2cd3 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -86,6 +86,7 @@ struct skx_dev {
 			struct pci_dev	*edev;
 			u32 retry_rd_err_log_s;
 			u32 retry_rd_err_log_d;
+			u32 retry_rd_err_log_d2;
 			struct skx_dimm {
 				u8 close_pg;
 				u8 bank_xor_enable;
@@ -167,6 +168,7 @@ struct res_config {
 	u32 *offsets_scrub_hbm0;
 	u32 *offsets_scrub_hbm1;
 	u32 *offsets_demand;
+	u32 *offsets_demand2;
 	u32 *offsets_demand_hbm0;
 	u32 *offsets_demand_hbm1;
 };
-- 
cgit v1.2.3


From d389059685b46861c264cda4f37a33feeab91dfc Mon Sep 17 00:00:00 2001
From: Youquan Song <youquan.song@intel.com>
Date: Fri, 22 Jul 2022 16:33:38 -0700
Subject: x86/sb_edac: Add row column translation for Broadwell

The sb_edac driver lacks translation for DIMM internal address.

Add memory address translation for row/column/bank/bank_group
on Broadwell.

Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/all/20220722233338.341567-1-tony.luck@intel.com
---
 drivers/edac/sb_edac.c | 148 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 138 insertions(+), 10 deletions(-)

diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 9678ab97c7ac..8e39370fdb5c 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -335,6 +335,12 @@ struct sbridge_info {
 struct sbridge_channel {
 	u32		ranks;
 	u32		dimms;
+	struct dimm {
+		u32 rowbits;
+		u32 colbits;
+		u32 bank_xor_enable;
+		u32 amap_fine;
+	} dimm[MAX_DIMMS];
 };
 
 struct pci_id_descr {
@@ -1603,7 +1609,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
 		banks = 8;
 
 	for (i = 0; i < channels; i++) {
-		u32 mtr;
+		u32 mtr, amap = 0;
 
 		int max_dimms_per_channel;
 
@@ -1615,6 +1621,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
 			max_dimms_per_channel = ARRAY_SIZE(mtr_regs);
 			if (!pvt->pci_tad[i])
 				continue;
+			pci_read_config_dword(pvt->pci_tad[i], 0x8c, &amap);
 		}
 
 		for (j = 0; j < max_dimms_per_channel; j++) {
@@ -1627,6 +1634,7 @@ static int __populate_dimms(struct mem_ctl_info *mci,
 					mtr_regs[j], &mtr);
 			}
 			edac_dbg(4, "Channel #%d  MTR%d = %x\n", i, j, mtr);
+
 			if (IS_DIMM_PRESENT(mtr)) {
 				if (!IS_ECC_ENABLED(pvt->info.mcmtr)) {
 					sbridge_printk(KERN_ERR, "CPU SrcID #%d, Ha #%d, Channel #%d has DIMMs, but ECC is disabled\n",
@@ -1661,6 +1669,11 @@ static int __populate_dimms(struct mem_ctl_info *mci,
 				dimm->dtype = pvt->info.get_width(pvt, mtr);
 				dimm->mtype = mtype;
 				dimm->edac_mode = mode;
+				pvt->channel[i].dimm[j].rowbits = order_base_2(rows);
+				pvt->channel[i].dimm[j].colbits = order_base_2(cols);
+				pvt->channel[i].dimm[j].bank_xor_enable =
+						GET_BITFIELD(pvt->info.mcmtr, 9, 9);
+				pvt->channel[i].dimm[j].amap_fine = GET_BITFIELD(amap, 0, 0);
 				snprintf(dimm->label, sizeof(dimm->label),
 						 "CPU_SrcID#%u_Ha#%u_Chan#%u_DIMM#%u",
 						 pvt->sbridge_dev->source_id, pvt->sbridge_dev->dom, i, j);
@@ -1922,6 +1935,99 @@ static struct mem_ctl_info *get_mci_for_node_id(u8 node_id, u8 ha)
 	return NULL;
 }
 
+static u8 sb_close_row[] = {
+	15, 16, 17, 18, 20, 21, 22, 28, 10, 11, 12, 13, 29, 30, 31, 32, 33
+};
+
+static u8 sb_close_column[] = {
+	3, 4, 5, 14, 19, 23, 24, 25, 26, 27
+};
+
+static u8 sb_open_row[] = {
+	14, 15, 16, 20, 28, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33
+};
+
+static u8 sb_open_column[] = {
+	3, 4, 5, 6, 7, 8, 9, 10, 11, 12
+};
+
+static u8 sb_open_fine_column[] = {
+	3, 4, 5, 7, 8, 9, 10, 11, 12, 13
+};
+
+static int sb_bits(u64 addr, int nbits, u8 *bits)
+{
+	int i, res = 0;
+
+	for (i = 0; i < nbits; i++)
+		res |= ((addr >> bits[i]) & 1) << i;
+	return res;
+}
+
+static int sb_bank_bits(u64 addr, int b0, int b1, int do_xor, int x0, int x1)
+{
+	int ret = GET_BITFIELD(addr, b0, b0) | (GET_BITFIELD(addr, b1, b1) << 1);
+
+	if (do_xor)
+		ret ^= GET_BITFIELD(addr, x0, x0) | (GET_BITFIELD(addr, x1, x1) << 1);
+
+	return ret;
+}
+
+static bool sb_decode_ddr4(struct mem_ctl_info *mci, int ch, u8 rank,
+			   u64 rank_addr, char *msg)
+{
+	int dimmno = 0;
+	int row, col, bank_address, bank_group;
+	struct sbridge_pvt *pvt;
+	u32 bg0 = 0, rowbits = 0, colbits = 0;
+	u32 amap_fine = 0, bank_xor_enable = 0;
+
+	dimmno = (rank < 12) ? rank / 4 : 2;
+	pvt = mci->pvt_info;
+	amap_fine =  pvt->channel[ch].dimm[dimmno].amap_fine;
+	bg0 = amap_fine ? 6 : 13;
+	rowbits = pvt->channel[ch].dimm[dimmno].rowbits;
+	colbits = pvt->channel[ch].dimm[dimmno].colbits;
+	bank_xor_enable = pvt->channel[ch].dimm[dimmno].bank_xor_enable;
+
+	if (pvt->is_lockstep) {
+		pr_warn_once("LockStep row/column decode is not supported yet!\n");
+		msg[0] = '\0';
+		return false;
+	}
+
+	if (pvt->is_close_pg) {
+		row = sb_bits(rank_addr, rowbits, sb_close_row);
+		col = sb_bits(rank_addr, colbits, sb_close_column);
+		col |= 0x400; /* C10 is autoprecharge, always set */
+		bank_address = sb_bank_bits(rank_addr, 8, 9, bank_xor_enable, 22, 28);
+		bank_group = sb_bank_bits(rank_addr, 6, 7, bank_xor_enable, 20, 21);
+	} else {
+		row = sb_bits(rank_addr, rowbits, sb_open_row);
+		if (amap_fine)
+			col = sb_bits(rank_addr, colbits, sb_open_fine_column);
+		else
+			col = sb_bits(rank_addr, colbits, sb_open_column);
+		bank_address = sb_bank_bits(rank_addr, 18, 19, bank_xor_enable, 22, 23);
+		bank_group = sb_bank_bits(rank_addr, bg0, 17, bank_xor_enable, 20, 21);
+	}
+
+	row &= (1u << rowbits) - 1;
+
+	sprintf(msg, "row:0x%x col:0x%x bank_addr:%d bank_group:%d",
+		row, col, bank_address, bank_group);
+	return true;
+}
+
+static bool sb_decode_ddr3(struct mem_ctl_info *mci, int ch, u8 rank,
+			   u64 rank_addr, char *msg)
+{
+	pr_warn_once("DDR3 row/column decode not support yet!\n");
+	msg[0] = '\0';
+	return false;
+}
+
 static int get_memory_error_data(struct mem_ctl_info *mci,
 				 u64 addr,
 				 u8 *socket, u8 *ha,
@@ -1937,12 +2043,13 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	int			interleave_mode, shiftup = 0;
 	unsigned int		sad_interleave[MAX_INTERLEAVE];
 	u32			reg, dram_rule;
-	u8			ch_way, sck_way, pkg, sad_ha = 0;
+	u8			ch_way, sck_way, pkg, sad_ha = 0, rankid = 0;
 	u32			tad_offset;
 	u32			rir_way;
 	u32			mb, gb;
 	u64			ch_addr, offset, limit = 0, prv = 0;
-
+	u64			rank_addr;
+	enum mem_type		mtype;
 
 	/*
 	 * Step 0) Check if the address is at special memory ranges
@@ -2226,6 +2333,28 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	pci_read_config_dword(pvt->pci_tad[base_ch], rir_offset[n_rir][idx], &reg);
 	*rank = RIR_RNK_TGT(pvt->info.type, reg);
 
+	if (pvt->info.type == BROADWELL) {
+		if (pvt->is_close_pg)
+			shiftup = 6;
+		else
+			shiftup = 13;
+
+		rank_addr = ch_addr >> shiftup;
+		rank_addr /= (1 << rir_way);
+		rank_addr <<= shiftup;
+		rank_addr |= ch_addr & GENMASK_ULL(shiftup - 1, 0);
+		rank_addr -= RIR_OFFSET(pvt->info.type, reg);
+
+		mtype = pvt->info.get_memory_type(pvt);
+		rankid = *rank;
+		if (mtype == MEM_DDR4 || mtype == MEM_RDDR4)
+			sb_decode_ddr4(mci, base_ch, rankid, rank_addr, msg);
+		else
+			sb_decode_ddr3(mci, base_ch, rankid, rank_addr, msg);
+	} else {
+		msg[0] = '\0';
+	}
+
 	edac_dbg(0, "RIR#%d: channel address 0x%08Lx < 0x%08Lx, RIR interleave %d, index %d\n",
 		 n_rir,
 		 ch_addr,
@@ -2950,7 +3079,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	struct mem_ctl_info *new_mci;
 	struct sbridge_pvt *pvt = mci->pvt_info;
 	enum hw_event_mc_err_type tp_event;
-	char *optype, msg[256];
+	char *optype, msg[256], msg_full[512];
 	bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
 	bool overflow = GET_BITFIELD(m->status, 62, 62);
 	bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
@@ -3089,18 +3218,17 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	 */
 	if (!pvt->is_lockstep && !pvt->is_cur_addr_mirrored && !pvt->is_close_pg)
 		channel = first_channel;
-
-	snprintf(msg, sizeof(msg),
-		 "%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d",
+	snprintf(msg_full, sizeof(msg_full),
+		 "%s%s area:%s err_code:%04x:%04x socket:%d ha:%d channel_mask:%ld rank:%d %s",
 		 overflow ? " OVERFLOW" : "",
 		 (uncorrected_error && recoverable) ? " recoverable" : "",
 		 area_type,
 		 mscod, errcode,
 		 socket, ha,
 		 channel_mask,
-		 rank);
+		 rank, msg);
 
-	edac_dbg(0, "%s\n", msg);
+	edac_dbg(0, "%s\n", msg_full);
 
 	/* FIXME: need support for channel mask */
 
@@ -3111,7 +3239,7 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	edac_mc_handle_error(tp_event, mci, core_err_cnt,
 			     m->addr >> PAGE_SHIFT, m->addr & ~PAGE_MASK, 0,
 			     channel, dimm, -1,
-			     optype, msg);
+			     optype, msg_full);
 	return;
 err_parsing:
 	edac_mc_handle_error(tp_event, mci, core_err_cnt, 0, 0, 0,
-- 
cgit v1.2.3


From e0401dce5e28fb7118dbfd055c77d94433778a85 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Mon, 19 Sep 2022 17:53:30 +0000
Subject: selftests/bpf: Simplify cgroup_hierarchical_stats selftest

The cgroup_hierarchical_stats selftest is complicated. It has to be,
because it tests an entire workflow of recording, aggregating, and
dumping cgroup stats. However, some of the complexity is unnecessary.
The test now enables the memory controller in a cgroup hierarchy, invokes
reclaim, measure reclaim time, THEN uses that reclaim time to test the
stats collection and aggregation. We don't need to use such a
complicated stat, as the context in which the stat is collected is
orthogonal.

Simplify the test by using a simple stat instead of reclaim time, the
total number of times a process has ever entered a cgroup. This makes
the test simpler and removes the dependency on the memory controller and
the memory reclaim interface.

Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: KP Singh <kpsingh@kernel.org>
Link: https://lore.kernel.org/bpf/20220919175330.890793-1-yosryahmed@google.com
---
 .../bpf/prog_tests/cgroup_hierarchical_stats.c     | 170 +++++++++----------
 .../bpf/progs/cgroup_hierarchical_stats.c          | 181 +++++++--------------
 2 files changed, 131 insertions(+), 220 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c
index bed1661596f7..3bd27d2ea668 100644
--- a/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_hierarchical_stats.c
@@ -1,6 +1,22 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Functions to manage eBPF programs attached to cgroup subsystems
+ * This test makes sure BPF stats collection using rstat works correctly.
+ * The test uses 3 BPF progs:
+ * (a) counter: This BPF prog is invoked every time we attach a process to a
+ *              cgroup and locklessly increments a percpu counter.
+ *              The program then calls cgroup_rstat_updated() to inform rstat
+ *              of an update on the (cpu, cgroup) pair.
+ *
+ * (b) flusher: This BPF prog is invoked when an rstat flush is ongoing, it
+ *              aggregates all percpu counters to a total counter, and also
+ *              propagates the changes to the ancestor cgroups.
+ *
+ * (c) dumper: This BPF prog is a cgroup_iter. It is used to output the total
+ *             counter of a cgroup through reading a file in userspace.
+ *
+ * The test sets up a cgroup hierarchy, and the above programs. It spawns a few
+ * processes in the leaf cgroups and makes sure all the counters are aggregated
+ * correctly.
  *
  * Copyright 2022 Google LLC.
  */
@@ -21,8 +37,10 @@
 #define PAGE_SIZE 4096
 #define MB(x) (x << 20)
 
+#define PROCESSES_PER_CGROUP 3
+
 #define BPFFS_ROOT "/sys/fs/bpf/"
-#define BPFFS_VMSCAN BPFFS_ROOT"vmscan/"
+#define BPFFS_ATTACH_COUNTERS BPFFS_ROOT "attach_counters/"
 
 #define CG_ROOT_NAME "root"
 #define CG_ROOT_ID 1
@@ -79,7 +97,7 @@ static int setup_bpffs(void)
 		return err;
 
 	/* Create a directory to contain stat files in bpffs */
-	err = mkdir(BPFFS_VMSCAN, 0755);
+	err = mkdir(BPFFS_ATTACH_COUNTERS, 0755);
 	if (!ASSERT_OK(err, "mkdir"))
 		return err;
 
@@ -89,7 +107,7 @@ static int setup_bpffs(void)
 static void cleanup_bpffs(void)
 {
 	/* Remove created directory in bpffs */
-	ASSERT_OK(rmdir(BPFFS_VMSCAN), "rmdir "BPFFS_VMSCAN);
+	ASSERT_OK(rmdir(BPFFS_ATTACH_COUNTERS), "rmdir "BPFFS_ATTACH_COUNTERS);
 
 	/* Unmount bpffs, if it wasn't already mounted when we started */
 	if (mounted_bpffs)
@@ -118,18 +136,6 @@ static int setup_cgroups(void)
 
 		cgroups[i].fd = fd;
 		cgroups[i].id = get_cgroup_id(cgroups[i].path);
-
-		/*
-		 * Enable memcg controller for the entire hierarchy.
-		 * Note that stats are collected for all cgroups in a hierarchy
-		 * with memcg enabled anyway, but are only exposed for cgroups
-		 * that have memcg enabled.
-		 */
-		if (i < N_NON_LEAF_CGROUPS) {
-			err = enable_controllers(cgroups[i].path, "memory");
-			if (!ASSERT_OK(err, "enable_controllers"))
-				return err;
-		}
 	}
 	return 0;
 }
@@ -154,109 +160,85 @@ static void destroy_hierarchy(void)
 	cleanup_bpffs();
 }
 
-static int reclaimer(const char *cgroup_path, size_t size)
-{
-	static char size_buf[128];
-	char *buf, *ptr;
-	int err;
-
-	/* Join cgroup in the parent process workdir */
-	if (join_parent_cgroup(cgroup_path))
-		return EACCES;
-
-	/* Allocate memory */
-	buf = malloc(size);
-	if (!buf)
-		return ENOMEM;
-
-	/* Write to memory to make sure it's actually allocated */
-	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
-		*ptr = 1;
-
-	/* Try to reclaim memory */
-	snprintf(size_buf, 128, "%lu", size);
-	err = write_cgroup_file_parent(cgroup_path, "memory.reclaim", size_buf);
-
-	free(buf);
-	/* memory.reclaim returns EAGAIN if the amount is not fully reclaimed */
-	if (err && errno != EAGAIN)
-		return errno;
-
-	return 0;
-}
-
-static int induce_vmscan(void)
+static int attach_processes(void)
 {
-	int i, status;
+	int i, j, status;
 
-	/*
-	 * In every leaf cgroup, run a child process that allocates some memory
-	 * and attempts to reclaim some of it.
-	 */
+	/* In every leaf cgroup, attach 3 processes */
 	for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++) {
-		pid_t pid;
-
-		/* Create reclaimer child */
-		pid = fork();
-		if (pid == 0) {
-			status = reclaimer(cgroups[i].path, MB(5));
-			exit(status);
+		for (j = 0; j < PROCESSES_PER_CGROUP; j++) {
+			pid_t pid;
+
+			/* Create child and attach to cgroup */
+			pid = fork();
+			if (pid == 0) {
+				if (join_parent_cgroup(cgroups[i].path))
+					exit(EACCES);
+				exit(0);
+			}
+
+			/* Cleanup child */
+			waitpid(pid, &status, 0);
+			if (!ASSERT_TRUE(WIFEXITED(status), "child process exited"))
+				return 1;
+			if (!ASSERT_EQ(WEXITSTATUS(status), 0,
+				       "child process exit code"))
+				return 1;
 		}
-
-		/* Cleanup reclaimer child */
-		waitpid(pid, &status, 0);
-		ASSERT_TRUE(WIFEXITED(status), "reclaimer exited");
-		ASSERT_EQ(WEXITSTATUS(status), 0, "reclaim exit code");
 	}
 	return 0;
 }
 
 static unsigned long long
-get_cgroup_vmscan_delay(unsigned long long cgroup_id, const char *file_name)
+get_attach_counter(unsigned long long cgroup_id, const char *file_name)
 {
-	unsigned long long vmscan = 0, id = 0;
+	unsigned long long attach_counter = 0, id = 0;
 	static char buf[128], path[128];
 
 	/* For every cgroup, read the file generated by cgroup_iter */
-	snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name);
+	snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, file_name);
 	if (!ASSERT_OK(read_from_file(path, buf, 128), "read cgroup_iter"))
 		return 0;
 
 	/* Check the output file formatting */
-	ASSERT_EQ(sscanf(buf, "cg_id: %llu, total_vmscan_delay: %llu\n",
-			 &id, &vmscan), 2, "output format");
+	ASSERT_EQ(sscanf(buf, "cg_id: %llu, attach_counter: %llu\n",
+			 &id, &attach_counter), 2, "output format");
 
 	/* Check that the cgroup_id is displayed correctly */
 	ASSERT_EQ(id, cgroup_id, "cgroup_id");
-	/* Check that the vmscan reading is non-zero */
-	ASSERT_GT(vmscan, 0, "vmscan_reading");
-	return vmscan;
+	/* Check that the counter is non-zero */
+	ASSERT_GT(attach_counter, 0, "attach counter non-zero");
+	return attach_counter;
 }
 
-static void check_vmscan_stats(void)
+static void check_attach_counters(void)
 {
-	unsigned long long vmscan_readings[N_CGROUPS], vmscan_root;
+	unsigned long long attach_counters[N_CGROUPS], root_attach_counter;
 	int i;
 
-	for (i = 0; i < N_CGROUPS; i++) {
-		vmscan_readings[i] = get_cgroup_vmscan_delay(cgroups[i].id,
-							     cgroups[i].name);
-	}
+	for (i = 0; i < N_CGROUPS; i++)
+		attach_counters[i] = get_attach_counter(cgroups[i].id,
+							cgroups[i].name);
 
 	/* Read stats for root too */
-	vmscan_root = get_cgroup_vmscan_delay(CG_ROOT_ID, CG_ROOT_NAME);
+	root_attach_counter = get_attach_counter(CG_ROOT_ID, CG_ROOT_NAME);
+
+	/* Check that all leafs cgroups have an attach counter of 3 */
+	for (i = N_NON_LEAF_CGROUPS; i < N_CGROUPS; i++)
+		ASSERT_EQ(attach_counters[i], PROCESSES_PER_CGROUP,
+			  "leaf cgroup attach counter");
 
 	/* Check that child1 == child1_1 + child1_2 */
-	ASSERT_EQ(vmscan_readings[1], vmscan_readings[3] + vmscan_readings[4],
-		  "child1_vmscan");
+	ASSERT_EQ(attach_counters[1], attach_counters[3] + attach_counters[4],
+		  "child1_counter");
 	/* Check that child2 == child2_1 + child2_2 */
-	ASSERT_EQ(vmscan_readings[2], vmscan_readings[5] + vmscan_readings[6],
-		  "child2_vmscan");
+	ASSERT_EQ(attach_counters[2], attach_counters[5] + attach_counters[6],
+		  "child2_counter");
 	/* Check that test == child1 + child2 */
-	ASSERT_EQ(vmscan_readings[0], vmscan_readings[1] + vmscan_readings[2],
-		  "test_vmscan");
+	ASSERT_EQ(attach_counters[0], attach_counters[1] + attach_counters[2],
+		  "test_counter");
 	/* Check that root >= test */
-	ASSERT_GE(vmscan_root, vmscan_readings[1], "root_vmscan");
+	ASSERT_GE(root_attach_counter, attach_counters[1], "root_counter");
 }
 
 /* Creates iter link and pins in bpffs, returns 0 on success, -errno on failure.
@@ -278,12 +260,12 @@ static int setup_cgroup_iter(struct cgroup_hierarchical_stats *obj,
 	linfo.cgroup.order = BPF_CGROUP_ITER_SELF_ONLY;
 	opts.link_info = &linfo;
 	opts.link_info_len = sizeof(linfo);
-	link = bpf_program__attach_iter(obj->progs.dump_vmscan, &opts);
+	link = bpf_program__attach_iter(obj->progs.dumper, &opts);
 	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		return -EFAULT;
 
 	/* Pin the link to a bpffs file */
-	snprintf(path, 128, "%s%s", BPFFS_VMSCAN, file_name);
+	snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, file_name);
 	err = bpf_link__pin(link, path);
 	ASSERT_OK(err, "pin cgroup_iter");
 
@@ -313,7 +295,7 @@ static int setup_progs(struct cgroup_hierarchical_stats **skel)
 	if (!ASSERT_OK(err, "setup_cgroup_iter"))
 		return err;
 
-	bpf_program__set_autoattach((*skel)->progs.dump_vmscan, false);
+	bpf_program__set_autoattach((*skel)->progs.dumper, false);
 	err = cgroup_hierarchical_stats__attach(*skel);
 	if (!ASSERT_OK(err, "attach"))
 		return err;
@@ -328,13 +310,13 @@ static void destroy_progs(struct cgroup_hierarchical_stats *skel)
 
 	for (i = 0; i < N_CGROUPS; i++) {
 		/* Delete files in bpffs that cgroup_iters are pinned in */
-		snprintf(path, 128, "%s%s", BPFFS_VMSCAN,
+		snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS,
 			 cgroups[i].name);
 		ASSERT_OK(remove(path), "remove cgroup_iter pin");
 	}
 
 	/* Delete root file in bpffs */
-	snprintf(path, 128, "%s%s", BPFFS_VMSCAN, CG_ROOT_NAME);
+	snprintf(path, 128, "%s%s", BPFFS_ATTACH_COUNTERS, CG_ROOT_NAME);
 	ASSERT_OK(remove(path), "remove cgroup_iter root pin");
 	cgroup_hierarchical_stats__destroy(skel);
 }
@@ -347,9 +329,9 @@ void test_cgroup_hierarchical_stats(void)
 		goto hierarchy_cleanup;
 	if (setup_progs(&skel))
 		goto cleanup;
-	if (induce_vmscan())
+	if (attach_processes())
 		goto cleanup;
-	check_vmscan_stats();
+	check_attach_counters();
 cleanup:
 	destroy_progs(skel);
 hierarchy_cleanup:
diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
index 8ab4253a1592..c74362854948 100644
--- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
+++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Functions to manage eBPF programs attached to cgroup subsystems
- *
  * Copyright 2022 Google LLC.
  */
 #include "vmlinux.h"
@@ -11,25 +9,14 @@
 
 char _license[] SEC("license") = "GPL";
 
-/*
- * Start times are stored per-task, not per-cgroup, as multiple tasks in one
- * cgroup can perform reclaim concurrently.
- */
-struct {
-	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
-	__uint(map_flags, BPF_F_NO_PREALLOC);
-	__type(key, int);
-	__type(value, __u64);
-} vmscan_start_time SEC(".maps");
-
-struct vmscan_percpu {
+struct percpu_attach_counter {
 	/* Previous percpu state, to figure out if we have new updates */
 	__u64 prev;
 	/* Current percpu state */
 	__u64 state;
 };
 
-struct vmscan {
+struct attach_counter {
 	/* State propagated through children, pending aggregation */
 	__u64 pending;
 	/* Total state, including all cpus and all children */
@@ -38,147 +25,94 @@ struct vmscan {
 
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
-	__uint(max_entries, 100);
+	__uint(max_entries, 1024);
 	__type(key, __u64);
-	__type(value, struct vmscan_percpu);
-} pcpu_cgroup_vmscan_elapsed SEC(".maps");
+	__type(value, struct percpu_attach_counter);
+} percpu_attach_counters SEC(".maps");
 
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
-	__uint(max_entries, 100);
+	__uint(max_entries, 1024);
 	__type(key, __u64);
-	__type(value, struct vmscan);
-} cgroup_vmscan_elapsed SEC(".maps");
+	__type(value, struct attach_counter);
+} attach_counters SEC(".maps");
 
 extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym;
 extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym;
 
-static struct cgroup *task_memcg(struct task_struct *task)
-{
-	int cgrp_id;
-
-#if __has_builtin(__builtin_preserve_enum_value)
-	cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id, memory_cgrp_id);
-#else
-	cgrp_id = memory_cgrp_id;
-#endif
-	return task->cgroups->subsys[cgrp_id]->cgroup;
-}
-
 static uint64_t cgroup_id(struct cgroup *cgrp)
 {
 	return cgrp->kn->id;
 }
 
-static int create_vmscan_percpu_elem(__u64 cg_id, __u64 state)
+static int create_percpu_attach_counter(__u64 cg_id, __u64 state)
 {
-	struct vmscan_percpu pcpu_init = {.state = state, .prev = 0};
+	struct percpu_attach_counter pcpu_init = {.state = state, .prev = 0};
 
-	return bpf_map_update_elem(&pcpu_cgroup_vmscan_elapsed, &cg_id,
+	return bpf_map_update_elem(&percpu_attach_counters, &cg_id,
 				   &pcpu_init, BPF_NOEXIST);
 }
 
-static int create_vmscan_elem(__u64 cg_id, __u64 state, __u64 pending)
+static int create_attach_counter(__u64 cg_id, __u64 state, __u64 pending)
 {
-	struct vmscan init = {.state = state, .pending = pending};
+	struct attach_counter init = {.state = state, .pending = pending};
 
-	return bpf_map_update_elem(&cgroup_vmscan_elapsed, &cg_id,
+	return bpf_map_update_elem(&attach_counters, &cg_id,
 				   &init, BPF_NOEXIST);
 }
 
-SEC("tp_btf/mm_vmscan_memcg_reclaim_begin")
-int BPF_PROG(vmscan_start, int order, gfp_t gfp_flags)
+SEC("fentry/cgroup_attach_task")
+int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader,
+	     bool threadgroup)
 {
-	struct task_struct *task = bpf_get_current_task_btf();
-	__u64 *start_time_ptr;
-
-	start_time_ptr = bpf_task_storage_get(&vmscan_start_time, task, 0,
-					      BPF_LOCAL_STORAGE_GET_F_CREATE);
-	if (start_time_ptr)
-		*start_time_ptr = bpf_ktime_get_ns();
-	return 0;
-}
-
-SEC("tp_btf/mm_vmscan_memcg_reclaim_end")
-int BPF_PROG(vmscan_end, unsigned long nr_reclaimed)
-{
-	struct vmscan_percpu *pcpu_stat;
-	struct task_struct *current = bpf_get_current_task_btf();
-	struct cgroup *cgrp;
-	__u64 *start_time_ptr;
-	__u64 current_elapsed, cg_id;
-	__u64 end_time = bpf_ktime_get_ns();
-
-	/*
-	 * cgrp is the first parent cgroup of current that has memcg enabled in
-	 * its subtree_control, or NULL if memcg is disabled in the entire tree.
-	 * In a cgroup hierarchy like this:
-	 *                               a
-	 *                              / \
-	 *                             b   c
-	 *  If "a" has memcg enabled, while "b" doesn't, then processes in "b"
-	 *  will accumulate their stats directly to "a". This makes sure that no
-	 *  stats are lost from processes in leaf cgroups that don't have memcg
-	 *  enabled, but only exposes stats for cgroups that have memcg enabled.
-	 */
-	cgrp = task_memcg(current);
-	if (!cgrp)
+	__u64 cg_id = cgroup_id(dst_cgrp);
+	struct percpu_attach_counter *pcpu_counter = bpf_map_lookup_elem(
+			&percpu_attach_counters,
+			&cg_id);
+
+	if (pcpu_counter)
+		pcpu_counter->state += 1;
+	else if (create_percpu_attach_counter(cg_id, 1))
 		return 0;
 
-	cg_id = cgroup_id(cgrp);
-	start_time_ptr = bpf_task_storage_get(&vmscan_start_time, current, 0,
-					      BPF_LOCAL_STORAGE_GET_F_CREATE);
-	if (!start_time_ptr)
-		return 0;
-
-	current_elapsed = end_time - *start_time_ptr;
-	pcpu_stat = bpf_map_lookup_elem(&pcpu_cgroup_vmscan_elapsed,
-					&cg_id);
-	if (pcpu_stat)
-		pcpu_stat->state += current_elapsed;
-	else if (create_vmscan_percpu_elem(cg_id, current_elapsed))
-		return 0;
-
-	cgroup_rstat_updated(cgrp, bpf_get_smp_processor_id());
+	cgroup_rstat_updated(dst_cgrp, bpf_get_smp_processor_id());
 	return 0;
 }
 
 SEC("fentry/bpf_rstat_flush")
-int BPF_PROG(vmscan_flush, struct cgroup *cgrp, struct cgroup *parent, int cpu)
+int BPF_PROG(flusher, struct cgroup *cgrp, struct cgroup *parent, int cpu)
 {
-	struct vmscan_percpu *pcpu_stat;
-	struct vmscan *total_stat, *parent_stat;
+	struct percpu_attach_counter *pcpu_counter;
+	struct attach_counter *total_counter, *parent_counter;
 	__u64 cg_id = cgroup_id(cgrp);
 	__u64 parent_cg_id = parent ? cgroup_id(parent) : 0;
-	__u64 *pcpu_vmscan;
 	__u64 state;
 	__u64 delta = 0;
 
 	/* Add CPU changes on this level since the last flush */
-	pcpu_stat = bpf_map_lookup_percpu_elem(&pcpu_cgroup_vmscan_elapsed,
-					       &cg_id, cpu);
-	if (pcpu_stat) {
-		state = pcpu_stat->state;
-		delta += state - pcpu_stat->prev;
-		pcpu_stat->prev = state;
+	pcpu_counter = bpf_map_lookup_percpu_elem(&percpu_attach_counters,
+						  &cg_id, cpu);
+	if (pcpu_counter) {
+		state = pcpu_counter->state;
+		delta += state - pcpu_counter->prev;
+		pcpu_counter->prev = state;
 	}
 
-	total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id);
-	if (!total_stat) {
-		if (create_vmscan_elem(cg_id, delta, 0))
+	total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id);
+	if (!total_counter) {
+		if (create_attach_counter(cg_id, delta, 0))
 			return 0;
-
 		goto update_parent;
 	}
 
 	/* Collect pending stats from subtree */
-	if (total_stat->pending) {
-		delta += total_stat->pending;
-		total_stat->pending = 0;
+	if (total_counter->pending) {
+		delta += total_counter->pending;
+		total_counter->pending = 0;
 	}
 
 	/* Propagate changes to this cgroup's total */
-	total_stat->state += delta;
+	total_counter->state += delta;
 
 update_parent:
 	/* Skip if there are no changes to propagate, or no parent */
@@ -186,20 +120,20 @@ update_parent:
 		return 0;
 
 	/* Propagate changes to cgroup's parent */
-	parent_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed,
-					  &parent_cg_id);
-	if (parent_stat)
-		parent_stat->pending += delta;
+	parent_counter = bpf_map_lookup_elem(&attach_counters,
+					     &parent_cg_id);
+	if (parent_counter)
+		parent_counter->pending += delta;
 	else
-		create_vmscan_elem(parent_cg_id, 0, delta);
+		create_attach_counter(parent_cg_id, 0, delta);
 	return 0;
 }
 
 SEC("iter.s/cgroup")
-int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp)
+int BPF_PROG(dumper, struct bpf_iter_meta *meta, struct cgroup *cgrp)
 {
 	struct seq_file *seq = meta->seq;
-	struct vmscan *total_stat;
+	struct attach_counter *total_counter;
 	__u64 cg_id = cgrp ? cgroup_id(cgrp) : 0;
 
 	/* Do nothing for the terminal call */
@@ -209,18 +143,13 @@ int BPF_PROG(dump_vmscan, struct bpf_iter_meta *meta, struct cgroup *cgrp)
 	/* Flush the stats to make sure we get the most updated numbers */
 	cgroup_rstat_flush(cgrp);
 
-	total_stat = bpf_map_lookup_elem(&cgroup_vmscan_elapsed, &cg_id);
-	if (!total_stat) {
-		BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: 0\n",
+	total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id);
+	if (!total_counter) {
+		BPF_SEQ_PRINTF(seq, "cg_id: %llu, attach_counter: 0\n",
 			       cg_id);
 	} else {
-		BPF_SEQ_PRINTF(seq, "cg_id: %llu, total_vmscan_delay: %llu\n",
-			       cg_id, total_stat->state);
+		BPF_SEQ_PRINTF(seq, "cg_id: %llu, attach_counter: %llu\n",
+			       cg_id, total_counter->state);
 	}
-
-	/*
-	 * We only dump stats for one cgroup here, so return 1 to stop
-	 * iteration after the first cgroup.
-	 */
-	return 1;
+	return 0;
 }
-- 
cgit v1.2.3


From d3923513edd7f4a614a169122b0eb6b9acb2c8a3 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Fri, 5 Aug 2022 13:50:08 +0100
Subject: EDAC/i7300: Correct the i7300_exit() function name in comment

The incorrect function name is being used in the comment for function
i7300_exit. Correct this.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lore.kernel.org/r/20220805125008.2346559-1-colin.i.king@gmail.com
---
 drivers/edac/i7300_edac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c
index 4f28b8c8d378..61adaa872ba7 100644
--- a/drivers/edac/i7300_edac.c
+++ b/drivers/edac/i7300_edac.c
@@ -1193,7 +1193,7 @@ static int __init i7300_init(void)
 }
 
 /**
- * i7300_init() - Unregisters the driver
+ * i7300_exit() - Unregisters the driver
  */
 static void __exit i7300_exit(void)
 {
-- 
cgit v1.2.3


From e588c116df6ca64a295017571151992c76d03132 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Thu, 22 Sep 2022 14:28:44 +0800
Subject: libbpf: Add pathname_concat() helper

Move snprintf and len check to common helper pathname_concat() to make the
code simpler.

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1663828124-10437-1-git-send-email-wangyufen@huawei.com
---
 tools/lib/bpf/libbpf.c | 76 +++++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 47 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 67bc18506150..e691f08a297f 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2097,19 +2097,30 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf,
 	return true;
 }
 
+static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name)
+{
+	int len;
+
+	len = snprintf(buf, buf_sz, "%s/%s", path, name);
+	if (len < 0)
+		return -EINVAL;
+	if (len >= buf_sz)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
 static int build_map_pin_path(struct bpf_map *map, const char *path)
 {
 	char buf[PATH_MAX];
-	int len;
+	int err;
 
 	if (!path)
 		path = "/sys/fs/bpf";
 
-	len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map));
-	if (len < 0)
-		return -EINVAL;
-	else if (len >= PATH_MAX)
-		return -ENAMETOOLONG;
+	err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map));
+	if (err)
+		return err;
 
 	return bpf_map__set_pin_path(map, buf);
 }
@@ -7968,17 +7979,9 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path)
 			continue;
 
 		if (path) {
-			int len;
-
-			len = snprintf(buf, PATH_MAX, "%s/%s", path,
-				       bpf_map__name(map));
-			if (len < 0) {
-				err = -EINVAL;
-				goto err_unpin_maps;
-			} else if (len >= PATH_MAX) {
-				err = -ENAMETOOLONG;
+			err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map));
+			if (err)
 				goto err_unpin_maps;
-			}
 			sanitize_pin_path(buf);
 			pin_path = buf;
 		} else if (!map->pin_path) {
@@ -8016,14 +8019,9 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
 		char buf[PATH_MAX];
 
 		if (path) {
-			int len;
-
-			len = snprintf(buf, PATH_MAX, "%s/%s", path,
-				       bpf_map__name(map));
-			if (len < 0)
-				return libbpf_err(-EINVAL);
-			else if (len >= PATH_MAX)
-				return libbpf_err(-ENAMETOOLONG);
+			err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map));
+			if (err)
+				return libbpf_err(err);
 			sanitize_pin_path(buf);
 			pin_path = buf;
 		} else if (!map->pin_path) {
@@ -8041,6 +8039,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path)
 int bpf_object__pin_programs(struct bpf_object *obj, const char *path)
 {
 	struct bpf_program *prog;
+	char buf[PATH_MAX];
 	int err;
 
 	if (!obj)
@@ -8052,17 +8051,9 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path)
 	}
 
 	bpf_object__for_each_program(prog, obj) {
-		char buf[PATH_MAX];
-		int len;
-
-		len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name);
-		if (len < 0) {
-			err = -EINVAL;
-			goto err_unpin_programs;
-		} else if (len >= PATH_MAX) {
-			err = -ENAMETOOLONG;
+		err = pathname_concat(buf, sizeof(buf), path, prog->name);
+		if (err)
 			goto err_unpin_programs;
-		}
 
 		err = bpf_program__pin(prog, buf);
 		if (err)
@@ -8073,13 +8064,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path)
 
 err_unpin_programs:
 	while ((prog = bpf_object__prev_program(obj, prog))) {
-		char buf[PATH_MAX];
-		int len;
-
-		len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name);
-		if (len < 0)
-			continue;
-		else if (len >= PATH_MAX)
+		if (pathname_concat(buf, sizeof(buf), path, prog->name))
 			continue;
 
 		bpf_program__unpin(prog, buf);
@@ -8098,13 +8083,10 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path)
 
 	bpf_object__for_each_program(prog, obj) {
 		char buf[PATH_MAX];
-		int len;
 
-		len = snprintf(buf, PATH_MAX, "%s/%s", path, prog->name);
-		if (len < 0)
-			return libbpf_err(-EINVAL);
-		else if (len >= PATH_MAX)
-			return libbpf_err(-ENAMETOOLONG);
+		err = pathname_concat(buf, sizeof(buf), path, prog->name);
+		if (err)
+			return libbpf_err(err);
 
 		err = bpf_program__unpin(prog, buf);
 		if (err)
-- 
cgit v1.2.3


From dbdea9b36fb61da3b9a1be0dd63542e2bfd3e5d7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 23 Sep 2022 16:05:59 -0700
Subject: libbpf: restore memory layout of bpf_object_open_opts

When attach_prog_fd field was removed in libbpf 1.0 and replaced with
`long: 0` placeholder, it actually shifted all the subsequent fields by
8 byte. This is due to `long: 0` promising to adjust next field's offset
to long-aligned offset. But in this case we were already long-aligned
as pin_root_path is a pointer. So `long: 0` had no effect, and thus
didn't feel the gap created by removed attach_prog_fd.

Non-zero bitfield should have been used instead. I validated using
pahole. Originally kconfig field was at offset 40. With `long: 0` it's
at offset 32, which is wrong. With this change it's back at offset 40.

While technically libbpf 1.0 is allowed to break backwards
compatibility and applications should have been recompiled against
libbpf 1.0 headers, but given how trivial it is to preserve memory
layout, let's fix this.

Reported-by: Grant Seltzer Richman <grantseltzer@gmail.com>
Fixes: 146bf811f5ac ("libbpf: remove most other deprecated high-level APIs")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220923230559.666608-1-andrii@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 tools/lib/bpf/libbpf.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index e2d8c17f2e85..eee883f007f9 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -118,7 +118,9 @@ struct bpf_object_open_opts {
 	 * auto-pinned to that path on load; defaults to "/sys/fs/bpf".
 	 */
 	const char *pin_root_path;
-	long :0;
+
+	__u32 :32; /* stub out now removed attach_prog_fd */
+
 	/* Additional kernel config content that augments and overrides
 	 * system Kconfig for CONFIG_xxx externs.
 	 */
-- 
cgit v1.2.3


From 067f4f291c2063d86abe0a526ef211e03a4f1258 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 23 Sep 2022 10:59:09 -0700
Subject: selftests/bpf: add sign-file to .gitignore

Add sign-file to .gitignore to avoid accidentally checking it in.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220923175913.3272430-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/.gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 3b288562963e..07d2d0a8c5cb 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -40,6 +40,7 @@ test_cpp
 /runqslower
 /bench
 /veristat
+/sign-file
 *.ko
 *.tmp
 xskxceiver
-- 
cgit v1.2.3


From c2488d70ceee352611e55943c25abf30117e3b67 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 23 Sep 2022 10:59:10 -0700
Subject: selftests/bpf: make veristat's verifier log parsing faster and more
 robust

Make sure veristat doesn't spend ridiculous amount of time parsing
verifier stats from verifier log, especially for very large logs or
truncated logs (e.g., when verifier returns -ENOSPC due to too small
buffer). For this, parse lines from the end of the log and make sure we
parse only up to 100 last lines, where stats should be, if at all.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220923175913.3272430-3-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 51030234b60a..77bdfd6fe302 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -419,19 +419,30 @@ static void free_verif_stats(struct verif_stats *stats, size_t stat_cnt)
 
 static char verif_log_buf[64 * 1024];
 
-static int parse_verif_log(const char *buf, size_t buf_sz, struct verif_stats *s)
+#define MAX_PARSED_LOG_LINES 100
+
+static int parse_verif_log(char * const buf, size_t buf_sz, struct verif_stats *s)
 {
-	const char *next;
-	int pos;
+	const char *cur;
+	int pos, lines;
+
+	buf[buf_sz - 1] = '\0';
 
-	for (pos = 0; buf[0]; buf = next) {
-		if (buf[0] == '\n')
-			buf++;
-		next = strchrnul(&buf[pos], '\n');
+	for (pos = strlen(buf) - 1, lines = 0; pos >= 0 && lines < MAX_PARSED_LOG_LINES; lines++) {
+		/* find previous endline or otherwise take the start of log buf */
+		for (cur = &buf[pos]; cur > buf && cur[0] != '\n'; cur--, pos--) {
+		}
+		/* next time start from end of previous line (or pos goes to <0) */
+		pos--;
+		/* if we found endline, point right after endline symbol;
+		 * otherwise, stay at the beginning of log buf
+		 */
+		if (cur[0] == '\n')
+			cur++;
 
-		if (1 == sscanf(buf, "verification time %ld usec\n", &s->stats[DURATION]))
+		if (1 == sscanf(cur, "verification time %ld usec\n", &s->stats[DURATION]))
 			continue;
-		if (6 == sscanf(buf, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld",
+		if (6 == sscanf(cur, "processed %ld insns (limit %*d) max_states_per_insn %ld total_states %ld peak_states %ld mark_read %ld",
 				&s->stats[TOTAL_INSNS],
 				&s->stats[MAX_STATES_PER_INSN],
 				&s->stats[TOTAL_STATES],
-- 
cgit v1.2.3


From 518fee8bfaf2c628007909c0fc5336930b9b6ee4 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 23 Sep 2022 10:59:11 -0700
Subject: selftests/bpf: make veristat skip non-BPF and failing-to-open BPF
 objects

Make veristat ignore non-BPF object files. This allows simpler
mass-verification (e.g., `sudo ./veristat *.bpf.o` in selftests/bpf
directory). Note that `sudo ./veristat *.o` would also work, but with
selftests's multiple copies of BPF object files (.bpf.o and
.bpf.linked{1,2,3}.o) it's 4x slower.

Also, given some of BPF object files could be incomplete in the sense
that they are meant to be statically linked into final BPF object file
(like linked_maps, linked_funcs, linked_vars), note such instances in
stderr, but proceed anyways. This seems like a better trade off between
completely silently ignoring BPF object file and aborting
mass-verification altogether.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220923175913.3272430-4-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 78 ++++++++++++++++++++++++++++++----
 1 file changed, 70 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 77bdfd6fe302..f09dd143a8df 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -15,6 +15,8 @@
 #include <sys/sysinfo.h>
 #include <sys/stat.h>
 #include <bpf/libbpf.h>
+#include <libelf.h>
+#include <gelf.h>
 
 enum stat_id {
 	VERDICT,
@@ -78,6 +80,11 @@ static struct env {
 	struct filter *deny_filters;
 	int allow_filter_cnt;
 	int deny_filter_cnt;
+
+	int files_processed;
+	int files_skipped;
+	int progs_processed;
+	int progs_skipped;
 } env;
 
 static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
@@ -226,8 +233,41 @@ static bool should_process_file(const char *filename)
 	return false;
 }
 
-static bool should_process_prog(const char *filename, const char *prog_name)
+static bool is_bpf_obj_file(const char *path) {
+	Elf64_Ehdr *ehdr;
+	int fd, err = -EINVAL;
+	Elf *elf = NULL;
+
+	fd = open(path, O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return true; /* we'll fail later and propagate error */
+
+	/* ensure libelf is initialized */
+	(void)elf_version(EV_CURRENT);
+
+	elf = elf_begin(fd, ELF_C_READ, NULL);
+	if (!elf)
+		goto cleanup;
+
+	if (elf_kind(elf) != ELF_K_ELF || gelf_getclass(elf) != ELFCLASS64)
+		goto cleanup;
+
+	ehdr = elf64_getehdr(elf);
+	/* Old LLVM set e_machine to EM_NONE */
+	if (!ehdr || ehdr->e_type != ET_REL || (ehdr->e_machine && ehdr->e_machine != EM_BPF))
+		goto cleanup;
+
+	err = 0;
+cleanup:
+	if (elf)
+		elf_end(elf);
+	close(fd);
+	return err == 0;
+}
+
+static bool should_process_prog(const char *path, const char *prog_name)
 {
+	const char *filename = basename(path);
 	int i;
 
 	if (env.deny_filter_cnt > 0) {
@@ -303,7 +343,7 @@ static int append_filter_file(const char *path)
 	f = fopen(path, "r");
 	if (!f) {
 		err = -errno;
-		fprintf(stderr, "Failed to open '%s': %d\n", path, err);
+		fprintf(stderr, "Failed to open filters in '%s': %d\n", path, err);
 		return err;
 	}
 
@@ -463,8 +503,10 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 	int err = 0;
 	void *tmp;
 
-	if (!should_process_prog(basename(filename), bpf_program__name(prog)))
+	if (!should_process_prog(filename, bpf_program__name(prog))) {
+		env.progs_skipped++;
 		return 0;
+	}
 
 	tmp = realloc(env.prog_stats, (env.prog_stat_cnt + 1) * sizeof(*env.prog_stats));
 	if (!tmp)
@@ -487,6 +529,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 	verif_log_buf[0] = '\0';
 
 	err = bpf_object__load(obj);
+	env.progs_processed++;
 
 	stats->file_name = strdup(basename(filename));
 	stats->prog_name = strdup(bpf_program__name(prog));
@@ -513,18 +556,37 @@ static int process_obj(const char *filename)
 	LIBBPF_OPTS(bpf_object_open_opts, opts);
 	int err = 0, prog_cnt = 0;
 
-	if (!should_process_file(basename(filename)))
+	if (!should_process_file(basename(filename))) {
+		if (env.verbose)
+			printf("Skipping '%s' due to filters...\n", filename);
+		env.files_skipped++;
+		return 0;
+	}
+	if (!is_bpf_obj_file(filename)) {
+		if (env.verbose)
+			printf("Skipping '%s' as it's not a BPF object file...\n", filename);
+		env.files_skipped++;
 		return 0;
+	}
 
 	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
 
 	obj = bpf_object__open_file(filename, &opts);
 	if (!obj) {
-		err = -errno;
-		fprintf(stderr, "Failed to open '%s': %d\n", filename, err);
+		/* if libbpf can't open BPF object file, it could be because
+		 * that BPF object file is incomplete and has to be statically
+		 * linked into a final BPF object file; instead of bailing
+		 * out, report it into stderr, mark it as skipped, and
+		 * proceeed
+		 */
+		fprintf(stderr, "Failed to open '%s': %d\n", filename, -errno);
+		env.files_skipped++;
+		err = 0;
 		goto cleanup;
 	}
 
+	env.files_processed++;
+
 	bpf_object__for_each_program(prog, obj) {
 		prog_cnt++;
 	}
@@ -732,8 +794,8 @@ static void output_stats(const struct verif_stats *s, enum resfmt fmt, bool last
 
 	if (last && fmt == RESFMT_TABLE) {
 		output_header_underlines();
-		printf("Done. Processed %d object files, %d programs.\n",
-		       env.filename_cnt, env.prog_stat_cnt);
+		printf("Done. Processed %d files, %d programs. Skipped %d files, %d programs.\n",
+		       env.files_processed, env.files_skipped, env.progs_processed, env.progs_skipped);
 	}
 }
 
-- 
cgit v1.2.3


From c511d009ceb8cd980e4a823b7ca74abbdc7cdccc Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 23 Sep 2022 10:59:12 -0700
Subject: selftests/bpf: emit processing progress and add quiet mode to
 veristat

Emit "Processing <filepath>..." for each BPF object file to be
processed, to show progress. But also add -q (--quiet) flag to silence
such messages. Doing something more clever (like overwriting same output
line) is to cumbersome and easily breakable if there is any other
console output (e.g., errors from libbpf).

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220923175913.3272430-5-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index f09dd143a8df..85a77f1dd863 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -63,6 +63,7 @@ static struct env {
 	char **filenames;
 	int filename_cnt;
 	bool verbose;
+	bool quiet;
 	enum resfmt out_fmt;
 	bool comparison_mode;
 
@@ -107,6 +108,7 @@ const char argp_program_doc[] =
 static const struct argp_option opts[] = {
 	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
 	{ "verbose", 'v', NULL, 0, "Verbose mode" },
+	{ "quiet", 'q', NULL, 0, "Quiet mode" },
 	{ "emit", 'e', "SPEC", 0, "Specify stats to be emitted" },
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
 	{ "output-format", 'o', "FMT", 0, "Result output format (table, csv), default is table." },
@@ -131,6 +133,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 	case 'v':
 		env.verbose = true;
 		break;
+	case 'q':
+		env.quiet = true;
+		break;
 	case 'e':
 		err = parse_stats(arg, &env.output_spec);
 		if (err)
@@ -569,8 +574,10 @@ static int process_obj(const char *filename)
 		return 0;
 	}
 
-	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
+	if (!env.quiet && env.out_fmt == RESFMT_TABLE)
+		printf("Processing '%s'...\n", basename(filename));
 
+	old_libbpf_print_fn = libbpf_set_print(libbpf_print_fn);
 	obj = bpf_object__open_file(filename, &opts);
 	if (!obj) {
 		/* if libbpf can't open BPF object file, it could be because
@@ -1268,6 +1275,12 @@ int main(int argc, char **argv)
 	if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
 		return 1;
 
+	if (env.verbose && env.quiet) {
+		fprintf(stderr, "Verbose and quiet modes are incompatible, please specify just one or neither!\n");
+		argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat");
+		return 1;
+	}
+
 	if (env.output_spec.spec_cnt == 0)
 		env.output_spec = default_output_spec;
 	if (env.sort_spec.spec_cnt == 0)
-- 
cgit v1.2.3


From e310efc5ddde04c41aa0501b5a7235b134c5fc6c Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Fri, 23 Sep 2022 10:59:13 -0700
Subject: selftests/bpf: allow to adjust BPF verifier log level in veristat

Add -l (--log-level) flag to override default BPF verifier log lever.
This only matters in verbose mode, which is the mode in which veristat
emits verifier log for each processed BPF program.

This is important because for successfully verified BPF programs
log_level 1 is empty, as BPF verifier truncates all the successfully
verified paths. So -l2 is the only way to actually get BPF verifier log
in practice. It looks sometihng like this:

  [vmuser@archvm bpf]$ sudo ./veristat xdp_tx.bpf.o -vl2
  Processing 'xdp_tx.bpf.o'...
  PROCESSING xdp_tx.bpf.o/xdp_tx, DURATION US: 19, VERDICT: success, VERIFIER LOG:
  func#0 @0
  0: R1=ctx(off=0,imm=0) R10=fp0
  ; return XDP_TX;
  0: (b4) w0 = 3                        ; R0_w=3
  1: (95) exit
  verification time 19 usec
  stack depth 0
  processed 2 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0

  File          Program  Verdict  Duration (us)  Total insns  Total states  Peak states
  ------------  -------  -------  -------------  -----------  ------------  -----------
  xdp_tx.bpf.o  xdp_tx   success             19            2             0            0
  ------------  -------  -------  -------------  -----------  ------------  -----------
  Done. Processed 1 files, 0 programs. Skipped 1 files, 0 programs.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20220923175913.3272430-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/testing/selftests/bpf/veristat.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c
index 85a77f1dd863..b0d83a28e348 100644
--- a/tools/testing/selftests/bpf/veristat.c
+++ b/tools/testing/selftests/bpf/veristat.c
@@ -64,6 +64,7 @@ static struct env {
 	int filename_cnt;
 	bool verbose;
 	bool quiet;
+	int log_level;
 	enum resfmt out_fmt;
 	bool comparison_mode;
 
@@ -108,6 +109,7 @@ const char argp_program_doc[] =
 static const struct argp_option opts[] = {
 	{ NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
 	{ "verbose", 'v', NULL, 0, "Verbose mode" },
+	{ "log-level", 'l', "LEVEL", 0, "Verifier log level (default 0 for normal mode, 1 for verbose mode)" },
 	{ "quiet", 'q', NULL, 0, "Quiet mode" },
 	{ "emit", 'e', "SPEC", 0, "Specify stats to be emitted" },
 	{ "sort", 's', "SPEC", 0, "Specify sort order" },
@@ -156,6 +158,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
 			return -EINVAL;
 		}
 		break;
+	case 'l':
+		errno = 0;
+		env.log_level = strtol(arg, NULL, 10);
+		if (errno) {
+			fprintf(stderr, "invalid log level: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
 	case 'C':
 		env.comparison_mode = true;
 		break;
@@ -526,7 +536,7 @@ static int process_prog(const char *filename, struct bpf_object *obj, struct bpf
 		if (!buf)
 			return -ENOMEM;
 		bpf_program__set_log_buf(prog, buf, buf_sz);
-		bpf_program__set_log_level(prog, 1 | 4); /* stats + log */
+		bpf_program__set_log_level(prog, env.log_level | 4); /* stats + log */
 	} else {
 		bpf_program__set_log_buf(prog, buf, buf_sz);
 		bpf_program__set_log_level(prog, 4); /* only verifier stats */
@@ -1280,6 +1290,8 @@ int main(int argc, char **argv)
 		argp_help(&argp, stderr, ARGP_HELP_USAGE, "veristat");
 		return 1;
 	}
+	if (env.verbose && env.log_level == 0)
+		env.log_level = 1;
 
 	if (env.output_spec.spec_cnt == 0)
 		env.output_spec = default_output_spec;
-- 
cgit v1.2.3


From 730320fd770d4114a2ecb6fb223dcc8c3cecdc5b Mon Sep 17 00:00:00 2001
From: Aleksander Jan Bajkowski <olek2@wp.pl>
Date: Wed, 21 Sep 2022 22:59:44 +0200
Subject: MIPS: lantiq: enable all hardware interrupts on second VPE

This patch is needed to handle interrupts by the second VPE on the Lantiq
ARX100, xRX200, xRX300 and xRX330 SoCs. Switching some ICU interrupts to
the second VPE results in a hang. Currently, the vsmp_init_secondary()
function is responsible for enabling these interrupts. It only enables
Malta-specific interrupts (SW0, SW1, HW4 and HW5).

The MIPS core has 8 interrupts defined. On Lantiq SoCs, hardware
interrupts are wired to an ICU instance. Each VPE has an independent
instance of the ICU. The mapping of the ICU interrupts is shown below:
SW0(IP0) - IPI call,
SW1(IP1) - IPI resched,
HW0(IP2) - ICU 0-31,
HW1(IP3) - ICU 32-63,
HW2(IP4) - ICU 64-95,
HW3(IP5) - ICU 96-127,
HW4(IP6) - ICU 128-159,
HW5(IP7) - timer.

This patch enables all interrupt lines on the second VPE.

This problem affects multithreaded SoCs with a custom interrupt controller.
SOCs with 1004Kc core and newer use the MIPS GIC. At this point, I am aware
that the Realtek RTL839x and RTL930x SoCs may need a similar fix. In the
future, this may be replaced with some generic solution.

Tested on Lantiq xRX200.

Suggested-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Aleksander Jan Bajkowski <olek2@wp.pl>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/lantiq/prom.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/mips/lantiq/prom.c b/arch/mips/lantiq/prom.c
index c731082a0c42..be4829cc7a3a 100644
--- a/arch/mips/lantiq/prom.c
+++ b/arch/mips/lantiq/prom.c
@@ -34,6 +34,14 @@ unsigned long physical_memsize = 0L;
  */
 static struct ltq_soc_info soc_info;
 
+/*
+ * These structs are used to override vsmp_init_secondary()
+ */
+#if defined(CONFIG_MIPS_MT_SMP)
+extern const struct plat_smp_ops vsmp_smp_ops;
+static struct plat_smp_ops lantiq_smp_ops;
+#endif
+
 const char *get_system_type(void)
 {
 	return soc_info.sys_type;
@@ -84,6 +92,17 @@ void __init plat_mem_setup(void)
 	__dt_setup_arch(dtb);
 }
 
+#if defined(CONFIG_MIPS_MT_SMP)
+static void lantiq_init_secondary(void)
+{
+	/*
+	 * MIPS CPU startup function vsmp_init_secondary() will only
+	 * enable some of the interrupts for the second CPU/VPE.
+	 */
+	set_c0_status(ST0_IM);
+}
+#endif
+
 void __init prom_init(void)
 {
 	/* call the soc specific detetcion code and get it to fill soc_info */
@@ -95,7 +114,10 @@ void __init prom_init(void)
 	prom_init_cmdline();
 
 #if defined(CONFIG_MIPS_MT_SMP)
-	if (register_vsmp_smp_ops())
-		panic("failed to register_vsmp_smp_ops()");
+	if (cpu_has_mipsmt) {
+		lantiq_smp_ops = vsmp_smp_ops;
+		lantiq_smp_ops.init_secondary = lantiq_init_secondary;
+		register_smp_ops(&lantiq_smp_ops);
+	}
 #endif
 }
-- 
cgit v1.2.3


From 692cb5b0f4e25b3ff504f108e55033dd20d2ef74 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 22 Sep 2022 21:55:40 -0700
Subject: MIPS: Lantiq: switch vmmc to use gpiod API

This switches vmmc to use gpiod API instead of OF-specific legacy gpio
API that we want to stop exporting from gpiolib.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/lantiq/xway/vmmc.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/mips/lantiq/xway/vmmc.c b/arch/mips/lantiq/xway/vmmc.c
index 7a14da8d9d15..a1947306ac18 100644
--- a/arch/mips/lantiq/xway/vmmc.c
+++ b/arch/mips/lantiq/xway/vmmc.c
@@ -4,9 +4,10 @@
  *  Copyright (C) 2012 John Crispin <john@phrozen.org>
  */
 
+#include <linux/err.h>
 #include <linux/export.h>
+#include <linux/gpio/consumer.h>
 #include <linux/of_platform.h>
-#include <linux/of_gpio.h>
 #include <linux/dma-mapping.h>
 
 #include <lantiq_soc.h>
@@ -25,6 +26,7 @@ EXPORT_SYMBOL(ltq_get_cp1_base);
 static int vmmc_probe(struct platform_device *pdev)
 {
 #define CP1_SIZE       (1 << 20)
+	struct gpio_desc *gpio;
 	int gpio_count;
 	dma_addr_t dma;
 
@@ -32,16 +34,18 @@ static int vmmc_probe(struct platform_device *pdev)
 		(void *) CPHYSADDR(dma_alloc_coherent(&pdev->dev, CP1_SIZE,
 						    &dma, GFP_KERNEL));
 
-	gpio_count = of_gpio_count(pdev->dev.of_node);
+	gpio_count = gpiod_count(&pdev->dev, NULL);
 	while (gpio_count > 0) {
-		enum of_gpio_flags flags;
-		int gpio = of_get_gpio_flags(pdev->dev.of_node,
-					     --gpio_count, &flags);
-		if (gpio_request(gpio, "vmmc-relay"))
+		gpio = devm_gpiod_get_index(&pdev->dev,
+					    NULL, --gpio_count, GPIOD_OUT_HIGH);
+		if (IS_ERR(gpio)) {
+			dev_err(&pdev->dev,
+				"failed to request GPIO idx %d: %d\n",
+				gpio_count, PTR_ERR(gpio);
 			continue;
-		dev_info(&pdev->dev, "requested GPIO %d\n", gpio);
-		gpio_direction_output(gpio,
-				      (flags & OF_GPIO_ACTIVE_LOW) ? (0) : (1));
+		}
+
+		gpio_consumer_set_name(gpio, "vmmc-relay");
 	}
 
 	dev_info(&pdev->dev, "reserved %dMB at 0x%p", CP1_SIZE >> 20, cp1_base);
-- 
cgit v1.2.3


From c8fbf8a88fcd4464b0727d646a85793ccba49b03 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Tue, 20 Sep 2022 21:07:10 +0800
Subject: MIPS: remove orphan sb1250_time_init() declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All uses of sb1250_time_init() have been removed by
commit d527eef5b7f1 ("[MIPS] Sibyte: Finish conversion to modern
time APIs."), so remove the orphan declaration.

The comments about sb1250_time_init() is useless in arch_init_irq(),
so remove these comments.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/sibyte/sb1250.h | 1 -
 arch/mips/sibyte/sb1250/irq.c         | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/arch/mips/include/asm/sibyte/sb1250.h b/arch/mips/include/asm/sibyte/sb1250.h
index dbde5f93f0dd..495b31925ed7 100644
--- a/arch/mips/include/asm/sibyte/sb1250.h
+++ b/arch/mips/include/asm/sibyte/sb1250.h
@@ -32,7 +32,6 @@ extern unsigned int soc_type;
 extern unsigned int periph_rev;
 extern unsigned int zbbus_mhz;
 
-extern void sb1250_time_init(void);
 extern void sb1250_mask_irq(int cpu, int irq);
 extern void sb1250_unmask_irq(int cpu, int irq);
 
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index 86f49c48fc34..2f08ad267a11 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -262,12 +262,6 @@ void __init arch_init_irq(void)
 	__raw_writeq(tmp, IOADDR(A_IMR_REGISTER(0, R_IMR_INTERRUPT_MASK)));
 	__raw_writeq(tmp, IOADDR(A_IMR_REGISTER(1, R_IMR_INTERRUPT_MASK)));
 
-	/*
-	 * Note that the timer interrupts are also mapped, but this is
-	 * done in sb1250_time_init().	Also, the profiling driver
-	 * does its own management of IP7.
-	 */
-
 	/* Enable necessary IPs, disable the rest */
 	change_c0_status(ST0_IM, imask);
 }
-- 
cgit v1.2.3


From fe23057cbc042073f7a05a92ed2cd1d2b3c81f28 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Tue, 20 Sep 2022 21:07:11 +0800
Subject: MIPS: IRQ: remove orphan allocate_irqno() declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All uses of allocate_irqno() have been removed by
commit 69a07a41d908 ("MIPS: SGI-IP27: rework HUB interrupts"),
so remove the orphan declaration.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/irq.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 76941ae572b2..44f9824c1d8c 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -63,8 +63,6 @@ extern void do_domain_IRQ(struct irq_domain *domain, unsigned int irq);
 extern void arch_init_irq(void);
 extern void spurious_interrupt(void);
 
-extern int allocate_irqno(void);
-
 /*
  * Before R2 the timer and performance counter interrupts were both fixed to
  * IE7.	 Since R2 their number has to be read from the c0_intctl register.
-- 
cgit v1.2.3


From 564d53b8b5909ce7a23106873aaf4c4496c3df10 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Wed, 14 Sep 2022 09:33:17 +0800
Subject: PNPBIOS: remove unused pnpid32_to_pnpid() declaration

All uses of pnpid32_to_pnpid() were removed by
commit 25eb846189d2 ("PNP: add pnp_eisa_id_to_string()"),
so remove the declaration, too.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Reviewed-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/pnp/pnpbios/pnpbios.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/pnp/pnpbios/pnpbios.h b/drivers/pnp/pnpbios/pnpbios.h
index 2ce739ff9c1a..f3302006842e 100644
--- a/drivers/pnp/pnpbios/pnpbios.h
+++ b/drivers/pnp/pnpbios/pnpbios.h
@@ -153,7 +153,6 @@ extern int  pnpbios_dont_use_current_config;
 extern int pnpbios_parse_data_stream(struct pnp_dev *dev, struct pnp_bios_node * node);
 extern int pnpbios_read_resources_from_node(struct pnp_dev *dev, struct pnp_bios_node *node);
 extern int pnpbios_write_resources_to_node(struct pnp_dev *dev, struct pnp_bios_node *node);
-extern void pnpid32_to_pnpid(u32 id, char *str);
 
 extern void pnpbios_print_status(const char * module, u16 status);
 extern void pnpbios_calls_init(union pnp_bios_install_struct * header);
-- 
cgit v1.2.3


From 018d6711c26e4bd26e20a819fcc7f8ab902608f3 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Thu, 15 Sep 2022 13:23:14 -0500
Subject: ACPI: x86: Add a quirk for Dell Inspiron 14 2-in-1 for
 StorageD3Enable

Dell Inspiron 14 2-in-1 has two ACPI nodes under GPP1 both with _ADR of
0, both without _HID.  It's ambiguous which the kernel should take, but
it seems to take "DEV0".  Unfortunately "DEV0" is missing the device
property `StorageD3Enable` which is present on "NVME".

To avoid this causing problems for suspend, add a quirk for this system
to behave like `StorageD3Enable` property was found.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216440
Reported-and-tested-by: Luya Tshimbalanga <luya@fedoraproject.org>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/utils.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/x86/utils.c b/drivers/acpi/x86/utils.c
index 664070fc8349..d7cdd8406c84 100644
--- a/drivers/acpi/x86/utils.c
+++ b/drivers/acpi/x86/utils.c
@@ -207,9 +207,26 @@ static const struct x86_cpu_id storage_d3_cpu_ids[] = {
 	{}
 };
 
+static const struct dmi_system_id force_storage_d3_dmi[] = {
+	{
+		/*
+		 * _ADR is ambiguous between GPP1.DEV0 and GPP1.NVME
+		 * but .NVME is needed to get StorageD3Enable node
+		 * https://bugzilla.kernel.org/show_bug.cgi?id=216440
+		 */
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 14 7425 2-in-1"),
+		}
+	},
+	{}
+};
+
 bool force_storage_d3(void)
 {
-	return x86_match_cpu(storage_d3_cpu_ids);
+	const struct dmi_system_id *dmi_id = dmi_first_match(force_storage_d3_dmi);
+
+	return dmi_id || x86_match_cpu(storage_d3_cpu_ids);
 }
 
 /*
-- 
cgit v1.2.3


From f336443acc890434c7d621366d1bc5bc8f298840 Mon Sep 17 00:00:00 2001
From: Hanjun Guo <guohanjun@huawei.com>
Date: Sat, 17 Sep 2022 17:45:09 +0800
Subject: ACPI: AC: Remove the leftover struct acpi_ac_bl

In commit 57a183222271 ("ACPI / x86: Introduce an
acpi_quirk_skip_acpi_ac_and_battery() helper"), the usage of struct
acpi_ac_bl was removed, but left the definition of the struct in
the file, so remove the useless code.

Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/ac.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c
index c29e41bfcf35..bb9fe7984b1a 100644
--- a/drivers/acpi/ac.c
+++ b/drivers/acpi/ac.c
@@ -36,11 +36,6 @@ static int acpi_ac_add(struct acpi_device *device);
 static int acpi_ac_remove(struct acpi_device *device);
 static void acpi_ac_notify(struct acpi_device *device, u32 event);
 
-struct acpi_ac_bl {
-	const char *hid;
-	int hrv;
-};
-
 static const struct acpi_device_id ac_device_ids[] = {
 	{"ACPI0003", 0},
 	{"", 0},
-- 
cgit v1.2.3


From bf2ee8d0c385f883a00473768b67faf2189b2410 Mon Sep 17 00:00:00 2001
From: Jianmin Lv <lvjianmin@loongson.cn>
Date: Sun, 11 Sep 2022 17:06:34 +0800
Subject: ACPI: scan: Support multiple DMA windows with different offsets

In DT systems configurations, of_dma_get_range() returns struct
bus_dma_region DMA regions; they are used to set-up devices
DMA windows with different offset available for translation between DMA
address and CPU address.

In ACPI systems configuration, acpi_dma_get_range() does not return
DMA regions yet and that precludes setting up the dev->dma_range_map
pointer and therefore DMA regions with multiple offsets.

Update acpi_dma_get_range() to return struct bus_dma_region
DMA regions like of_dma_get_range() does.

After updating acpi_dma_get_range(), acpi_arch_dma_setup() is changed for
ARM64, where the original dma_addr and size are removed as these
arguments are now redundant, and pass 0 and U64_MAX for dma_base
and size of arch_setup_dma_ops; this is a simplification consistent
with what other ACPI architectures also pass to iommu_setup_dma_ops().

Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Reviewed-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/arm64/dma.c | 28 +++++++++++++++----------
 drivers/acpi/scan.c      | 53 +++++++++++++++++++++---------------------------
 include/acpi/acpi_bus.h  |  3 +--
 include/linux/acpi.h     |  7 +++----
 4 files changed, 44 insertions(+), 47 deletions(-)

diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
index f16739ad3cc0..93d796531af3 100644
--- a/drivers/acpi/arm64/dma.c
+++ b/drivers/acpi/arm64/dma.c
@@ -4,11 +4,12 @@
 #include <linux/device.h>
 #include <linux/dma-direct.h>
 
-void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
+void acpi_arch_dma_setup(struct device *dev)
 {
 	int ret;
 	u64 end, mask;
-	u64 dmaaddr = 0, size = 0, offset = 0;
+	u64 size = 0;
+	const struct bus_dma_region *map = NULL;
 
 	/*
 	 * If @dev is expected to be DMA-capable then the bus code that created
@@ -26,7 +27,19 @@ void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
 	else
 		size = 1ULL << 32;
 
-	ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size);
+	ret = acpi_dma_get_range(dev, &map);
+	if (!ret && map) {
+		const struct bus_dma_region *r = map;
+
+		for (end = 0; r->size; r++) {
+			if (r->dma_start + r->size - 1 > end)
+				end = r->dma_start + r->size - 1;
+		}
+
+		size = end + 1;
+		dev->dma_range_map = map;
+	}
+
 	if (ret == -ENODEV)
 		ret = iort_dma_get_ranges(dev, &size);
 	if (!ret) {
@@ -34,17 +47,10 @@ void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
 		 * Limit coherent and dma mask based on size retrieved from
 		 * firmware.
 		 */
-		end = dmaaddr + size - 1;
+		end = size - 1;
 		mask = DMA_BIT_MASK(ilog2(end) + 1);
 		dev->bus_dma_limit = end;
 		dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
 		*dev->dma_mask = min(*dev->dma_mask, mask);
 	}
-
-	*dma_addr = dmaaddr;
-	*dma_size = size;
-
-	ret = dma_direct_set_offset(dev, dmaaddr + offset, dmaaddr, size);
-
-	dev_dbg(dev, "dma_offset(%#08llx)%s\n", offset, ret ? " failed!" : "");
 }
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 42cec8120f18..f96ef8536037 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -20,6 +20,7 @@
 #include <linux/platform_data/x86/apple.h>
 #include <linux/pgtable.h>
 #include <linux/crc32.h>
+#include <linux/dma-direct.h>
 
 #include "internal.h"
 
@@ -1467,25 +1468,21 @@ enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev)
  * acpi_dma_get_range() - Get device DMA parameters.
  *
  * @dev: device to configure
- * @dma_addr: pointer device DMA address result
- * @offset: pointer to the DMA offset result
- * @size: pointer to DMA range size result
+ * @map: pointer to DMA ranges result
  *
- * Evaluate DMA regions and return respectively DMA region start, offset
- * and size in dma_addr, offset and size on parsing success; it does not
- * update the passed in values on failure.
+ * Evaluate DMA regions and return pointer to DMA regions on
+ * parsing success; it does not update the passed in values on failure.
  *
  * Return 0 on success, < 0 on failure.
  */
-int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
-		       u64 *size)
+int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map)
 {
 	struct acpi_device *adev;
 	LIST_HEAD(list);
 	struct resource_entry *rentry;
 	int ret;
 	struct device *dma_dev = dev;
-	u64 len, dma_start = U64_MAX, dma_end = 0, dma_offset = 0;
+	struct bus_dma_region *r;
 
 	/*
 	 * Walk the device tree chasing an ACPI companion with a _DMA
@@ -1510,31 +1507,28 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
 
 	ret = acpi_dev_get_dma_resources(adev, &list);
 	if (ret > 0) {
+		r = kcalloc(ret + 1, sizeof(*r), GFP_KERNEL);
+		if (!r) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
 		list_for_each_entry(rentry, &list, node) {
-			if (dma_offset && rentry->offset != dma_offset) {
+			if (rentry->res->start >= rentry->res->end) {
+				kfree(r);
 				ret = -EINVAL;
-				dev_warn(dma_dev, "Can't handle multiple windows with different offsets\n");
+				dev_dbg(dma_dev, "Invalid DMA regions configuration\n");
 				goto out;
 			}
-			dma_offset = rentry->offset;
 
-			/* Take lower and upper limits */
-			if (rentry->res->start < dma_start)
-				dma_start = rentry->res->start;
-			if (rentry->res->end > dma_end)
-				dma_end = rentry->res->end;
-		}
-
-		if (dma_start >= dma_end) {
-			ret = -EINVAL;
-			dev_dbg(dma_dev, "Invalid DMA regions configuration\n");
-			goto out;
+			r->cpu_start = rentry->res->start;
+			r->dma_start = rentry->res->start - rentry->offset;
+			r->size = resource_size(rentry->res);
+			r->offset = rentry->offset;
+			r++;
 		}
 
-		*dma_addr = dma_start - dma_offset;
-		len = dma_end - dma_start;
-		*size = max(len, len + 1);
-		*offset = dma_offset;
+		*map = r;
 	}
  out:
 	acpi_dev_free_resource_list(&list);
@@ -1624,20 +1618,19 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
 			  const u32 *input_id)
 {
 	const struct iommu_ops *iommu;
-	u64 dma_addr = 0, size = 0;
 
 	if (attr == DEV_DMA_NOT_SUPPORTED) {
 		set_dma_ops(dev, &dma_dummy_ops);
 		return 0;
 	}
 
-	acpi_arch_dma_setup(dev, &dma_addr, &size);
+	acpi_arch_dma_setup(dev);
 
 	iommu = acpi_iommu_configure_id(dev, input_id);
 	if (PTR_ERR(iommu) == -EPROBE_DEFER)
 		return -EPROBE_DEFER;
 
-	arch_setup_dma_ops(dev, dma_addr, size,
+	arch_setup_dma_ops(dev, 0, U64_MAX,
 				iommu, attr == DEV_DMA_COHERENT);
 
 	return 0;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index e7d27373ff71..73ac4a1d6947 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -613,8 +613,7 @@ enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
 int acpi_iommu_fwspec_init(struct device *dev, u32 id,
 			   struct fwnode_handle *fwnode,
 			   const struct iommu_ops *ops);
-int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
-		       u64 *size);
+int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map);
 int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
 			   const u32 *input_id);
 static inline int acpi_dma_configure(struct device *dev,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 6f64b2f3dc54..bb41623dab77 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -281,12 +281,12 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);
 
 #ifdef CONFIG_ARM64
 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
-void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size);
+void acpi_arch_dma_setup(struct device *dev);
 #else
 static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
 static inline void
-acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { }
+acpi_arch_dma_setup(struct device *dev) { }
 #endif
 
 int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
@@ -977,8 +977,7 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev)
 	return DEV_DMA_NOT_SUPPORTED;
 }
 
-static inline int acpi_dma_get_range(struct device *dev, u64 *dma_addr,
-				     u64 *offset, u64 *size)
+static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map)
 {
 	return -ENODEV;
 }
-- 
cgit v1.2.3


From c78c43fe7d42524c8f364aaf95ef3652e7f1186b Mon Sep 17 00:00:00 2001
From: Jianmin Lv <lvjianmin@loongson.cn>
Date: Sun, 11 Sep 2022 17:06:35 +0800
Subject: LoongArch: Use acpi_arch_dma_setup() and remove ARCH_HAS_PHYS_TO_DMA

Use _DMA defined in ACPI spec for translation between
DMA address and CPU address, and implement acpi_arch_dma_setup
for initializing dev->dma_range_map, where acpi_dma_get_range
is called for parsing _DMA.

e.g.
If we have two dma ranges:
cpu address      dma address    size         offset
0x200080000000   0x2080000000   0x400000000  0x1fe000000000
0x400080000000   0x4080000000   0x400000000  0x3fc000000000

_DMA for pci devices should be declared in host bridge as
flowing:

Name (_DMA, ResourceTemplate() {
        QWordMemory (ResourceProducer,
            PosDecode,
            MinFixed,
            MaxFixed,
            NonCacheable,
            ReadWrite,
            0x0,
            0x4080000000,
            0x447fffffff,
            0x3fc000000000,
            0x400000000,
            ,
            ,
            )

        QWordMemory (ResourceProducer,
            PosDecode,
            MinFixed,
            MaxFixed,
            NonCacheable,
            ReadWrite,
            0x0,
            0x2080000000,
            0x247fffffff,
            0x1fe000000000,
            0x400000000,
            ,
            ,
            )
    })

Acked-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Jianmin Lv <lvjianmin@loongson.cn>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/loongarch/Kconfig        |  1 -
 arch/loongarch/kernel/dma.c   | 52 +++++++++++++++++--------------------------
 arch/loongarch/kernel/setup.c |  2 +-
 include/linux/acpi.h          |  9 +++++---
 4 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index c7dd6ad779af..551dd99e98b8 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -10,7 +10,6 @@ config LOONGARCH
 	select ARCH_ENABLE_MEMORY_HOTPLUG
 	select ARCH_ENABLE_MEMORY_HOTREMOVE
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
-	select ARCH_HAS_PHYS_TO_DMA
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_INLINE_READ_LOCK if !PREEMPTION
diff --git a/arch/loongarch/kernel/dma.c b/arch/loongarch/kernel/dma.c
index 8c9b5314a13e..7a9c6a9dd2d0 100644
--- a/arch/loongarch/kernel/dma.c
+++ b/arch/loongarch/kernel/dma.c
@@ -2,39 +2,29 @@
 /*
  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
  */
-#include <linux/init.h>
+#include <linux/acpi.h>
 #include <linux/dma-direct.h>
-#include <linux/dma-mapping.h>
-#include <linux/dma-map-ops.h>
-#include <linux/swiotlb.h>
 
-#include <asm/bootinfo.h>
-#include <asm/dma.h>
-#include <asm/loongson.h>
-
-/*
- * We extract 4bit node id (bit 44~47) from Loongson-3's
- * 48bit physical address space and embed it into 40bit.
- */
-
-static int node_id_offset;
-
-dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
-{
-	long nid = (paddr >> 44) & 0xf;
-
-	return ((nid << 44) ^ paddr) | (nid << node_id_offset);
-}
-
-phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
+void acpi_arch_dma_setup(struct device *dev)
 {
-	long nid = (daddr >> node_id_offset) & 0xf;
+	int ret;
+	u64 mask, end = 0;
+	const struct bus_dma_region *map = NULL;
+
+	ret = acpi_dma_get_range(dev, &map);
+	if (!ret && map) {
+		const struct bus_dma_region *r = map;
+
+		for (end = 0; r->size; r++) {
+			if (r->dma_start + r->size - 1 > end)
+				end = r->dma_start + r->size - 1;
+		}
+
+		mask = DMA_BIT_MASK(ilog2(end) + 1);
+		dev->bus_dma_limit = end;
+		dev->dma_range_map = map;
+		dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
+		*dev->dma_mask = min(*dev->dma_mask, mask);
+	}
 
-	return ((nid << node_id_offset) ^ daddr) | (nid << 44);
-}
-
-void __init plat_swiotlb_setup(void)
-{
-	swiotlb_init(true, SWIOTLB_VERBOSE);
-	node_id_offset = ((readl(LS7A_DMA_CFG) & LS7A_DMA_NODE_MASK) >> LS7A_DMA_NODE_SHF) + 36;
 }
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 8f5c2f9a1a83..d97c69dbe553 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -247,7 +247,7 @@ static void __init arch_mem_init(char **cmdline_p)
 	sparse_init();
 	memblock_set_bottom_up(true);
 
-	plat_swiotlb_setup();
+	swiotlb_init(true, SWIOTLB_VERBOSE);
 
 	dma_contiguous_reserve(PFN_PHYS(max_low_pfn));
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index bb41623dab77..a71d73a0d43e 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -279,14 +279,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { }
 
 void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);
 
+#if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH)
+void acpi_arch_dma_setup(struct device *dev);
+#else
+static inline void acpi_arch_dma_setup(struct device *dev) { }
+#endif
+
 #ifdef CONFIG_ARM64
 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
-void acpi_arch_dma_setup(struct device *dev);
 #else
 static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
-static inline void
-acpi_arch_dma_setup(struct device *dev) { }
 #endif
 
 int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
-- 
cgit v1.2.3


From ae2df912d1a557a3548be83da20851ac55f42ab3 Mon Sep 17 00:00:00 2001
From: Jeremy Linton <jeremy.linton@arm.com>
Date: Mon, 12 Sep 2022 15:37:22 -0500
Subject: ACPI: CPPC: Disable FIE if registers in PCC regions

PCC regions utilize a mailbox to set/retrieve register values used by
the CPPC code. This is fine as long as the operations are
infrequent. With the FIE code enabled though the overhead can range
from 2-11% of system CPU overhead (ex: as measured by top) on Arm
based machines.

So, before enabling FIE assure none of the registers used by
cppc_get_perf_ctrs() are in the PCC region. Finally, add a module
parameter which can override the PCC region detection at boot or
module reload.

Signed-off-by: Jeremy Linton <jeremy.linton@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Ionela Voinescu <ionela.voinescu@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/cppc_acpi.c       | 42 ++++++++++++++++++++++++++++++++++++++++++
 drivers/cpufreq/cppc_cpufreq.c | 25 +++++++++++++++++++++----
 include/acpi/cppc_acpi.h       |  5 +++++
 3 files changed, 68 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
index c2309429146f..093675b1a1ff 100644
--- a/drivers/acpi/cppc_acpi.c
+++ b/drivers/acpi/cppc_acpi.c
@@ -1243,6 +1243,48 @@ out_err:
 }
 EXPORT_SYMBOL_GPL(cppc_get_perf_caps);
 
+/**
+ * cppc_perf_ctrs_in_pcc - Check if any perf counters are in a PCC region.
+ *
+ * CPPC has flexibility about how CPU performance counters are accessed.
+ * One of the choices is PCC regions, which can have a high access latency. This
+ * routine allows callers of cppc_get_perf_ctrs() to know this ahead of time.
+ *
+ * Return: true if any of the counters are in PCC regions, false otherwise
+ */
+bool cppc_perf_ctrs_in_pcc(void)
+{
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		struct cpc_register_resource *ref_perf_reg;
+		struct cpc_desc *cpc_desc;
+
+		cpc_desc = per_cpu(cpc_desc_ptr, cpu);
+
+		if (CPC_IN_PCC(&cpc_desc->cpc_regs[DELIVERED_CTR]) ||
+		    CPC_IN_PCC(&cpc_desc->cpc_regs[REFERENCE_CTR]) ||
+		    CPC_IN_PCC(&cpc_desc->cpc_regs[CTR_WRAP_TIME]))
+			return true;
+
+
+		ref_perf_reg = &cpc_desc->cpc_regs[REFERENCE_PERF];
+
+		/*
+		 * If reference perf register is not supported then we should
+		 * use the nominal perf value
+		 */
+		if (!CPC_SUPPORTED(ref_perf_reg))
+			ref_perf_reg = &cpc_desc->cpc_regs[NOMINAL_PERF];
+
+		if (CPC_IN_PCC(ref_perf_reg))
+			return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(cppc_perf_ctrs_in_pcc);
+
 /**
  * cppc_get_perf_ctrs - Read a CPU's performance feedback counters.
  * @cpunum: CPU from which to read counters.
diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 9adb7612993e..432dfb4e8027 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -63,7 +63,15 @@ static struct cppc_workaround_oem_info wa_info[] = {
 
 static struct cpufreq_driver cppc_cpufreq_driver;
 
+static enum {
+	FIE_UNSET = -1,
+	FIE_ENABLED,
+	FIE_DISABLED
+} fie_disabled = FIE_UNSET;
+
 #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
+module_param(fie_disabled, int, 0444);
+MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)");
 
 /* Frequency invariance support */
 struct cppc_freq_invariance {
@@ -158,7 +166,7 @@ static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy)
 	struct cppc_freq_invariance *cppc_fi;
 	int cpu, ret;
 
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+	if (fie_disabled)
 		return;
 
 	for_each_cpu(cpu, policy->cpus) {
@@ -199,7 +207,7 @@ static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy)
 	struct cppc_freq_invariance *cppc_fi;
 	int cpu;
 
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+	if (fie_disabled)
 		return;
 
 	/* policy->cpus will be empty here, use related_cpus instead */
@@ -229,7 +237,15 @@ static void __init cppc_freq_invariance_init(void)
 	};
 	int ret;
 
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+	if (fie_disabled != FIE_ENABLED && fie_disabled != FIE_DISABLED) {
+		fie_disabled = FIE_ENABLED;
+		if (cppc_perf_ctrs_in_pcc()) {
+			pr_info("FIE not enabled on systems with registers in PCC\n");
+			fie_disabled = FIE_DISABLED;
+		}
+	}
+
+	if (fie_disabled)
 		return;
 
 	kworker_fie = kthread_create_worker(0, "cppc_fie");
@@ -247,7 +263,7 @@ static void __init cppc_freq_invariance_init(void)
 
 static void cppc_freq_invariance_exit(void)
 {
-	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+	if (fie_disabled)
 		return;
 
 	kthread_destroy_worker(kworker_fie);
@@ -936,6 +952,7 @@ static void cppc_check_hisi_workaround(void)
 		    wa_info[i].oem_revision == tbl->oem_revision) {
 			/* Overwrite the get() callback */
 			cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate;
+			fie_disabled = FIE_DISABLED;
 			break;
 		}
 	}
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index f73d357ecdf5..c5614444031f 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -140,6 +140,7 @@ extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs);
 extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls);
 extern int cppc_set_enable(int cpu, bool enable);
 extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps);
+extern bool cppc_perf_ctrs_in_pcc(void);
 extern bool acpi_cpc_valid(void);
 extern bool cppc_allow_fast_switch(void);
 extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data);
@@ -173,6 +174,10 @@ static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps)
 {
 	return -ENOTSUPP;
 }
+static inline bool cppc_perf_ctrs_in_pcc(void)
+{
+	return false;
+}
 static inline bool acpi_cpc_valid(void)
 {
 	return false;
-- 
cgit v1.2.3


From f23470e659964c41697e9c3e950fe80ecae143de Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sat, 17 Sep 2022 23:56:22 +0200
Subject: ACPI: fan: Reorder symbols to get rid of a few forward declarations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Having the driver struct at the end of the file isn't only more usual, it
also allows to drop a few forward declarations which are just useless
repetition.

While at it also move the MODULE_... macros to the end of the file.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/fan_core.c | 58 +++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/drivers/acpi/fan_core.c b/drivers/acpi/fan_core.c
index b9a9a59ddcc1..52a0b303b70a 100644
--- a/drivers/acpi/fan_core.c
+++ b/drivers/acpi/fan_core.c
@@ -19,43 +19,12 @@
 
 #include "fan.h"
 
-MODULE_AUTHOR("Paul Diefenbaugh");
-MODULE_DESCRIPTION("ACPI Fan Driver");
-MODULE_LICENSE("GPL");
-
-static int acpi_fan_probe(struct platform_device *pdev);
-static int acpi_fan_remove(struct platform_device *pdev);
-
 static const struct acpi_device_id fan_device_ids[] = {
 	ACPI_FAN_DEVICE_IDS,
 	{"", 0},
 };
 MODULE_DEVICE_TABLE(acpi, fan_device_ids);
 
-#ifdef CONFIG_PM_SLEEP
-static int acpi_fan_suspend(struct device *dev);
-static int acpi_fan_resume(struct device *dev);
-static const struct dev_pm_ops acpi_fan_pm = {
-	.resume = acpi_fan_resume,
-	.freeze = acpi_fan_suspend,
-	.thaw = acpi_fan_resume,
-	.restore = acpi_fan_resume,
-};
-#define FAN_PM_OPS_PTR (&acpi_fan_pm)
-#else
-#define FAN_PM_OPS_PTR NULL
-#endif
-
-static struct platform_driver acpi_fan_driver = {
-	.probe = acpi_fan_probe,
-	.remove = acpi_fan_remove,
-	.driver = {
-		.name = "acpi-fan",
-		.acpi_match_table = fan_device_ids,
-		.pm = FAN_PM_OPS_PTR,
-	},
-};
-
 /* thermal cooling device callbacks */
 static int fan_get_max_state(struct thermal_cooling_device *cdev, unsigned long
 			     *state)
@@ -459,6 +428,33 @@ static int acpi_fan_resume(struct device *dev)
 
 	return result;
 }
+
+static const struct dev_pm_ops acpi_fan_pm = {
+	.resume = acpi_fan_resume,
+	.freeze = acpi_fan_suspend,
+	.thaw = acpi_fan_resume,
+	.restore = acpi_fan_resume,
+};
+#define FAN_PM_OPS_PTR (&acpi_fan_pm)
+
+#else
+
+#define FAN_PM_OPS_PTR NULL
+
 #endif
 
+static struct platform_driver acpi_fan_driver = {
+	.probe = acpi_fan_probe,
+	.remove = acpi_fan_remove,
+	.driver = {
+		.name = "acpi-fan",
+		.acpi_match_table = fan_device_ids,
+		.pm = FAN_PM_OPS_PTR,
+	},
+};
+
 module_platform_driver(acpi_fan_driver);
+
+MODULE_AUTHOR("Paul Diefenbaugh");
+MODULE_DESCRIPTION("ACPI Fan Driver");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 382c5fec89f3b0ce870a17f028c547a9f95b7834 Mon Sep 17 00:00:00 2001
From: ye xingchen <ye.xingchen@zte.com.cn>
Date: Wed, 21 Sep 2022 09:28:34 +0000
Subject: ACPI: APEI: Remove unneeded result variables

Return the erst_get_record_id_begin() and apei_exec_write_register()
return values directly instead of storing them in redundant local
variables.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: ye xingchen <ye.xingchen@zte.com.cn>
[ rjw: Changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/apei/apei-base.c | 5 +----
 drivers/acpi/apei/erst.c      | 6 +-----
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c
index 9f49272cad39..9b52482b4ed5 100644
--- a/drivers/acpi/apei/apei-base.c
+++ b/drivers/acpi/apei/apei-base.c
@@ -125,12 +125,9 @@ EXPORT_SYMBOL_GPL(apei_exec_write_register);
 int apei_exec_write_register_value(struct apei_exec_context *ctx,
 				   struct acpi_whea_header *entry)
 {
-	int rc;
-
 	ctx->value = entry->value;
-	rc = apei_exec_write_register(ctx, entry);
 
-	return rc;
+	return apei_exec_write_register(ctx, entry);
 }
 EXPORT_SYMBOL_GPL(apei_exec_write_register_value);
 
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 31b077eedb58..247989060e29 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -1020,14 +1020,10 @@ static int reader_pos;
 
 static int erst_open_pstore(struct pstore_info *psi)
 {
-	int rc;
-
 	if (erst_disable)
 		return -ENODEV;
 
-	rc = erst_get_record_id_begin(&reader_pos);
-
-	return rc;
+	return erst_get_record_id_begin(&reader_pos);
 }
 
 static int erst_close_pstore(struct pstore_info *psi)
-- 
cgit v1.2.3


From 53e7380c9338af588ddbadb4ac9d878351770de4 Mon Sep 17 00:00:00 2001
From: Vijayenthiran Subramaniam <vijayenthiran.subramaniam@arm.com>
Date: Wed, 21 Sep 2022 20:02:44 +0530
Subject: ACPI: AMBA: Add ARM DMA-330 controller to the supported list

Add ACPI ID for ARM DMA-330 controller to AMBA id supported list to
allow the probing of the device.

Signed-off-by: Vijayenthiran Subramaniam <vijayenthiran.subramaniam@arm.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_amba.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/acpi/acpi_amba.c b/drivers/acpi/acpi_amba.c
index ab8a4e0191b1..b89120196af1 100644
--- a/drivers/acpi/acpi_amba.c
+++ b/drivers/acpi/acpi_amba.c
@@ -21,6 +21,7 @@
 
 static const struct acpi_device_id amba_id_list[] = {
 	{"ARMH0061", 0}, /* PL061 GPIO Device */
+	{"ARMH0330", 0}, /* ARM DMA Controller DMA-330 */
 	{"ARMHC500", 0}, /* ARM CoreSight ETM4x */
 	{"ARMHC501", 0}, /* ARM CoreSight ETR */
 	{"ARMHC502", 0}, /* ARM CoreSight STM */
-- 
cgit v1.2.3


From 6e5cbe7c4b41824e500acbb42411da692d1435f1 Mon Sep 17 00:00:00 2001
From: Kellen Renshaw <kellen.renshaw@canonical.com>
Date: Wed, 21 Sep 2022 10:24:22 -0700
Subject: ACPI: resource: Add ASUS model S5402ZA to quirks

The Asus Vivobook S5402ZA has the same keyboard issue as Asus Vivobook
K3402ZA/K3502ZA. The kernel overrides IRQ 1 to Edge_High when it
should be Active_Low.

This patch adds the S5402ZA model to the quirk list.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216158
Tested-by: Kellen Renshaw <kellen.renshaw@canonical.com>
Signed-off-by: Kellen Renshaw <kellen.renshaw@canonical.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/resource.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index e40b2153911d..514d89656dde 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -414,6 +414,13 @@ static const struct dmi_system_id asus_laptop[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "K3502ZA"),
 		},
 	},
+	{
+		.ident = "Asus Vivobook S5402ZA",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_BOARD_NAME, "S5402ZA"),
+		},
+	},
 	{ }
 };
 
-- 
cgit v1.2.3


From cca8a7efea6402f463239a1bb337d01b0cad7b2e Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 22 Sep 2022 00:04:35 +0100
Subject: ACPI: scan: Add acpi_dev_get_next_consumer_dev()

In commit b83e2b306736 ("ACPI: scan: Add function to fetch dependent
of ACPI device") we added a means of fetching the first device to
declare itself dependent on another ACPI device in the _DEP method.
One assumption in that patch was that there would only be a single
consuming device, but this has not held.

Replace that function with a new function that fetches the next consumer
of a supplier device. Where no "previous" consumer is passed in, it
behaves identically to the original function.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/scan.c                         | 40 +++++++++++++++++++++--------
 drivers/platform/x86/intel/int3472/common.c |  2 +-
 include/acpi/acpi_bus.h                     |  4 ++-
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index f6b9ea937de1..d0277427e7ee 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -2235,9 +2235,22 @@ ok:
 	return 0;
 }
 
-static int acpi_dev_get_first_consumer_dev_cb(struct acpi_dep_data *dep, void *data)
+static int acpi_dev_get_next_consumer_dev_cb(struct acpi_dep_data *dep, void *data)
 {
-	struct acpi_device *adev;
+	struct acpi_device **adev_p = data;
+	struct acpi_device *adev = *adev_p;
+
+	/*
+	 * If we're passed a 'previous' consumer device then we need to skip
+	 * any consumers until we meet the previous one, and then NULL @data
+	 * so the next one can be returned.
+	 */
+	if (adev) {
+		if (dep->consumer == adev->handle)
+			*adev_p = NULL;
+
+		return 0;
+	}
 
 	adev = acpi_get_acpi_dev(dep->consumer);
 	if (adev) {
@@ -2368,25 +2381,32 @@ bool acpi_dev_ready_for_enumeration(const struct acpi_device *device)
 EXPORT_SYMBOL_GPL(acpi_dev_ready_for_enumeration);
 
 /**
- * acpi_dev_get_first_consumer_dev - Return ACPI device dependent on @supplier
+ * acpi_dev_get_next_consumer_dev - Return the next adev dependent on @supplier
  * @supplier: Pointer to the dependee device
+ * @start: Pointer to the current dependent device
  *
- * Returns the first &struct acpi_device which declares itself dependent on
+ * Returns the next &struct acpi_device which declares itself dependent on
  * @supplier via the _DEP buffer, parsed from the acpi_dep_list.
  *
- * The caller is responsible for putting the reference to adev when it is no
- * longer needed.
+ * If the returned adev is not passed as @start to this function, the caller is
+ * responsible for putting the reference to adev when it is no longer needed.
  */
-struct acpi_device *acpi_dev_get_first_consumer_dev(struct acpi_device *supplier)
+struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier,
+						   struct acpi_device *start)
 {
-	struct acpi_device *adev = NULL;
+	struct acpi_device *adev = start;
 
 	acpi_walk_dep_device_list(supplier->handle,
-				  acpi_dev_get_first_consumer_dev_cb, &adev);
+				  acpi_dev_get_next_consumer_dev_cb, &adev);
+
+	acpi_dev_put(start);
+
+	if (adev == start)
+		return NULL;
 
 	return adev;
 }
-EXPORT_SYMBOL_GPL(acpi_dev_get_first_consumer_dev);
+EXPORT_SYMBOL_GPL(acpi_dev_get_next_consumer_dev);
 
 /**
  * acpi_bus_scan - Add ACPI device node objects in a given namespace scope.
diff --git a/drivers/platform/x86/intel/int3472/common.c b/drivers/platform/x86/intel/int3472/common.c
index 77cf058e4168..9db2bb0bbba4 100644
--- a/drivers/platform/x86/intel/int3472/common.c
+++ b/drivers/platform/x86/intel/int3472/common.c
@@ -62,7 +62,7 @@ int skl_int3472_get_sensor_adev_and_name(struct device *dev,
 	struct acpi_device *sensor;
 	int ret = 0;
 
-	sensor = acpi_dev_get_first_consumer_dev(adev);
+	sensor = acpi_dev_get_next_consumer_dev(adev, NULL);
 	if (!sensor) {
 		dev_err(dev, "INT3472 seems to have no dependents.\n");
 		return -ENODEV;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 42f76f2c2d49..c66d17c3298b 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -742,7 +742,9 @@ bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const ch
 
 void acpi_dev_clear_dependencies(struct acpi_device *supplier);
 bool acpi_dev_ready_for_enumeration(const struct acpi_device *device);
-struct acpi_device *acpi_dev_get_first_consumer_dev(struct acpi_device *supplier);
+struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier,
+						   struct acpi_device *start);
+
 struct acpi_device *
 acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv);
 struct acpi_device *
-- 
cgit v1.2.3


From 62c8bc0d27a784b09114d67c33e8fa83f350fa77 Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 22 Sep 2022 00:04:36 +0100
Subject: ACPI: bus: Add iterator for dependent devices

Add a helper macro to iterate over ACPI devices that are flagged
as consumers of an initial supplier ACPI device.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpi_bus.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index c66d17c3298b..f016c155b462 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -745,6 +745,17 @@ bool acpi_dev_ready_for_enumeration(const struct acpi_device *device);
 struct acpi_device *acpi_dev_get_next_consumer_dev(struct acpi_device *supplier,
 						   struct acpi_device *start);
 
+/**
+ * for_each_acpi_consumer_dev - iterate over the consumer ACPI devices for a
+ *				given supplier
+ * @supplier: Pointer to the supplier's ACPI device
+ * @consumer: Pointer to &struct acpi_device to hold the consumer, initially NULL
+ */
+#define for_each_acpi_consumer_dev(supplier, consumer)			\
+	for (consumer = acpi_dev_get_next_consumer_dev(supplier, NULL);	\
+	     consumer;							\
+	     consumer = acpi_dev_get_next_consumer_dev(supplier, consumer))
+
 struct acpi_device *
 acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const char *uid, s64 hrv);
 struct acpi_device *
-- 
cgit v1.2.3


From 43cf36974d760a3d1c705a83de89ac58059e5f0b Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 22 Sep 2022 00:04:37 +0100
Subject: platform/x86: int3472: Support multiple clock consumers

At present, the tps68470.c only supports a single clock consumer when
passing platform data to the clock driver. In some devices multiple
sensors depend on the clock provided by a single TPS68470 and so all
need to be able to acquire the clock. Support passing multiple
consumers as platform data.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Acked-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/clk/clk-tps68470.c                    | 13 ++++--
 drivers/platform/x86/intel/int3472/tps68470.c | 59 +++++++++++++++++++++++----
 include/linux/platform_data/tps68470.h        |  7 +++-
 3 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/drivers/clk/clk-tps68470.c b/drivers/clk/clk-tps68470.c
index e5fbefd6ac2d..38f44b5b9b1b 100644
--- a/drivers/clk/clk-tps68470.c
+++ b/drivers/clk/clk-tps68470.c
@@ -200,7 +200,9 @@ static int tps68470_clk_probe(struct platform_device *pdev)
 		.flags = CLK_SET_RATE_GATE,
 	};
 	struct tps68470_clkdata *tps68470_clkdata;
+	struct tps68470_clk_consumer *consumer;
 	int ret;
+	int i;
 
 	tps68470_clkdata = devm_kzalloc(&pdev->dev, sizeof(*tps68470_clkdata),
 					GFP_KERNEL);
@@ -223,10 +225,13 @@ static int tps68470_clk_probe(struct platform_device *pdev)
 		return ret;
 
 	if (pdata) {
-		ret = devm_clk_hw_register_clkdev(&pdev->dev,
-						  &tps68470_clkdata->clkout_hw,
-						  pdata->consumer_con_id,
-						  pdata->consumer_dev_name);
+		for (i = 0; i < pdata->n_consumers; i++) {
+			consumer = &pdata->consumers[i];
+			ret = devm_clk_hw_register_clkdev(&pdev->dev,
+							  &tps68470_clkdata->clkout_hw,
+							  consumer->consumer_con_id,
+							  consumer->consumer_dev_name);
+		}
 	}
 
 	return ret;
diff --git a/drivers/platform/x86/intel/int3472/tps68470.c b/drivers/platform/x86/intel/int3472/tps68470.c
index 22f61b47f9e5..8a684030933d 100644
--- a/drivers/platform/x86/intel/int3472/tps68470.c
+++ b/drivers/platform/x86/intel/int3472/tps68470.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Author: Dan Scally <djrscally@gmail.com> */
 
+#include <linux/acpi.h>
 #include <linux/i2c.h>
 #include <linux/kernel.h>
 #include <linux/mfd/core.h>
@@ -95,20 +96,64 @@ static int skl_int3472_tps68470_calc_type(struct acpi_device *adev)
 	return DESIGNED_FOR_WINDOWS;
 }
 
+/*
+ * Return the size of the flexible array member, because we'll need that later
+ * on to pass .pdata_size to cells.
+ */
+static int
+skl_int3472_fill_clk_pdata(struct device *dev, struct tps68470_clk_platform_data **clk_pdata)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	struct acpi_device *consumer;
+	unsigned int n_consumers = 0;
+	const char *sensor_name;
+	unsigned int i = 0;
+
+	for_each_acpi_consumer_dev(adev, consumer)
+		n_consumers++;
+
+	if (!n_consumers) {
+		dev_err(dev, "INT3472 seems to have no dependents\n");
+		return -ENODEV;
+	}
+
+	*clk_pdata = devm_kzalloc(dev, struct_size(*clk_pdata, consumers, n_consumers),
+				  GFP_KERNEL);
+	if (!*clk_pdata)
+		return -ENOMEM;
+
+	(*clk_pdata)->n_consumers = n_consumers;
+	i = 0;
+
+	for_each_acpi_consumer_dev(adev, consumer) {
+		sensor_name = devm_kasprintf(dev, GFP_KERNEL, I2C_DEV_NAME_FORMAT,
+					     acpi_dev_name(consumer));
+		if (!sensor_name)
+			return -ENOMEM;
+
+		(*clk_pdata)->consumers[i].consumer_dev_name = sensor_name;
+		i++;
+	}
+
+	acpi_dev_put(consumer);
+
+	return n_consumers;
+}
+
 static int skl_int3472_tps68470_probe(struct i2c_client *client)
 {
 	struct acpi_device *adev = ACPI_COMPANION(&client->dev);
 	const struct int3472_tps68470_board_data *board_data;
-	struct tps68470_clk_platform_data clk_pdata = {};
+	struct tps68470_clk_platform_data *clk_pdata;
 	struct mfd_cell *cells;
 	struct regmap *regmap;
+	int n_consumers;
 	int device_type;
 	int ret;
 
-	ret = skl_int3472_get_sensor_adev_and_name(&client->dev, NULL,
-						   &clk_pdata.consumer_dev_name);
-	if (ret)
-		return ret;
+	n_consumers = skl_int3472_fill_clk_pdata(&client->dev, &clk_pdata);
+	if (n_consumers < 0)
+		return n_consumers;
 
 	regmap = devm_regmap_init_i2c(client, &tps68470_regmap_config);
 	if (IS_ERR(regmap)) {
@@ -142,8 +187,8 @@ static int skl_int3472_tps68470_probe(struct i2c_client *client)
 		 * the clk + regulators must be ready when this happens.
 		 */
 		cells[0].name = "tps68470-clk";
-		cells[0].platform_data = &clk_pdata;
-		cells[0].pdata_size = sizeof(clk_pdata);
+		cells[0].platform_data = clk_pdata;
+		cells[0].pdata_size = struct_size(clk_pdata, consumers, n_consumers);
 		cells[1].name = "tps68470-regulator";
 		cells[1].platform_data = (void *)board_data->tps68470_regulator_pdata;
 		cells[1].pdata_size = sizeof(struct tps68470_regulator_platform_data);
diff --git a/include/linux/platform_data/tps68470.h b/include/linux/platform_data/tps68470.h
index 126d082c3f2e..e605a2cab07f 100644
--- a/include/linux/platform_data/tps68470.h
+++ b/include/linux/platform_data/tps68470.h
@@ -27,9 +27,14 @@ struct tps68470_regulator_platform_data {
 	const struct regulator_init_data *reg_init_data[TPS68470_NUM_REGULATORS];
 };
 
-struct tps68470_clk_platform_data {
+struct tps68470_clk_consumer {
 	const char *consumer_dev_name;
 	const char *consumer_con_id;
 };
 
+struct tps68470_clk_platform_data {
+	unsigned int n_consumers;
+	struct tps68470_clk_consumer consumers[];
+};
+
 #endif
-- 
cgit v1.2.3


From 06a659d1f0a0ae3d104155655397e593296a65fa Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 22 Sep 2022 00:04:38 +0100
Subject: platform/x86: int3472: Support multiple gpio lookups in board data

Currently, we only support passing a single gpiod_lookup_table as part
of the board data for the tps68470 driver. This carries the implicit
assumption that each TPS68470 device will only support a single
sensor, which does not hold true.

Extend the code to support the possibility of multiple sensors each
having a gpiod_lookup_table, and opportunistically add the lookup
table for the Surface Go line's IR camera.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/platform/x86/intel/int3472/tps68470.c       | 17 ++++++++++++-----
 drivers/platform/x86/intel/int3472/tps68470.h       |  3 ++-
 .../x86/intel/int3472/tps68470_board_data.c         | 21 ++++++++++++++++++---
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/drivers/platform/x86/intel/int3472/tps68470.c b/drivers/platform/x86/intel/int3472/tps68470.c
index 8a684030933d..49fc379fe680 100644
--- a/drivers/platform/x86/intel/int3472/tps68470.c
+++ b/drivers/platform/x86/intel/int3472/tps68470.c
@@ -150,6 +150,7 @@ static int skl_int3472_tps68470_probe(struct i2c_client *client)
 	int n_consumers;
 	int device_type;
 	int ret;
+	int i;
 
 	n_consumers = skl_int3472_fill_clk_pdata(&client->dev, &clk_pdata);
 	if (n_consumers < 0)
@@ -194,15 +195,18 @@ static int skl_int3472_tps68470_probe(struct i2c_client *client)
 		cells[1].pdata_size = sizeof(struct tps68470_regulator_platform_data);
 		cells[2].name = "tps68470-gpio";
 
-		gpiod_add_lookup_table(board_data->tps68470_gpio_lookup_table);
+		for (i = 0; i < board_data->n_gpiod_lookups; i++)
+			gpiod_add_lookup_table(board_data->tps68470_gpio_lookup_tables[i]);
 
 		ret = devm_mfd_add_devices(&client->dev, PLATFORM_DEVID_NONE,
 					   cells, TPS68470_WIN_MFD_CELL_COUNT,
 					   NULL, 0, NULL);
 		kfree(cells);
 
-		if (ret)
-			gpiod_remove_lookup_table(board_data->tps68470_gpio_lookup_table);
+		if (ret) {
+			for (i = 0; i < board_data->n_gpiod_lookups; i++)
+				gpiod_remove_lookup_table(board_data->tps68470_gpio_lookup_tables[i]);
+		}
 
 		break;
 	case DESIGNED_FOR_CHROMEOS:
@@ -226,10 +230,13 @@ static int skl_int3472_tps68470_probe(struct i2c_client *client)
 static int skl_int3472_tps68470_remove(struct i2c_client *client)
 {
 	const struct int3472_tps68470_board_data *board_data;
+	int i;
 
 	board_data = int3472_tps68470_get_board_data(dev_name(&client->dev));
-	if (board_data)
-		gpiod_remove_lookup_table(board_data->tps68470_gpio_lookup_table);
+	if (board_data) {
+		for (i = 0; i < board_data->n_gpiod_lookups; i++)
+			gpiod_remove_lookup_table(board_data->tps68470_gpio_lookup_tables[i]);
+	}
 
 	return 0;
 }
diff --git a/drivers/platform/x86/intel/int3472/tps68470.h b/drivers/platform/x86/intel/int3472/tps68470.h
index cfd33eb62740..35915e701593 100644
--- a/drivers/platform/x86/intel/int3472/tps68470.h
+++ b/drivers/platform/x86/intel/int3472/tps68470.h
@@ -16,8 +16,9 @@ struct tps68470_regulator_platform_data;
 
 struct int3472_tps68470_board_data {
 	const char *dev_name;
-	struct gpiod_lookup_table *tps68470_gpio_lookup_table;
 	const struct tps68470_regulator_platform_data *tps68470_regulator_pdata;
+	unsigned int n_gpiod_lookups;
+	struct gpiod_lookup_table *tps68470_gpio_lookup_tables[];
 };
 
 const struct int3472_tps68470_board_data *int3472_tps68470_get_board_data(const char *dev_name);
diff --git a/drivers/platform/x86/intel/int3472/tps68470_board_data.c b/drivers/platform/x86/intel/int3472/tps68470_board_data.c
index 525f09a3b5ff..e6cc8f40f5af 100644
--- a/drivers/platform/x86/intel/int3472/tps68470_board_data.c
+++ b/drivers/platform/x86/intel/int3472/tps68470_board_data.c
@@ -96,7 +96,7 @@ static const struct tps68470_regulator_platform_data surface_go_tps68470_pdata =
 	},
 };
 
-static struct gpiod_lookup_table surface_go_tps68470_gpios = {
+static struct gpiod_lookup_table surface_go_int347a_gpios = {
 	.dev_id = "i2c-INT347A:00",
 	.table = {
 		GPIO_LOOKUP("tps68470-gpio", 9, "reset", GPIO_ACTIVE_LOW),
@@ -105,16 +105,31 @@ static struct gpiod_lookup_table surface_go_tps68470_gpios = {
 	}
 };
 
+static struct gpiod_lookup_table surface_go_int347e_gpios = {
+	.dev_id = "i2c-INT347E:00",
+	.table = {
+		GPIO_LOOKUP("tps68470-gpio", 5, "enable", GPIO_ACTIVE_HIGH),
+		{ }
+	}
+};
+
 static const struct int3472_tps68470_board_data surface_go_tps68470_board_data = {
 	.dev_name = "i2c-INT3472:05",
-	.tps68470_gpio_lookup_table = &surface_go_tps68470_gpios,
 	.tps68470_regulator_pdata = &surface_go_tps68470_pdata,
+	.n_gpiod_lookups = 2,
+	.tps68470_gpio_lookup_tables = {
+		&surface_go_int347a_gpios,
+		&surface_go_int347e_gpios,
+	},
 };
 
 static const struct int3472_tps68470_board_data surface_go3_tps68470_board_data = {
 	.dev_name = "i2c-INT3472:01",
-	.tps68470_gpio_lookup_table = &surface_go_tps68470_gpios,
 	.tps68470_regulator_pdata = &surface_go_tps68470_pdata,
+	.n_gpiod_lookups = 1,
+	.tps68470_gpio_lookup_tables = {
+		&surface_go_int347a_gpios
+	},
 };
 
 static const struct dmi_system_id int3472_tps68470_board_data_table[] = {
-- 
cgit v1.2.3


From 2a5a191c67ba4513f0a6ea459218129c23734ce2 Mon Sep 17 00:00:00 2001
From: Daniel Scally <djrscally@gmail.com>
Date: Thu, 22 Sep 2022 00:04:39 +0100
Subject: platform/x86: int3472: Add board data for Surface Go2 IR camera

Add the board data describing the regulators for the Microsoft
Surface Go line's IR camera.

Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../x86/intel/int3472/tps68470_board_data.c        | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/platform/x86/intel/int3472/tps68470_board_data.c b/drivers/platform/x86/intel/int3472/tps68470_board_data.c
index e6cc8f40f5af..309eab9c0558 100644
--- a/drivers/platform/x86/intel/int3472/tps68470_board_data.c
+++ b/drivers/platform/x86/intel/int3472/tps68470_board_data.c
@@ -30,6 +30,15 @@ static struct regulator_consumer_supply int347a_vcm_consumer_supplies[] = {
 static struct regulator_consumer_supply int347a_vsio_consumer_supplies[] = {
 	REGULATOR_SUPPLY("dovdd", "i2c-INT347A:00"),
 	REGULATOR_SUPPLY("vsio", "i2c-INT347A:00-VCM"),
+	REGULATOR_SUPPLY("vddd", "i2c-INT347E:00"),
+};
+
+static struct regulator_consumer_supply int347a_aux1_consumer_supplies[] = {
+	REGULATOR_SUPPLY("vdda", "i2c-INT347E:00"),
+};
+
+static struct regulator_consumer_supply int347a_aux2_consumer_supplies[] = {
+	REGULATOR_SUPPLY("vdddo", "i2c-INT347E:00"),
 };
 
 static const struct regulator_init_data surface_go_tps68470_core_reg_init_data = {
@@ -86,6 +95,28 @@ static const struct regulator_init_data surface_go_tps68470_vsio_reg_init_data =
 	.consumer_supplies = int347a_vsio_consumer_supplies,
 };
 
+static const struct regulator_init_data surface_go_tps68470_aux1_reg_init_data = {
+	.constraints = {
+		.min_uV = 2815200,
+		.max_uV = 2815200,
+		.apply_uV = 1,
+		.valid_ops_mask = REGULATOR_CHANGE_STATUS,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(int347a_aux1_consumer_supplies),
+	.consumer_supplies = int347a_aux1_consumer_supplies,
+};
+
+static const struct regulator_init_data surface_go_tps68470_aux2_reg_init_data = {
+	.constraints = {
+		.min_uV = 1800600,
+		.max_uV = 1800600,
+		.apply_uV = 1,
+		.valid_ops_mask = REGULATOR_CHANGE_STATUS,
+	},
+	.num_consumer_supplies = ARRAY_SIZE(int347a_aux2_consumer_supplies),
+	.consumer_supplies = int347a_aux2_consumer_supplies,
+};
+
 static const struct tps68470_regulator_platform_data surface_go_tps68470_pdata = {
 	.reg_init_data = {
 		[TPS68470_CORE] = &surface_go_tps68470_core_reg_init_data,
@@ -93,6 +124,8 @@ static const struct tps68470_regulator_platform_data surface_go_tps68470_pdata =
 		[TPS68470_VCM]  = &surface_go_tps68470_vcm_reg_init_data,
 		[TPS68470_VIO] = &surface_go_tps68470_vio_reg_init_data,
 		[TPS68470_VSIO] = &surface_go_tps68470_vsio_reg_init_data,
+		[TPS68470_AUX1] = &surface_go_tps68470_aux1_reg_init_data,
+		[TPS68470_AUX2] = &surface_go_tps68470_aux2_reg_init_data,
 	},
 };
 
-- 
cgit v1.2.3


From 53e41b85939547cda40a7b9748c5a1f9b1f1a865 Mon Sep 17 00:00:00 2001
From: Shang XiaoJing <shangxiaojing@huawei.com>
Date: Thu, 22 Sep 2022 20:31:41 +0800
Subject: thermal: int340x: processor_thermal: Use module_pci_driver() macro

Since PCI provides helper macro module_pci_driver(), the
module_init/exit code can be replaced with it.

Signed-off-by: Shang XiaoJing <shangxiaojing@huawei.com>
[ rjw: Subject and changelog edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../intel/int340x_thermal/processor_thermal_device_pci.c    | 13 +------------
 .../int340x_thermal/processor_thermal_device_pci_legacy.c   | 13 +------------
 2 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c
index c2dc4c158b9d..bf1b1cdfade4 100644
--- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c
+++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c
@@ -373,18 +373,7 @@ static struct pci_driver proc_thermal_pci_driver = {
 	.driver.pm	= &proc_thermal_pci_pm,
 };
 
-static int __init proc_thermal_init(void)
-{
-	return pci_register_driver(&proc_thermal_pci_driver);
-}
-
-static void __exit proc_thermal_exit(void)
-{
-	pci_unregister_driver(&proc_thermal_pci_driver);
-}
-
-module_init(proc_thermal_init);
-module_exit(proc_thermal_exit);
+module_pci_driver(proc_thermal_pci_driver);
 
 MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
 MODULE_DESCRIPTION("Processor Thermal Reporting Device Driver");
diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci_legacy.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci_legacy.c
index 4571a1a53b84..09e032f822f3 100644
--- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci_legacy.c
+++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci_legacy.c
@@ -151,18 +151,7 @@ static struct pci_driver proc_thermal_pci_driver = {
 	.driver.pm	= &proc_thermal_pci_pm,
 };
 
-static int __init proc_thermal_init(void)
-{
-	return pci_register_driver(&proc_thermal_pci_driver);
-}
-
-static void __exit proc_thermal_exit(void)
-{
-	pci_unregister_driver(&proc_thermal_pci_driver);
-}
-
-module_init(proc_thermal_init);
-module_exit(proc_thermal_exit);
+module_pci_driver(proc_thermal_pci_driver);
 
 MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
 MODULE_DESCRIPTION("Processor Thermal Reporting Device Driver");
-- 
cgit v1.2.3


From e66332a4bc89e3219baaabc7bb549d17a2c42ce9 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 22 Sep 2022 20:04:40 +0200
Subject: PM: runtime: Return -EINPROGRESS from rpm_resume() in the RPM_NOWAIT
 case

The prospective callers of rpm_resume() passing RPM_NOWAIT to it may
be confused when it returns 0 without actually resuming the device
which may happen if the device is suspending at the given time and it
will only resume when the suspend in progress has completed.  To avoid
that confusion, return -EINPROGRESS from rpm_resume() in that case.

Since none of the current callers passing RPM_NOWAIT to rpm_resume()
check its return value, this change has no functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/base/power/runtime.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 997be3ac20a7..b52049098d4e 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -792,10 +792,13 @@ static int rpm_resume(struct device *dev, int rpmflags)
 		DEFINE_WAIT(wait);
 
 		if (rpmflags & (RPM_ASYNC | RPM_NOWAIT)) {
-			if (dev->power.runtime_status == RPM_SUSPENDING)
+			if (dev->power.runtime_status == RPM_SUSPENDING) {
 				dev->power.deferred_resume = true;
-			else
+				if (rpmflags & RPM_NOWAIT)
+					retval = -EINPROGRESS;
+			} else {
 				retval = -EINPROGRESS;
+			}
 			goto out;
 		}
 
-- 
cgit v1.2.3


From 4c081324df5608b73428662ca54d5221ea03a6bd Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Sat, 24 Sep 2022 13:47:36 +0800
Subject: powercap: intel_rapl: Use standard Energy Unit for SPR Dram RAPL
 domain

Intel Xeon servers used to use a fixed energy resolution (15.3uj) for
Dram RAPL domain. But on SPR, Dram RAPL domain follows the standard
energy resolution as described in MSR_RAPL_POWER_UNIT.

Remove the SPR dram_domain_energy_unit quirk.

Fixes: 2d798d9f5967 ("powercap: intel_rapl: add support for Sapphire Rapids")
Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Tested-by: Wang Wendy <wendy.wang@intel.com>
Cc: 5.9+ <stable@vger.kernel.org> # 5.9+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index 4688d0647b22..26d00b1853b4 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1038,7 +1038,6 @@ static const struct rapl_defaults rapl_defaults_spr_server = {
 	.check_unit = rapl_check_unit_core,
 	.set_floor_freq = set_floor_freq_default,
 	.compute_time_window = rapl_compute_time_window_core,
-	.dram_domain_energy_unit = 15300,
 	.psys_domain_energy_unit = 1000000000,
 	.spr_psys_bits = true,
 };
-- 
cgit v1.2.3


From 100a5737938049706987d06a4ab73e82b8277085 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:48 -0500
Subject: ACPI: x86: s2idle: Move _HID handling for AMD systems into structures

Right now the information about which cases to use for what are in a
comment, but this is error prone.  Instead move all information into
a dedicated structure.

Tested-by: catalin@antebit.com
Reviewed-by: Philipp Zabel <philipp.zabel@gmail.com>
Tested-by: Philipp Zabel <philipp.zabel@gmail.com> # GA402RJ
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 63 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index f9ac12b778e6..28a3ef9a6bc1 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -363,6 +363,39 @@ out:
 	return ret;
 }
 
+struct amd_lps0_hid_device_data {
+	const unsigned int rev_id;
+	const bool check_off_by_one;
+	const bool prefer_amd_guid;
+};
+
+static const struct amd_lps0_hid_device_data amd_picasso = {
+	.rev_id = 0,
+	.check_off_by_one = true,
+	.prefer_amd_guid = false,
+};
+
+static const struct amd_lps0_hid_device_data amd_cezanne = {
+	.rev_id = 0,
+	.check_off_by_one = false,
+	.prefer_amd_guid = false,
+};
+
+static const struct amd_lps0_hid_device_data amd_rembrandt = {
+	.rev_id = 2,
+	.check_off_by_one = false,
+	.prefer_amd_guid = true,
+};
+
+static const struct acpi_device_id amd_hid_ids[] = {
+	{"AMD0004",	(kernel_ulong_t)&amd_picasso,	},
+	{"AMD0005",	(kernel_ulong_t)&amd_picasso,	},
+	{"AMDI0005",	(kernel_ulong_t)&amd_picasso,	},
+	{"AMDI0006",	(kernel_ulong_t)&amd_cezanne,	},
+	{"AMDI0007",	(kernel_ulong_t)&amd_rembrandt,	},
+	{}
+};
+
 static int lps0_device_attach(struct acpi_device *adev,
 			      const struct acpi_device_id *not_used)
 {
@@ -370,31 +403,27 @@ static int lps0_device_attach(struct acpi_device *adev,
 		return 0;
 
 	if (acpi_s2idle_vendor_amd()) {
-		/* AMD0004, AMD0005, AMDI0005:
-		 * - Should use rev_id 0x0
-		 * - function mask > 0x3: Should use AMD method, but has off by one bug
-		 * - function mask = 0x3: Should use Microsoft method
-		 * AMDI0006:
-		 * - should use rev_id 0x0
-		 * - function mask = 0x3: Should use Microsoft method
-		 * AMDI0007:
-		 * - Should use rev_id 0x2
-		 * - Should only use AMD method
-		 */
-		const char *hid = acpi_device_hid(adev);
-		rev_id = strcmp(hid, "AMDI0007") ? 0 : 2;
+		static const struct acpi_device_id *dev_id;
+		const struct amd_lps0_hid_device_data *data;
+
+		for (dev_id = &amd_hid_ids[0]; dev_id->id[0]; dev_id++)
+			if (acpi_dev_hid_uid_match(adev, dev_id->id, NULL))
+				break;
+		if (dev_id)
+			data = (const struct amd_lps0_hid_device_data *) dev_id->driver_data;
+		else
+			return 0;
+		rev_id = data->rev_id;
 		lps0_dsm_func_mask = validate_dsm(adev->handle,
 					ACPI_LPS0_DSM_UUID_AMD, rev_id, &lps0_dsm_guid);
 		lps0_dsm_func_mask_microsoft = validate_dsm(adev->handle,
 					ACPI_LPS0_DSM_UUID_MICROSOFT, 0,
 					&lps0_dsm_guid_microsoft);
-		if (lps0_dsm_func_mask > 0x3 && (!strcmp(hid, "AMD0004") ||
-						 !strcmp(hid, "AMD0005") ||
-						 !strcmp(hid, "AMDI0005"))) {
+		if (lps0_dsm_func_mask > 0x3 && data->check_off_by_one) {
 			lps0_dsm_func_mask = (lps0_dsm_func_mask << 1) | 0x1;
 			acpi_handle_debug(adev->handle, "_DSM UUID %s: Adjusted function mask: 0x%x\n",
 					  ACPI_LPS0_DSM_UUID_AMD, lps0_dsm_func_mask);
-		} else if (lps0_dsm_func_mask_microsoft > 0 &&
+		} else if (lps0_dsm_func_mask_microsoft > 0 && data->prefer_amd_guid &&
 				(!strcmp(hid, "AMDI0007") ||
 				 !strcmp(hid, "AMDI0008"))) {
 			lps0_dsm_func_mask_microsoft = -EINVAL;
-- 
cgit v1.2.3


From fd894f05cf30956296384b0c4a81892ed21ac1d9 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:49 -0500
Subject: ACPI: x86: s2idle: If a new AMD _HID is missing assume Rembrandt

A mistake was made that only AMDI0007 was set to rev of "2", but
it should have been also set for AMDI008. If an ID is missing from
the _HID table, then assume it matches Rembrandt behavior.

This implicitly means that if any other behavior changes happen
in the future missing IDs must be added to that table.

Tested-by: catalin@antebit.com
Reviewed-by: Philipp Zabel <philipp.zabel@gmail.com>
Tested-by: Philipp Zabel <philipp.zabel@gmail.com> # GA402RJ
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 28a3ef9a6bc1..2c5931d247a2 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -412,7 +412,7 @@ static int lps0_device_attach(struct acpi_device *adev,
 		if (dev_id)
 			data = (const struct amd_lps0_hid_device_data *) dev_id->driver_data;
 		else
-			return 0;
+			data = &amd_rembrandt;
 		rev_id = data->rev_id;
 		lps0_dsm_func_mask = validate_dsm(adev->handle,
 					ACPI_LPS0_DSM_UUID_AMD, rev_id, &lps0_dsm_guid);
-- 
cgit v1.2.3


From a0bc002393d42ab7d895b0e82e730ce7622bdbff Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:50 -0500
Subject: ACPI: x86: s2idle: Add module parameter to prefer Microsoft GUID

OEMs have made some mistakes in the past for the AMD GUID support
and not populated the method properly.  To add an escape hatch for
this problem introduce a module parameter that can force using
the Microsoft GUID.

This is intentionally introduced to both Intel and AMD codepaths
to allow using the parameter as a debugging tactic on either.

Reviewed-by: Philipp Zabel <philipp.zabel@gmail.com>
Tested-by: Philipp Zabel <philipp.zabel@gmail.com> # GA402RJ
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 2c5931d247a2..725d2a6c87ed 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -27,6 +27,10 @@ static bool sleep_no_lps0 __read_mostly;
 module_param(sleep_no_lps0, bool, 0644);
 MODULE_PARM_DESC(sleep_no_lps0, "Do not use the special LPS0 device interface");
 
+static bool prefer_microsoft_dsm_guid __read_mostly;
+module_param(prefer_microsoft_dsm_guid, bool, 0644);
+MODULE_PARM_DESC(prefer_microsoft_dsm_guid, "Prefer using Microsoft GUID in LPS0 device _DSM evaluation");
+
 static const struct acpi_device_id lps0_device_ids[] = {
 	{"PNP0D80", },
 	{"", },
@@ -402,6 +406,9 @@ static int lps0_device_attach(struct acpi_device *adev,
 	if (lps0_device_handle)
 		return 0;
 
+	lps0_dsm_func_mask_microsoft = validate_dsm(adev->handle,
+						    ACPI_LPS0_DSM_UUID_MICROSOFT, 0,
+						    &lps0_dsm_guid_microsoft);
 	if (acpi_s2idle_vendor_amd()) {
 		static const struct acpi_device_id *dev_id;
 		const struct amd_lps0_hid_device_data *data;
@@ -416,16 +423,12 @@ static int lps0_device_attach(struct acpi_device *adev,
 		rev_id = data->rev_id;
 		lps0_dsm_func_mask = validate_dsm(adev->handle,
 					ACPI_LPS0_DSM_UUID_AMD, rev_id, &lps0_dsm_guid);
-		lps0_dsm_func_mask_microsoft = validate_dsm(adev->handle,
-					ACPI_LPS0_DSM_UUID_MICROSOFT, 0,
-					&lps0_dsm_guid_microsoft);
 		if (lps0_dsm_func_mask > 0x3 && data->check_off_by_one) {
 			lps0_dsm_func_mask = (lps0_dsm_func_mask << 1) | 0x1;
 			acpi_handle_debug(adev->handle, "_DSM UUID %s: Adjusted function mask: 0x%x\n",
 					  ACPI_LPS0_DSM_UUID_AMD, lps0_dsm_func_mask);
 		} else if (lps0_dsm_func_mask_microsoft > 0 && data->prefer_amd_guid &&
-				(!strcmp(hid, "AMDI0007") ||
-				 !strcmp(hid, "AMDI0008"))) {
+				!prefer_microsoft_dsm_guid) {
 			lps0_dsm_func_mask_microsoft = -EINVAL;
 			acpi_handle_debug(adev->handle, "_DSM Using AMD method\n");
 		}
@@ -433,7 +436,8 @@ static int lps0_device_attach(struct acpi_device *adev,
 		rev_id = 1;
 		lps0_dsm_func_mask = validate_dsm(adev->handle,
 					ACPI_LPS0_DSM_UUID, rev_id, &lps0_dsm_guid);
-		lps0_dsm_func_mask_microsoft = -EINVAL;
+		if (!prefer_microsoft_dsm_guid)
+			lps0_dsm_func_mask_microsoft = -EINVAL;
 	}
 
 	if (lps0_dsm_func_mask < 0 && lps0_dsm_func_mask_microsoft < 0)
-- 
cgit v1.2.3


From d0f61e89f08dd46a090da50f5d747204673f70ea Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:51 -0500
Subject: ACPI: x86: s2idle: Add a quirk for ASUS TUF Gaming A17 FA707RE

ASUS TUF Gaming A17 FA707RE has problems with ACPI events after
s2idle resume.  It's from a missing call to an ASL method in AMD
the s2idle calling path. Force the system to use the Microsoft
Modern Standby calling path instead.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216101
Reported-and-tested-by: catalin@antebit.com
Reviewed-by: Philipp Zabel <philipp.zabel@gmail.com>
Tested-by: Philipp Zabel <philipp.zabel@gmail.com> # GA402RJ
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 725d2a6c87ed..99eae362de6d 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -17,6 +17,7 @@
 
 #include <linux/acpi.h>
 #include <linux/device.h>
+#include <linux/dmi.h>
 #include <linux/suspend.h>
 
 #include "../sleep.h"
@@ -400,6 +401,28 @@ static const struct acpi_device_id amd_hid_ids[] = {
 	{}
 };
 
+static int lps0_prefer_microsoft(const struct dmi_system_id *id)
+{
+	pr_debug("Preferring Microsoft GUID.\n");
+	prefer_microsoft_dsm_guid = true;
+	return 0;
+}
+
+static const struct dmi_system_id s2idle_dmi_table[] __initconst = {
+	{
+		/*
+		 * ASUS TUF Gaming A17 FA707RE
+		 * https://bugzilla.kernel.org/show_bug.cgi?id=216101
+		 */
+		.callback = lps0_prefer_microsoft,
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ASUS TUF Gaming A17"),
+		},
+	},
+	{}
+};
+
 static int lps0_device_attach(struct acpi_device *adev,
 			      const struct acpi_device_id *not_used)
 {
@@ -566,8 +589,9 @@ static const struct platform_s2idle_ops acpi_s2idle_ops_lps0 = {
 	.end = acpi_s2idle_end,
 };
 
-void acpi_s2idle_setup(void)
+void __init acpi_s2idle_setup(void)
 {
+	dmi_check_system(s2idle_dmi_table);
 	acpi_scan_add_handler(&lps0_handler);
 	s2idle_set_ops(&acpi_s2idle_ops_lps0);
 }
-- 
cgit v1.2.3


From ddeea2c3cb881adee0f979bfd61a90fb057ef3e6 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:52 -0500
Subject: ACPI: x86: s2idle: Add a quirk for ASUS ROG Zephyrus G14

ASUS ROG Zephyrus G14 is affected by the same BIOS bug as ASUS TUF
Gaming A17 where important ASL is not called in the AMD code path.
Use the Microsoft codepath instead.

Reported-and-suggested-by: Philipp Zabel <philipp.zabel@gmail.com>
Tested-by: Philipp Zabel <philipp.zabel@gmail.com>
Tested-by: Matthew Anderson <ruinairas1992@gmail.com>
Tested-by: Marko Cekrlic <marko.cekrlic.26@gmail.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 99eae362de6d..2cd381f6c002 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -420,6 +420,14 @@ static const struct dmi_system_id s2idle_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "ASUS TUF Gaming A17"),
 		},
 	},
+	{
+		/* ASUS ROG Zephyrus G14 (2022) */
+		.callback = lps0_prefer_microsoft,
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ROG Zephyrus G14 GA402"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3


From 888ca9c7955e3969df84f5a1bda2143be9fa365a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:53 -0500
Subject: ACPI: x86: s2idle: Add a quirk for Lenovo Slim 7 Pro 14ARH7

Lenovo Slim 7 Pro 14ARH7 has a sporadically non-functional keyboard
when resuming from s2idle.  This is caused by some missing calls to the
EC that don't occur in the AMD codepath but only in the Microsoft codepath.

Add the system to the quirk list to force Microsoft codepath.

Reported-by: Travis Glenn Hansen <travisghansen@yahoo.com>
Reported-by: Sebastian S. <iam@decentr.al>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216473
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216438
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 2cd381f6c002..c811eeed42cd 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -428,6 +428,18 @@ static const struct dmi_system_id s2idle_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "ROG Zephyrus G14 GA402"),
 		},
 	},
+	{
+		/*
+		 * Lenovo Yoga Slim 7 Pro X 14ARH7
+		 * https://bugzilla.kernel.org/show_bug.cgi?id=216473 : 82V2
+		 * https://bugzilla.kernel.org/show_bug.cgi?id=216438 : 82TL
+		 */
+		.callback = lps0_prefer_microsoft,
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "82"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3


From 631b54519e8e58758e4541756c1818dcebd2dcc9 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 21 Sep 2022 15:40:54 -0500
Subject: ACPI: x86: s2idle: Add a quirk for ASUSTeK COMPUTER INC. ROG Flow X13

ASUSTeK COMPUTER INC. ROG Flow X13 has a problem with fans upon wakeup from
s2idle. In examining the ASL, functions 3 and 4 are not called in the AMD
codepath but only in the Microsoft codepath.

Add the system to the quirk list to force Microsoft codepath.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2148
Tested-by: short-circuit <davidedp91@gmail.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index c811eeed42cd..eedd21d8a284 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -440,6 +440,17 @@ static const struct dmi_system_id s2idle_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "82"),
 		},
 	},
+	{
+		/*
+		 * ASUSTeK COMPUTER INC. ROG Flow X13 GV301RE_GV301RE
+		 * https://gitlab.freedesktop.org/drm/amd/-/issues/2148
+		 */
+		.callback = lps0_prefer_microsoft,
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow X13 GV301"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3


From e09bd5757b5227d6804b30c58d4587f7f87d1afa Mon Sep 17 00:00:00 2001
From: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Date: Wed, 13 Jul 2022 13:15:11 +0200
Subject: PM / devfreq: mtk-cci: Handle sram regulator probe deferral

If the regulator_get_optional()	call for the SRAM regulator returns
a probe deferral, we must bail out and retry probing later: failing
to do this will	produce	unstabilities on platforms requiring the
handling for this regulator.

Fixes: b615b00c42da ("PM / devfreq: mediatek: Introduce MediaTek CCI devfreq driver")
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/mtk-cci-devfreq.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/devfreq/mtk-cci-devfreq.c b/drivers/devfreq/mtk-cci-devfreq.c
index 71abb3fbd042..e5458ada5197 100644
--- a/drivers/devfreq/mtk-cci-devfreq.c
+++ b/drivers/devfreq/mtk-cci-devfreq.c
@@ -291,9 +291,13 @@ static int mtk_ccifreq_probe(struct platform_device *pdev)
 	}
 
 	drv->sram_reg = devm_regulator_get_optional(dev, "sram");
-	if (IS_ERR(drv->sram_reg))
+	if (IS_ERR(drv->sram_reg)) {
+		ret = PTR_ERR(drv->sram_reg);
+		if (ret == -EPROBE_DEFER)
+			goto out_free_resources;
+
 		drv->sram_reg = NULL;
-	else {
+	} else {
 		ret = regulator_enable(drv->sram_reg);
 		if (ret) {
 			dev_err(dev, "failed to enable sram regulator\n");
-- 
cgit v1.2.3


From fb2ac84f8acccdec644d26dfc8ba6554f30cd6c0 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 10 Sep 2022 16:15:56 +0200
Subject: PM / devfreq: rockchip-dfi: Fix an error message

There is a typo in the message. The clock name should be 'pclk_ddr_mon'.
Fix it.

While at it, switch to dev_err_probe() which is less verbose, filters
-EPROBE_DEFER, and log the error code in a human readable way.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/event/rockchip-dfi.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c
index 9a88faaf8b27..39ac069cabc7 100644
--- a/drivers/devfreq/event/rockchip-dfi.c
+++ b/drivers/devfreq/event/rockchip-dfi.c
@@ -189,10 +189,9 @@ static int rockchip_dfi_probe(struct platform_device *pdev)
 		return PTR_ERR(data->regs);
 
 	data->clk = devm_clk_get(dev, "pclk_ddr_mon");
-	if (IS_ERR(data->clk)) {
-		dev_err(dev, "Cannot get the clk dmc_clk\n");
-		return PTR_ERR(data->clk);
-	}
+	if (IS_ERR(data->clk))
+		return dev_err_probe(dev, PTR_ERR(data->clk),
+				     "Cannot get the clk pclk_ddr_mon\n");
 
 	/* try to find the optional reference to the pmu syscon */
 	node = of_parse_phandle(np, "rockchip,pmu", 0);
-- 
cgit v1.2.3


From f991b117120569838676b9512f6e7d755fdfc12b Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Fri, 23 Sep 2022 22:50:16 +0800
Subject: cpufreq: tegra194: change tegra239_cpufreq_soc to static

tegra239_cpufreq_soc is only used in tegra194-cpufreq.c now, change it
to static.

Fixes: 676886010707 ("cpufreq: tegra194: Add support for Tegra239")
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/tegra194-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c
index cfc1b225f811..c2004cae3f02 100644
--- a/drivers/cpufreq/tegra194-cpufreq.c
+++ b/drivers/cpufreq/tegra194-cpufreq.c
@@ -161,7 +161,7 @@ static const struct tegra_cpufreq_soc tegra234_cpufreq_soc = {
 	.num_clusters = 3,
 };
 
-const struct tegra_cpufreq_soc tegra239_cpufreq_soc = {
+static const struct tegra_cpufreq_soc tegra239_cpufreq_soc = {
 	.ops = &tegra234_cpufreq_ops,
 	.actmon_cntr_base = 0x4000,
 	.maxcpus_per_cluster = 8,
-- 
cgit v1.2.3


From f7968c22bd427f98bacccc44f11523ba6b498e7d Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Sat, 24 Sep 2022 17:21:59 +0800
Subject: cpufreq: Add __init annotation to module init funcs

Add missing __init annotation to module init funcs.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/highbank-cpufreq.c | 2 +-
 drivers/cpufreq/sti-cpufreq.c      | 2 +-
 drivers/cpufreq/ti-cpufreq.c       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/cpufreq/highbank-cpufreq.c b/drivers/cpufreq/highbank-cpufreq.c
index ac57cddc5f2f..a45864701143 100644
--- a/drivers/cpufreq/highbank-cpufreq.c
+++ b/drivers/cpufreq/highbank-cpufreq.c
@@ -55,7 +55,7 @@ static struct notifier_block hb_cpufreq_clk_nb = {
 	.notifier_call = hb_cpufreq_clk_notify,
 };
 
-static int hb_cpufreq_driver_init(void)
+static int __init hb_cpufreq_driver_init(void)
 {
 	struct platform_device_info devinfo = { .name = "cpufreq-dt", };
 	struct device *cpu_dev;
diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c
index a67df90848c2..1a63aeea8711 100644
--- a/drivers/cpufreq/sti-cpufreq.c
+++ b/drivers/cpufreq/sti-cpufreq.c
@@ -252,7 +252,7 @@ static int sti_cpufreq_fetch_syscon_registers(void)
 	return 0;
 }
 
-static int sti_cpufreq_init(void)
+static int __init sti_cpufreq_init(void)
 {
 	int ret;
 
diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index df85a77d476b..f64180dd2005 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -398,7 +398,7 @@ fail_put_node:
 	return ret;
 }
 
-static int ti_cpufreq_init(void)
+static int __init ti_cpufreq_init(void)
 {
 	const struct of_device_id *match;
 
-- 
cgit v1.2.3


From 4335417da2b8d6d9b2d4411b5f9e248e5bb2d380 Mon Sep 17 00:00:00 2001
From: Pali Rohár <pali@kernel.org>
Date: Thu, 14 Jul 2022 20:33:25 +0200
Subject: gpio: mvebu: Fix check for pwm support on non-A8K platforms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pwm support incompatible with Armada 80x0/70x0 API is not only in
Armada 370, but also in Armada XP, 38x and 39x. So basically every non-A8K
platform. Fix check for pwm support appropriately.

Fixes: 85b7d8abfec7 ("gpio: mvebu: add pwm support for Armada 8K/7K")
Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Bartosz Golaszewski <brgl@bgdev.pl>
---
 drivers/gpio/gpio-mvebu.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/gpio/gpio-mvebu.c b/drivers/gpio/gpio-mvebu.c
index aa126ab80f0c..1bb317b8dcce 100644
--- a/drivers/gpio/gpio-mvebu.c
+++ b/drivers/gpio/gpio-mvebu.c
@@ -790,8 +790,12 @@ static int mvebu_pwm_probe(struct platform_device *pdev,
 	u32 offset;
 	u32 set;
 
-	if (of_device_is_compatible(mvchip->chip.of_node,
-				    "marvell,armada-370-gpio")) {
+	if (mvchip->soc_variant == MVEBU_GPIO_SOC_VARIANT_A8K) {
+		int ret = of_property_read_u32(dev->of_node,
+					       "marvell,pwm-offset", &offset);
+		if (ret < 0)
+			return 0;
+	} else {
 		/*
 		 * There are only two sets of PWM configuration registers for
 		 * all the GPIO lines on those SoCs which this driver reserves
@@ -801,13 +805,6 @@ static int mvebu_pwm_probe(struct platform_device *pdev,
 		if (!platform_get_resource_byname(pdev, IORESOURCE_MEM, "pwm"))
 			return 0;
 		offset = 0;
-	} else if (mvchip->soc_variant == MVEBU_GPIO_SOC_VARIANT_A8K) {
-		int ret = of_property_read_u32(dev->of_node,
-					       "marvell,pwm-offset", &offset);
-		if (ret < 0)
-			return 0;
-	} else {
-		return 0;
 	}
 
 	if (IS_ERR(mvchip->clk))
-- 
cgit v1.2.3


From c4c0efb06f17fa4a37ad99e7752b18a5405c76dc Mon Sep 17 00:00:00 2001
From: Xuewen Yan <xuewen.yan@unisoc.com>
Date: Mon, 26 Sep 2022 16:34:05 +0800
Subject: cpufreq: qcom-cpufreq-hw: Add cpufreq qos for LMh

Before update thermal pressure, the max cpufreq should be limited.
Add QOS control for Lmh throttle cpufreq.

Signed-off-by: Xuewen Yan <xuewen.yan@unisoc.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index bb32659820ce..833589bc95e4 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -13,6 +13,7 @@
 #include <linux/of_address.h>
 #include <linux/of_platform.h>
 #include <linux/pm_opp.h>
+#include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/units.h>
@@ -56,6 +57,8 @@ struct qcom_cpufreq_data {
 	struct cpufreq_policy *policy;
 
 	bool per_core_dcvs;
+
+	struct freq_qos_request throttle_freq_req;
 };
 
 static unsigned long cpu_hw_rate, xo_rate;
@@ -321,6 +324,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
 
 	throttled_freq = freq_hz / HZ_PER_KHZ;
 
+	freq_qos_update_request(&data->throttle_freq_req, throttled_freq);
+
 	/* Update thermal pressure (the boost frequencies are accepted) */
 	arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
 
@@ -413,6 +418,14 @@ static int qcom_cpufreq_hw_lmh_init(struct cpufreq_policy *policy, int index)
 	if (data->throttle_irq < 0)
 		return data->throttle_irq;
 
+	ret = freq_qos_add_request(&policy->constraints,
+				   &data->throttle_freq_req, FREQ_QOS_MAX,
+				   FREQ_QOS_MAX_DEFAULT_VALUE);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to add freq constraint (%d)\n", ret);
+		return ret;
+	}
+
 	data->cancel_throttle = false;
 	data->policy = policy;
 
@@ -479,6 +492,7 @@ static void qcom_cpufreq_hw_lmh_exit(struct qcom_cpufreq_data *data)
 	if (data->throttle_irq <= 0)
 		return;
 
+	freq_qos_remove_request(&data->throttle_freq_req);
 	free_irq(data->throttle_irq, data);
 }
 
-- 
cgit v1.2.3


From e42c9c54f2e731885ea2bd92c478c85d879637a6 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 22 Sep 2022 23:32:05 -0700
Subject: perf tools: Get a perf cgroup more portably in BPF

The perf_event_cgrp_id can be different on other configurations.

To be more portable as CO-RE, it needs to get the cgroup subsys id using
the bpf_core_enum_value() helper.

Suggested-by: Ian Rogers <irogers@google.com>
Reviewed-by: Ian Rogers <irogers@google.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <songliubraving@fb.com>
Cc: bpf@vger.kernel.org
Link: https://lore.kernel.org/r/20220923063205.772936-1-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/bpf_skel/bperf_cgroup.bpf.c | 11 ++++++++++-
 tools/perf/util/bpf_skel/off_cpu.bpf.c      | 18 ++++++++++++++----
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
index c72f8ad96f75..9aa8cdd93de4 100644
--- a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -48,6 +48,7 @@ const volatile __u32 num_cpus = 1;
 
 int enabled = 0;
 int use_cgroup_v2 = 0;
+int perf_subsys_id = -1;
 
 static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
 {
@@ -58,7 +59,15 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
 	int level;
 	int cnt;
 
-	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_event_cgrp_id], cgroup);
+	if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+						     perf_event_cgrp_id);
+#else
+		perf_subsys_id = perf_event_cgrp_id;
+#endif
+	}
+	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
 	level = BPF_CORE_READ(cgrp, level);
 
 	for (cnt = 0; i < MAX_LEVELS; i++) {
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index c4ba2bcf179f..38e3b287dbb2 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -94,6 +94,8 @@ const volatile bool has_prev_state = false;
 const volatile bool needs_cgroup = false;
 const volatile bool uses_cgroup_v1 = false;
 
+int perf_subsys_id = -1;
+
 /*
  * Old kernel used to call it task_struct->state and now it's '__state'.
  * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
@@ -119,11 +121,19 @@ static inline __u64 get_cgroup_id(struct task_struct *t)
 {
 	struct cgroup *cgrp;
 
-	if (uses_cgroup_v1)
-		cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_event_cgrp_id], cgroup);
-	else
-		cgrp = BPF_CORE_READ(t, cgroups, dfl_cgrp);
+	if (!uses_cgroup_v1)
+		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);
+
+	if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+						     perf_event_cgrp_id);
+#else
+		perf_subsys_id = perf_event_cgrp_id;
+#endif
+	}
 
+	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
 	return BPF_CORE_READ(cgrp, kn, id);
 }
 
-- 
cgit v1.2.3


From e28c07871c3f2107e316c2590d4703496bd114f4 Mon Sep 17 00:00:00 2001
From: Zhengjun Xing <zhengjun.xing@linux.intel.com>
Date: Fri, 23 Sep 2022 11:00:12 +0800
Subject: perf print-events: Fix "perf list" can not display the PMU prefix for
 some hybrid cache events

Some hybrid hardware cache events are only available on one CPU PMU. For
example, 'L1-dcache-load-misses' is only available on cpu_core.

We have supported in the perf list clearly reporting this info, the
function works fine before but recently the argument "config" in API
is_event_supported() is changed from "u64" to "unsigned int" which
caused a regression, the "perf list" then can not display the PMU prefix
for some hybrid cache events.

For the hybrid systems, the PMU type ID is stored at config[63:32],
define config to "unsigned int" will miss the PMU type ID information,
then the regression happened, the config should be defined as "u64".

Before:
 # ./perf list |grep "Hardware cache event"
  L1-dcache-load-misses                              [Hardware cache event]
  L1-dcache-loads                                    [Hardware cache event]
  L1-dcache-stores                                   [Hardware cache event]
  L1-icache-load-misses                              [Hardware cache event]
  L1-icache-loads                                    [Hardware cache event]
  LLC-load-misses                                    [Hardware cache event]
  LLC-loads                                          [Hardware cache event]
  LLC-store-misses                                   [Hardware cache event]
  LLC-stores                                         [Hardware cache event]
  branch-load-misses                                 [Hardware cache event]
  branch-loads                                       [Hardware cache event]
  dTLB-load-misses                                   [Hardware cache event]
  dTLB-loads                                         [Hardware cache event]
  dTLB-store-misses                                  [Hardware cache event]
  dTLB-stores                                        [Hardware cache event]
  iTLB-load-misses                                   [Hardware cache event]
  node-load-misses                                   [Hardware cache event]
  node-loads                                         [Hardware cache event]

After:
 # ./perf list |grep "Hardware cache event"
  L1-dcache-loads                                    [Hardware cache event]
  L1-dcache-stores                                   [Hardware cache event]
  L1-icache-load-misses                              [Hardware cache event]
  LLC-load-misses                                    [Hardware cache event]
  LLC-loads                                          [Hardware cache event]
  LLC-store-misses                                   [Hardware cache event]
  LLC-stores                                         [Hardware cache event]
  branch-load-misses                                 [Hardware cache event]
  branch-loads                                       [Hardware cache event]
  cpu_atom/L1-icache-loads/                          [Hardware cache event]
  cpu_core/L1-dcache-load-misses/                    [Hardware cache event]
  cpu_core/node-load-misses/                         [Hardware cache event]
  cpu_core/node-loads/                               [Hardware cache event]
  dTLB-load-misses                                   [Hardware cache event]
  dTLB-loads                                         [Hardware cache event]
  dTLB-store-misses                                  [Hardware cache event]
  dTLB-stores                                        [Hardware cache event]
  iTLB-load-misses                                   [Hardware cache event]

Fixes: 9b7c7728f4e4ba8d ("perf parse-events: Break out tracepoint and printing")
Reported-by: Yi Ammy <ammy.yi@intel.com>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Xing Zhengjun <zhengjun.xing@linux.intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220923030013.3726410-1-zhengjun.xing@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/print-events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index ba1ab5134685..04050d4f6db8 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -239,7 +239,7 @@ void print_sdt_events(const char *subsys_glob, const char *event_glob,
 	strlist__delete(sdtlist);
 }
 
-static bool is_event_supported(u8 type, unsigned int config)
+static bool is_event_supported(u8 type, u64 config)
 {
 	bool ret = true;
 	int open_return;
-- 
cgit v1.2.3


From 71c86cda750b001100e0d6dc04a88449b7381a59 Mon Sep 17 00:00:00 2001
From: Zhengjun Xing <zhengjun.xing@linux.intel.com>
Date: Fri, 23 Sep 2022 11:00:13 +0800
Subject: perf parse-events: Remove "not supported" hybrid cache events

By default, we create two hybrid cache events, one is for cpu_core, and
another is for cpu_atom. But Some hybrid hardware cache events are only
available on one CPU PMU. For example, the 'L1-dcache-load-misses' is only
available on cpu_core, while the 'L1-icache-loads' is only available on
cpu_atom. We need to remove "not supported" hybrid cache events. By
extending is_event_supported() to global API and using it to check if the
hybrid cache events are supported before being created, we can remove the
"not supported" hybrid cache events.

Before:

 # ./perf stat -e L1-dcache-load-misses,L1-icache-loads -a sleep 1

 Performance counter stats for 'system wide':

            52,570      cpu_core/L1-dcache-load-misses/
   <not supported>      cpu_atom/L1-dcache-load-misses/
   <not supported>      cpu_core/L1-icache-loads/
         1,471,817      cpu_atom/L1-icache-loads/

       1.004915229 seconds time elapsed

After:

 # ./perf stat -e L1-dcache-load-misses,L1-icache-loads -a sleep 1

 Performance counter stats for 'system wide':

            54,510      cpu_core/L1-dcache-load-misses/
         1,441,286      cpu_atom/L1-icache-loads/

       1.005114281 seconds time elapsed

Fixes: 30def61f64bac5f5 ("perf parse-events: Create two hybrid cache events")
Reported-by: Yi Ammy <ammy.yi@intel.com>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Xing Zhengjun <zhengjun.xing@linux.intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jin Yao <yao.jin@linux.intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220923030013.3726410-2-zhengjun.xing@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/parse-events-hybrid.c | 21 +++++++++++++++----
 tools/perf/util/parse-events.c        | 39 +++++++++++++++++++++++++++++++++++
 tools/perf/util/parse-events.h        |  1 +
 tools/perf/util/print-events.c        | 39 -----------------------------------
 4 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/tools/perf/util/parse-events-hybrid.c b/tools/perf/util/parse-events-hybrid.c
index 284f8eabd3b9..7c9f9150bad5 100644
--- a/tools/perf/util/parse-events-hybrid.c
+++ b/tools/perf/util/parse-events-hybrid.c
@@ -33,7 +33,8 @@ static void config_hybrid_attr(struct perf_event_attr *attr,
 	 * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied.
 	 */
 	attr->type = type;
-	attr->config = attr->config | ((__u64)pmu_type << PERF_PMU_TYPE_SHIFT);
+	attr->config = (attr->config & PERF_HW_EVENT_MASK) |
+			((__u64)pmu_type << PERF_PMU_TYPE_SHIFT);
 }
 
 static int create_event_hybrid(__u32 config_type, int *idx,
@@ -48,13 +49,25 @@ static int create_event_hybrid(__u32 config_type, int *idx,
 	__u64 config = attr->config;
 
 	config_hybrid_attr(attr, config_type, pmu->type);
+
+	/*
+	 * Some hybrid hardware cache events are only available on one CPU
+	 * PMU. For example, the 'L1-dcache-load-misses' is only available
+	 * on cpu_core, while the 'L1-icache-loads' is only available on
+	 * cpu_atom. We need to remove "not supported" hybrid cache events.
+	 */
+	if (attr->type == PERF_TYPE_HW_CACHE
+	    && !is_event_supported(attr->type, attr->config))
+		return 0;
+
 	evsel = parse_events__add_event_hybrid(list, idx, attr, name, metric_id,
 					       pmu, config_terms);
-	if (evsel)
+	if (evsel) {
 		evsel->pmu_name = strdup(pmu->name);
-	else
+		if (!evsel->pmu_name)
+			return -ENOMEM;
+	} else
 		return -ENOMEM;
-
 	attr->type = type;
 	attr->config = config;
 	return 0;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f05e15acd33f..f3b2c2a87456 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -28,6 +28,7 @@
 #include "util/parse-events-hybrid.h"
 #include "util/pmu-hybrid.h"
 #include "tracepoint.h"
+#include "thread_map.h"
 
 #define MAX_NAME_LEN 100
 
@@ -157,6 +158,44 @@ struct event_symbol event_symbols_sw[PERF_COUNT_SW_MAX] = {
 #define PERF_EVENT_TYPE(config)		__PERF_EVENT_FIELD(config, TYPE)
 #define PERF_EVENT_ID(config)		__PERF_EVENT_FIELD(config, EVENT)
 
+bool is_event_supported(u8 type, u64 config)
+{
+	bool ret = true;
+	int open_return;
+	struct evsel *evsel;
+	struct perf_event_attr attr = {
+		.type = type,
+		.config = config,
+		.disabled = 1,
+	};
+	struct perf_thread_map *tmap = thread_map__new_by_tid(0);
+
+	if (tmap == NULL)
+		return false;
+
+	evsel = evsel__new(&attr);
+	if (evsel) {
+		open_return = evsel__open(evsel, NULL, tmap);
+		ret = open_return >= 0;
+
+		if (open_return == -EACCES) {
+			/*
+			 * This happens if the paranoid value
+			 * /proc/sys/kernel/perf_event_paranoid is set to 2
+			 * Re-run with exclude_kernel set; we don't do that
+			 * by default as some ARM machines do not support it.
+			 *
+			 */
+			evsel->core.attr.exclude_kernel = 1;
+			ret = evsel__open(evsel, NULL, tmap) >= 0;
+		}
+		evsel__delete(evsel);
+	}
+
+	perf_thread_map__put(tmap);
+	return ret;
+}
+
 const char *event_type(int type)
 {
 	switch (type) {
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index 7e6a601d9cd0..07df7bb7b042 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -19,6 +19,7 @@ struct option;
 struct perf_pmu;
 
 bool have_tracepoints(struct list_head *evlist);
+bool is_event_supported(u8 type, u64 config);
 
 const char *event_type(int type);
 
diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c
index 04050d4f6db8..c4d5d87fae2f 100644
--- a/tools/perf/util/print-events.c
+++ b/tools/perf/util/print-events.c
@@ -22,7 +22,6 @@
 #include "probe-file.h"
 #include "string2.h"
 #include "strlist.h"
-#include "thread_map.h"
 #include "tracepoint.h"
 #include "pfm.h"
 #include "pmu-hybrid.h"
@@ -239,44 +238,6 @@ void print_sdt_events(const char *subsys_glob, const char *event_glob,
 	strlist__delete(sdtlist);
 }
 
-static bool is_event_supported(u8 type, u64 config)
-{
-	bool ret = true;
-	int open_return;
-	struct evsel *evsel;
-	struct perf_event_attr attr = {
-		.type = type,
-		.config = config,
-		.disabled = 1,
-	};
-	struct perf_thread_map *tmap = thread_map__new_by_tid(0);
-
-	if (tmap == NULL)
-		return false;
-
-	evsel = evsel__new(&attr);
-	if (evsel) {
-		open_return = evsel__open(evsel, NULL, tmap);
-		ret = open_return >= 0;
-
-		if (open_return == -EACCES) {
-			/*
-			 * This happens if the paranoid value
-			 * /proc/sys/kernel/perf_event_paranoid is set to 2
-			 * Re-run with exclude_kernel set; we don't do that
-			 * by default as some ARM machines do not support it.
-			 *
-			 */
-			evsel->core.attr.exclude_kernel = 1;
-			ret = evsel__open(evsel, NULL, tmap) >= 0;
-		}
-		evsel__delete(evsel);
-	}
-
-	perf_thread_map__put(tmap);
-	return ret;
-}
-
 int print_hwcache_events(const char *event_glob, bool name_only)
 {
 	unsigned int type, op, i, evt_i = 0, evt_num = 0, npmus = 0;
-- 
cgit v1.2.3


From f4a2aade6809c6573f420c6fc1031797dfe8a4d2 Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Wed, 21 Sep 2022 20:22:54 +0530
Subject: perf tests powerpc: Fix branch stack sampling test to include sanity
 check for branch filter

Commit b55878c90ab92a24 ("perf test: Add test for branch stack
sampling") added test for branch stack sampling. There is a sanity check
in the beginning to skip the test if the hardware doesn't support branch
stack sampling.

Snippet
<<>>
skip the test if the hardware doesn't support branch stack sampling
perf record -b -o- -B true > /dev/null 2>&1 || exit 2
<<>>

But the testcase also uses branch sample types: save_type, any. if any
platform doesn't support the branch filters used in the test, the testcase
will fail. In powerpc, currently mutliple branch filters are not supported
and hence this test fails in powerpc. Fix the sanity check to look at
the support for branch filters used in this test before proceeding with
the test.

Fixes: b55878c90ab92a24 ("perf test: Add test for branch stack sampling")
Reported-by: Disha Goel <disgoel@linux.vnet.ibm.com>
Reviewed-by: Kajol Jain <kjain@linux.ibm.com>
Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: German Gomez <german.gomez@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nageswara R Sastry <rnsastry@linux.ibm.com>
Link: https://lore.kernel.org/r/20220921145255.20972-2-atrajeev@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/test_brstack.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh
index c644f94a6500..ec801cffae6b 100755
--- a/tools/perf/tests/shell/test_brstack.sh
+++ b/tools/perf/tests/shell/test_brstack.sh
@@ -12,7 +12,8 @@ if ! [ -x "$(command -v cc)" ]; then
 fi
 
 # skip the test if the hardware doesn't support branch stack sampling
-perf record -b -o- -B true > /dev/null 2>&1 || exit 2
+# and if the architecture doesn't support filter types: any,save_type,u
+perf record -b -o- -B --branch-filter any,save_type,u true > /dev/null 2>&1 || exit 2
 
 TMPDIR=$(mktemp -d /tmp/__perf_test.program.XXXXX)
 
-- 
cgit v1.2.3


From 6ef7d362123ecb5bf6d163bb9c7fd6ba2d8c968c Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Wed, 21 Sep 2022 15:52:58 +0200
Subject: drm/i915/gt: Restrict forced preemption to the active context

When we submit a new pair of contexts to ELSP for execution, we start a
timer by which point we expect the HW to have switched execution to the
pending contexts. If the promotion to the new pair of contexts has not
occurred, we declare the executing context to have hung and force the
preemption to take place by resetting the engine and resubmitting the
new contexts.

This can lead to an unfair situation where almost all of the preemption
timeout is consumed by the first context which just switches into the
second context immediately prior to the timer firing and triggering the
preemption reset (assuming that the timer interrupts before we process
the CS events for the context switch). The second context hasn't yet had
a chance to yield to the incoming ELSP (and send the ACk for the
promotion) and so ends up being blamed for the reset.

If we see that a context switch has occurred since setting the
preemption timeout, but have not yet received the ACK for the ELSP
promotion, rearm the preemption timer and check again. This is
especially significant if the first context was not schedulable and so
we used the shortest timer possible, greatly increasing the chance of
accidentally blaming the second innocent context.

Fixes: 3a7a92aba8fb ("drm/i915/execlists: Force preemption")
Fixes: d12acee84ffb ("drm/i915/execlists: Cancel banned contexts on schedule-out")
Reported-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Andi Shyti <andi.shyti@linux.intel.com>
Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>
Tested-by: Andrzej Hajda <andrzej.hajda@intel.com>
Cc: <stable@vger.kernel.org> # v5.5+
Signed-off-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220921135258.1714873-1-andrzej.hajda@intel.com
(cherry picked from commit 107ba1a2c705f4358f2602ec2f2fd821bb651f42)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h        | 15 +++++++++++++++
 .../gpu/drm/i915/gt/intel_execlists_submission.c    | 21 ++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 633a7e5dba3b..6b5d4ea22b67 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -165,6 +165,21 @@ struct intel_engine_execlists {
 	 */
 	struct timer_list preempt;
 
+	/**
+	 * @preempt_target: active request at the time of the preemption request
+	 *
+	 * We force a preemption to occur if the pending contexts have not
+	 * been promoted to active upon receipt of the CS ack event within
+	 * the timeout. This timeout maybe chosen based on the target,
+	 * using a very short timeout if the context is no longer schedulable.
+	 * That short timeout may not be applicable to other contexts, so
+	 * if a context switch should happen within before the preemption
+	 * timeout, we may shoot early at an innocent context. To prevent this,
+	 * we record which context was active at the time of the preemption
+	 * request and only reset that context upon the timeout.
+	 */
+	const struct i915_request *preempt_target;
+
 	/**
 	 * @ccid: identifier for contexts submitted to this engine
 	 */
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 4b909cb88cdf..c718e6dc40b5 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1241,6 +1241,9 @@ static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
 	if (!rq)
 		return 0;
 
+	/* Only allow ourselves to force reset the currently active context */
+	engine->execlists.preempt_target = rq;
+
 	/* Force a fast reset for terminated contexts (ignoring sysfs!) */
 	if (unlikely(intel_context_is_banned(rq->context) || bad_request(rq)))
 		return INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS;
@@ -2427,8 +2430,24 @@ static void execlists_submission_tasklet(struct tasklet_struct *t)
 	GEM_BUG_ON(inactive - post > ARRAY_SIZE(post));
 
 	if (unlikely(preempt_timeout(engine))) {
+		const struct i915_request *rq = *engine->execlists.active;
+
+		/*
+		 * If after the preempt-timeout expired, we are still on the
+		 * same active request/context as before we initiated the
+		 * preemption, reset the engine.
+		 *
+		 * However, if we have processed a CS event to switch contexts,
+		 * but not yet processed the CS event for the pending
+		 * preemption, reset the timer allowing the new context to
+		 * gracefully exit.
+		 */
 		cancel_timer(&engine->execlists.preempt);
-		engine->execlists.error_interrupt |= ERROR_PREEMPT;
+		if (rq == engine->execlists.preempt_target)
+			engine->execlists.error_interrupt |= ERROR_PREEMPT;
+		else
+			set_timer_ms(&engine->execlists.preempt,
+				     active_preempt_timeout(engine, rq));
 	}
 
 	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
-- 
cgit v1.2.3


From 3b7610302a75fc1032a6c9462862bec6948f85c9 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Thu, 1 Sep 2022 12:05:32 -0400
Subject: fs: dlm: fix possible use after free if tracing

This patch fixes a possible use after free if tracing for the specific
event is enabled. To avoid the use after free we introduce a out_put
label like all other user lock specific requests and safe in a boolean
to do a put or not which depends on the execution path of
dlm_user_request().

Cc: stable@vger.kernel.org
Fixes: 7a3de7324c2b ("fs: dlm: trace user space callbacks")
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c830feb26384..94a72ede5764 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5835,6 +5835,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 {
 	struct dlm_lkb *lkb;
 	struct dlm_args args;
+	bool do_put = true;
 	int error;
 
 	dlm_lock_recovery(ls);
@@ -5851,9 +5852,8 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
 		if (!ua->lksb.sb_lvbptr) {
 			kfree(ua);
-			__put_lkb(ls, lkb);
 			error = -ENOMEM;
-			goto out_trace_end;
+			goto out_put;
 		}
 	}
 #ifdef CONFIG_DLM_DEPRECATED_API
@@ -5867,8 +5867,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		kfree(ua->lksb.sb_lvbptr);
 		ua->lksb.sb_lvbptr = NULL;
 		kfree(ua);
-		__put_lkb(ls, lkb);
-		goto out_trace_end;
+		goto out_put;
 	}
 
 	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
@@ -5887,8 +5886,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 		error = 0;
 		fallthrough;
 	default:
-		__put_lkb(ls, lkb);
-		goto out_trace_end;
+		goto out_put;
 	}
 
 	/* add this new lkb to the per-process list of locks */
@@ -5896,8 +5894,11 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
 	hold_lkb(lkb);
 	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
 	spin_unlock(&ua->proc->locks_spin);
- out_trace_end:
+	do_put = false;
+ out_put:
 	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error, false);
+	if (do_put)
+		__put_lkb(ls, lkb);
  out:
 	dlm_unlock_recovery(ls);
 	return error;
-- 
cgit v1.2.3


From 30ea703a38ef76ca119673cd8bdd05c6e068e2ac Mon Sep 17 00:00:00 2001
From: Luciano Leão <lucianorsleao@gmail.com>
Date: Thu, 22 Sep 2022 17:00:54 -0300
Subject: x86/cpu: Include the header of init_ia32_feat_ctl()'s prototype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include the header containing the prototype of init_ia32_feat_ctl(),
solving the following warning:

  $ make W=1 arch/x86/kernel/cpu/feat_ctl.o
  arch/x86/kernel/cpu/feat_ctl.c:112:6: warning: no previous prototype for ‘init_ia32_feat_ctl’ [-Wmissing-prototypes]
    112 | void init_ia32_feat_ctl(struct cpuinfo_x86 *c)

This warning appeared after commit

  5d5103595e9e5 ("x86/cpu: Reinitialize IA32_FEAT_CTL MSR on BSP during wakeup")

had moved the function init_ia32_feat_ctl()'s prototype from
arch/x86/kernel/cpu/cpu.h to arch/x86/include/asm/cpu.h.

Note that, before the commit mentioned above, the header include "cpu.h"
(arch/x86/kernel/cpu/cpu.h) was added by commit

  0e79ad863df43 ("x86/cpu: Fix a -Wmissing-prototypes warning for init_ia32_feat_ctl()")

solely to fix init_ia32_feat_ctl()'s missing prototype. So, the header
include "cpu.h" is no longer necessary.

  [ bp: Massage commit message. ]

Fixes: 5d5103595e9e5 ("x86/cpu: Reinitialize IA32_FEAT_CTL MSR on BSP during wakeup")
Signed-off-by: Luciano Leão <lucianorsleao@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Nícolas F. R. A. Prado <n@nfraprado.net>
Link: https://lore.kernel.org/r/20220922200053.1357470-1-lucianorsleao@gmail.com
---
 arch/x86/kernel/cpu/feat_ctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 993697e71854..03851240c3e3 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -1,11 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/tboot.h>
 
+#include <asm/cpu.h>
 #include <asm/cpufeature.h>
 #include <asm/msr-index.h>
 #include <asm/processor.h>
 #include <asm/vmx.h>
-#include "cpu.h"
 
 #undef pr_fmt
 #define pr_fmt(fmt)	"x86/cpu: " fmt
-- 
cgit v1.2.3


From 3f8ef65af927db247418d4e1db49164d7a158fc5 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Tue, 23 Aug 2022 21:37:54 +0800
Subject: net: If sock is dead don't access sock's sk_wq in
 sk_stream_wait_memory

Fixes the below NULL pointer dereference:

  [...]
  [   14.471200] Call Trace:
  [   14.471562]  <TASK>
  [   14.471882]  lock_acquire+0x245/0x2e0
  [   14.472416]  ? remove_wait_queue+0x12/0x50
  [   14.473014]  ? _raw_spin_lock_irqsave+0x17/0x50
  [   14.473681]  _raw_spin_lock_irqsave+0x3d/0x50
  [   14.474318]  ? remove_wait_queue+0x12/0x50
  [   14.474907]  remove_wait_queue+0x12/0x50
  [   14.475480]  sk_stream_wait_memory+0x20d/0x340
  [   14.476127]  ? do_wait_intr_irq+0x80/0x80
  [   14.476704]  do_tcp_sendpages+0x287/0x600
  [   14.477283]  tcp_bpf_push+0xab/0x260
  [   14.477817]  tcp_bpf_sendmsg_redir+0x297/0x500
  [   14.478461]  ? __local_bh_enable_ip+0x77/0xe0
  [   14.479096]  tcp_bpf_send_verdict+0x105/0x470
  [   14.479729]  tcp_bpf_sendmsg+0x318/0x4f0
  [   14.480311]  sock_sendmsg+0x2d/0x40
  [   14.480822]  ____sys_sendmsg+0x1b4/0x1c0
  [   14.481390]  ? copy_msghdr_from_user+0x62/0x80
  [   14.482048]  ___sys_sendmsg+0x78/0xb0
  [   14.482580]  ? vmf_insert_pfn_prot+0x91/0x150
  [   14.483215]  ? __do_fault+0x2a/0x1a0
  [   14.483738]  ? do_fault+0x15e/0x5d0
  [   14.484246]  ? __handle_mm_fault+0x56b/0x1040
  [   14.484874]  ? lock_is_held_type+0xdf/0x130
  [   14.485474]  ? find_held_lock+0x2d/0x90
  [   14.486046]  ? __sys_sendmsg+0x41/0x70
  [   14.486587]  __sys_sendmsg+0x41/0x70
  [   14.487105]  ? intel_pmu_drain_pebs_core+0x350/0x350
  [   14.487822]  do_syscall_64+0x34/0x80
  [   14.488345]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
  [...]

The test scenario has the following flow:

thread1                               thread2
-----------                           ---------------
 tcp_bpf_sendmsg
  tcp_bpf_send_verdict
   tcp_bpf_sendmsg_redir              sock_close
    tcp_bpf_push_locked                 __sock_release
     tcp_bpf_push                         //inet_release
      do_tcp_sendpages                    sock->ops->release
       sk_stream_wait_memory          	   // tcp_close
          sk_wait_event                      sk->sk_prot->close
           release_sock(__sk);
            ***
                                                lock_sock(sk);
                                                  __tcp_close
                                                    sock_orphan(sk)
                                                      sk->sk_wq  = NULL
                                                release_sock
            ****
           lock_sock(__sk);
          remove_wait_queue(sk_sleep(sk), &wait);
             sk_sleep(sk)
             //NULL pointer dereference
             &rcu_dereference_raw(sk->sk_wq)->wait

While waiting for memory in thread1, the socket is released with its wait
queue because thread2 has closed it. This caused by tcp_bpf_send_verdict
didn't increase the f_count of psock->sk_redir->sk_socket->file in thread1.

We should check if SOCK_DEAD flag is set on wakeup in sk_stream_wait_memory
before accessing the wait queue.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/bpf/20220823133755.314697-2-liujian56@huawei.com
---
 net/core/stream.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/stream.c b/net/core/stream.c
index ccc083cdef23..1105057ce00a 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
 		*timeo_p = current_timeo;
 	}
 out:
-	remove_wait_queue(sk_sleep(sk), &wait);
+	if (!sock_flag(sk, SOCK_DEAD))
+		remove_wait_queue(sk_sleep(sk), &wait);
 	return err;
 
 do_error:
-- 
cgit v1.2.3


From 043a7356dbd0f44b2a2161649d89f4a43f3b0180 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Tue, 23 Aug 2022 21:37:55 +0800
Subject: selftests/bpf: Add wait send memory test for sockmap redirect

Add one test for wait redirect sock's send memory test for sockmap.

Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220823133755.314697-3-liujian56@huawei.com
---
 tools/testing/selftests/bpf/test_sockmap.c | 42 ++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
index dcb038e342d8..e768181a1bd7 100644
--- a/tools/testing/selftests/bpf/test_sockmap.c
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -138,6 +138,7 @@ struct sockmap_options {
 	bool data_test;
 	bool drop_expected;
 	bool check_recved_len;
+	bool tx_wait_mem;
 	int iov_count;
 	int iov_length;
 	int rate;
@@ -578,6 +579,10 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 			sent = sendmsg(fd, &msg, flags);
 
 			if (!drop && sent < 0) {
+				if (opt->tx_wait_mem && errno == EACCES) {
+					errno = 0;
+					goto out_errno;
+				}
 				perror("sendmsg loop error");
 				goto out_errno;
 			} else if (drop && sent >= 0) {
@@ -644,6 +649,15 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
 				goto out_errno;
 			}
 
+			if (opt->tx_wait_mem) {
+				FD_ZERO(&w);
+				FD_SET(fd, &w);
+				slct = select(max_fd + 1, NULL, NULL, &w, &timeout);
+				errno = 0;
+				close(fd);
+				goto out_errno;
+			}
+
 			errno = 0;
 			if (peek_flag) {
 				flags |= MSG_PEEK;
@@ -752,6 +766,22 @@ static int sendmsg_test(struct sockmap_options *opt)
 			return err;
 	}
 
+	if (opt->tx_wait_mem) {
+		struct timeval timeout;
+		int rxtx_buf_len = 1024;
+
+		timeout.tv_sec = 3;
+		timeout.tv_usec = 0;
+
+		err = setsockopt(c2, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(struct timeval));
+		err |= setsockopt(c2, SOL_SOCKET, SO_SNDBUFFORCE, &rxtx_buf_len, sizeof(int));
+		err |= setsockopt(p2, SOL_SOCKET, SO_RCVBUFFORCE, &rxtx_buf_len, sizeof(int));
+		if (err) {
+			perror("setsockopt failed()");
+			return errno;
+		}
+	}
+
 	rxpid = fork();
 	if (rxpid == 0) {
 		if (txmsg_pop || txmsg_start_pop)
@@ -788,6 +818,9 @@ static int sendmsg_test(struct sockmap_options *opt)
 		return errno;
 	}
 
+	if (opt->tx_wait_mem)
+		close(c2);
+
 	txpid = fork();
 	if (txpid == 0) {
 		if (opt->sendpage)
@@ -1452,6 +1485,14 @@ static void test_txmsg_redir(int cgrp, struct sockmap_options *opt)
 	test_send(opt, cgrp);
 }
 
+static void test_txmsg_redir_wait_sndmem(int cgrp, struct sockmap_options *opt)
+{
+	txmsg_redir = 1;
+	opt->tx_wait_mem = true;
+	test_send_large(opt, cgrp);
+	opt->tx_wait_mem = false;
+}
+
 static void test_txmsg_drop(int cgrp, struct sockmap_options *opt)
 {
 	txmsg_drop = 1;
@@ -1800,6 +1841,7 @@ static int populate_progs(char *bpf_file)
 struct _test test[] = {
 	{"txmsg test passthrough", test_txmsg_pass},
 	{"txmsg test redirect", test_txmsg_redir},
+	{"txmsg test redirect wait send mem", test_txmsg_redir_wait_sndmem},
 	{"txmsg test drop", test_txmsg_drop},
 	{"txmsg test ingress redirect", test_txmsg_ingress_redir},
 	{"txmsg test skb", test_txmsg_skb},
-- 
cgit v1.2.3


From bec217197b412d74168c6a42fc0f76d0cc9cad00 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Wed, 7 Sep 2022 15:13:11 +0800
Subject: skmsg: Schedule psock work if the cached skb exists on the psock

In sk_psock_backlog function, for ingress direction skb, if no new data
packet arrives after the skb is cached, the cached skb does not have a
chance to be added to the receive queue of psock. As a result, the cached
skb cannot be received by the upper-layer application. Fix this by reschedule
the psock work to dispose the cached skb in sk_msg_recvmsg function.

Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220907071311.60534-1-liujian56@huawei.com
---
 net/core/skmsg.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 188f8558d27d..ca70525621c7 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -434,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 			if (copied + copy > len)
 				copy = len - copied;
 			copy = copy_page_to_iter(page, sge->offset, copy, iter);
-			if (!copy)
-				return copied ? copied : -EFAULT;
+			if (!copy) {
+				copied = copied ? copied : -EFAULT;
+				goto out;
+			}
 
 			copied += copy;
 			if (likely(!peek)) {
@@ -455,7 +457,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 				 * didn't copy the entire length lets just break.
 				 */
 				if (copy != sge->length)
-					return copied;
+					goto out;
 				sk_msg_iter_var_next(i);
 			}
 
@@ -477,7 +479,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
 		}
 		msg_rx = sk_psock_peek_msg(psock);
 	}
-
+out:
+	if (psock->work_state.skb && copied > 0)
+		schedule_work(&psock->work);
 	return copied;
 }
 EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
-- 
cgit v1.2.3


From b15b2e307c3a1970d92da77a3ef57ee53d119d8e Mon Sep 17 00:00:00 2001
From: Yue Hu <huyue2@coolpad.com>
Date: Fri, 23 Sep 2022 10:11:22 +0800
Subject: erofs: support on-disk compressed fragments data

Introduce on-disk compressed fragments data feature.

This approach adds a new field called `h_fragmentoff' in the per-file
compression header to indicate the fragment offset of each tail pcluster
or the whole file in the special packed inode.

Similar to ztailpacking, it will also find and record the 'headlcn'
of the tail pcluster when initializing per-inode zmap for making
follow-on requests more easy.

Signed-off-by: Yue Hu <huyue2@coolpad.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/YzHKxcFTlHGgXeH9@B-P7TQMD6M-0146.local
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/erofs_fs.h | 33 +++++++++++++++++++++++++++------
 fs/erofs/internal.h | 16 +++++++++++++---
 fs/erofs/super.c    | 15 +++++++++++++++
 fs/erofs/sysfs.c    |  2 ++
 fs/erofs/zdata.c    | 50 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/erofs/zmap.c     | 53 ++++++++++++++++++++++++++++++++++++++++++++++-------
 6 files changed, 152 insertions(+), 17 deletions(-)

diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 5c1de6d7ad71..b5d763aa8ff0 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -25,6 +25,7 @@
 #define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE	0x00000008
 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2	0x00000008
 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING	0x00000010
+#define EROFS_FEATURE_INCOMPAT_FRAGMENTS	0x00000020
 #define EROFS_ALL_FEATURE_INCOMPAT		\
 	(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
 	 EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
@@ -32,7 +33,8 @@
 	 EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
 	 EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
 	 EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
-	 EROFS_FEATURE_INCOMPAT_ZTAILPACKING)
+	 EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
+	 EROFS_FEATURE_INCOMPAT_FRAGMENTS)
 
 #define EROFS_SB_EXTSLOT_SIZE	16
 
@@ -71,7 +73,9 @@ struct erofs_super_block {
 	} __packed u1;
 	__le16 extra_devices;	/* # of devices besides the primary device */
 	__le16 devt_slotoff;	/* startoff = devt_slotoff * devt_slotsize */
-	__u8 reserved2[38];
+	__u8 reserved[6];
+	__le64 packed_nid;	/* nid of the special packed inode */
+	__u8 reserved2[24];
 };
 
 /*
@@ -296,17 +300,26 @@ struct z_erofs_lzma_cfgs {
  * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
  * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
  * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
+ * bit 5 : fragment pcluster (0 - off; 1 - on)
  */
 #define Z_EROFS_ADVISE_COMPACTED_2B		0x0001
 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1		0x0002
 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2		0x0004
 #define Z_EROFS_ADVISE_INLINE_PCLUSTER		0x0008
 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER	0x0010
+#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER	0x0020
 
+#define Z_EROFS_FRAGMENT_INODE_BIT              7
 struct z_erofs_map_header {
-	__le16	h_reserved1;
-	/* indicates the encoded size of tailpacking data */
-	__le16  h_idata_size;
+	union {
+		/* fragment data offset in the packed inode */
+		__le32  h_fragmentoff;
+		struct {
+			__le16  h_reserved1;
+			/* indicates the encoded size of tailpacking data */
+			__le16  h_idata_size;
+		};
+	};
 	__le16	h_advise;
 	/*
 	 * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
@@ -315,7 +328,8 @@ struct z_erofs_map_header {
 	__u8	h_algorithmtype;
 	/*
 	 * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
-	 * bit 3-7 : reserved.
+	 * bit 3-6 : reserved;
+	 * bit 7   : move the whole file into packed inode or not.
 	 */
 	__u8	h_clusterbits;
 };
@@ -404,6 +418,10 @@ struct erofs_dirent {
 /* check the EROFS on-disk layout strictly at compile time */
 static inline void erofs_check_ondisk_layout_definitions(void)
 {
+	const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) {
+		.h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
+	};
+
 	BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
 	BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
 	BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
@@ -421,6 +439,9 @@ static inline void erofs_check_ondisk_layout_definitions(void)
 
 	BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
 		     Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+	/* exclude old compiler versions like gcc 7.5.0 */
+	BUILD_BUG_ON(__builtin_constant_p(fmh) ?
+		     fmh != cpu_to_le64(1ULL << 63) : 0);
 }
 
 #endif
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 8dbfeb5f4f84..9f89c1da6229 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -131,6 +131,7 @@ struct erofs_sb_info {
 	struct inode *managed_cache;
 
 	struct erofs_sb_lz4_info lz4;
+	struct inode *packed_inode;
 #endif	/* CONFIG_EROFS_FS_ZIP */
 	struct erofs_dev_context *devs;
 	struct dax_device *dax_dev;
@@ -289,6 +290,7 @@ EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
 EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
 EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
 EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
+EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
 EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
 
 /* atomic flag definitions */
@@ -324,8 +326,13 @@ struct erofs_inode {
 			unsigned char  z_algorithmtype[2];
 			unsigned char  z_logical_clusterbits;
 			unsigned long  z_tailextent_headlcn;
-			erofs_off_t    z_idataoff;
-			unsigned short z_idata_size;
+			union {
+				struct {
+					erofs_off_t    z_idataoff;
+					unsigned short z_idata_size;
+				};
+				erofs_off_t z_fragmentoff;
+			};
 		};
 #endif	/* CONFIG_EROFS_FS_ZIP */
 	};
@@ -384,6 +391,7 @@ extern const struct address_space_operations z_erofs_aops;
 enum {
 	BH_Encoded = BH_PrivateStart,
 	BH_FullMapped,
+	BH_Fragment,
 };
 
 /* Has a disk mapping */
@@ -394,6 +402,8 @@ enum {
 #define EROFS_MAP_ENCODED	(1 << BH_Encoded)
 /* The length of extent is full */
 #define EROFS_MAP_FULL_MAPPED	(1 << BH_FullMapped)
+/* Located in the special packed inode */
+#define EROFS_MAP_FRAGMENT	(1 << BH_Fragment)
 
 struct erofs_map_blocks {
 	struct erofs_buf buf;
@@ -415,7 +425,7 @@ struct erofs_map_blocks {
 #define EROFS_GET_BLOCKS_FIEMAP	0x0002
 /* Used to map the whole extent if non-negligible data is requested for LZMA */
 #define EROFS_GET_BLOCKS_READMORE	0x0004
-/* Used to map tail extent for tailpacking inline pcluster */
+/* Used to map tail extent for tailpacking inline or fragment pcluster */
 #define EROFS_GET_BLOCKS_FINDTAIL	0x0008
 
 enum {
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 9f7fe6c04e65..ce20562ca91f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -381,6 +381,17 @@ static int erofs_read_superblock(struct super_block *sb)
 #endif
 	sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
 	sbi->root_nid = le16_to_cpu(dsb->root_nid);
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->packed_inode = NULL;
+	if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) {
+		sbi->packed_inode =
+			erofs_iget(sb, le64_to_cpu(dsb->packed_nid), false);
+		if (IS_ERR(sbi->packed_inode)) {
+			ret = PTR_ERR(sbi->packed_inode);
+			goto out;
+		}
+	}
+#endif
 	sbi->inos = le64_to_cpu(dsb->inos);
 
 	sbi->build_time = le64_to_cpu(dsb->build_time);
@@ -411,6 +422,8 @@ static int erofs_read_superblock(struct super_block *sb)
 		erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
 	if (erofs_is_fscache_mode(sb))
 		erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+	if (erofs_sb_has_fragments(sbi))
+		erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
 out:
 	erofs_put_metabuf(&buf);
 	return ret;
@@ -947,6 +960,8 @@ static void erofs_put_super(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 	iput(sbi->managed_cache);
 	sbi->managed_cache = NULL;
+	iput(sbi->packed_inode);
+	sbi->packed_inode = NULL;
 #endif
 	erofs_fscache_unregister_fs(sb);
 }
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 341fb43ad587..dd6eb7eccf9a 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -76,6 +76,7 @@ EROFS_ATTR_FEATURE(device_table);
 EROFS_ATTR_FEATURE(compr_head2);
 EROFS_ATTR_FEATURE(sb_chksum);
 EROFS_ATTR_FEATURE(ztailpacking);
+EROFS_ATTR_FEATURE(fragments);
 
 static struct attribute *erofs_feat_attrs[] = {
 	ATTR_LIST(zero_padding),
@@ -86,6 +87,7 @@ static struct attribute *erofs_feat_attrs[] = {
 	ATTR_LIST(compr_head2),
 	ATTR_LIST(sb_chksum),
 	ATTR_LIST(ztailpacking),
+	ATTR_LIST(fragments),
 	NULL,
 };
 ATTRIBUTE_GROUPS(erofs_feat);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 5792ca9e0d5e..c92a72f5bca6 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -650,6 +650,35 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
 		la < fe->headoffset;
 }
 
+static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
+				 struct page *page, unsigned int pageofs,
+				 unsigned int len)
+{
+	struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+	u8 *src, *dst;
+	unsigned int i, cnt;
+
+	pos += EROFS_I(inode)->z_fragmentoff;
+	for (i = 0; i < len; i += cnt) {
+		cnt = min_t(unsigned int, len - i,
+			    EROFS_BLKSIZ - erofs_blkoff(pos));
+		src = erofs_bread(&buf, packed_inode,
+				  erofs_blknr(pos), EROFS_KMAP);
+		if (IS_ERR(src)) {
+			erofs_put_metabuf(&buf);
+			return PTR_ERR(src);
+		}
+
+		dst = kmap_local_page(page);
+		memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
+		kunmap_local(dst);
+		pos += cnt;
+	}
+	erofs_put_metabuf(&buf);
+	return 0;
+}
+
 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
 				struct page *page, struct page **pagepool)
 {
@@ -688,7 +717,8 @@ repeat:
 		/* didn't get a valid pcluster previously (very rare) */
 	}
 
-	if (!(map->m_flags & EROFS_MAP_MAPPED))
+	if (!(map->m_flags & EROFS_MAP_MAPPED) ||
+	    map->m_flags & EROFS_MAP_FRAGMENT)
 		goto hitted;
 
 	err = z_erofs_collector_begin(fe);
@@ -735,6 +765,24 @@ hitted:
 		zero_user_segment(page, cur, end);
 		goto next_part;
 	}
+	if (map->m_flags & EROFS_MAP_FRAGMENT) {
+		unsigned int pageofs, skip, len;
+
+		if (offset > map->m_la) {
+			pageofs = 0;
+			skip = offset - map->m_la;
+		} else {
+			pageofs = map->m_la & ~PAGE_MASK;
+			skip = 0;
+		}
+		len = min_t(unsigned int, map->m_llen - skip, end - cur);
+		err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
+		if (err)
+			goto out;
+		++spiltted;
+		tight = false;
+		goto next_part;
+	}
 
 	exclusive = (!cur && (!spiltted || tight));
 	if (cur)
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7196235a441c..d1723910251c 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,7 +17,7 @@ int z_erofs_fill_inode(struct inode *inode)
 	struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
 
 	if (!erofs_sb_has_big_pcluster(sbi) &&
-	    !erofs_sb_has_ztailpacking(sbi) &&
+	    !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) &&
 	    vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
 		vi->z_advise = 0;
 		vi->z_algorithmtype[0] = 0;
@@ -55,10 +55,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
 		goto out_unlock;
 
-	DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
-		  !erofs_sb_has_ztailpacking(EROFS_SB(sb)) &&
-		  vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
-
 	pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
 		    vi->xattr_isize, 8);
 	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos),
@@ -69,6 +65,16 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 	}
 
 	h = kaddr + erofs_blkoff(pos);
+	/*
+	 * if the highest bit of the 8-byte map header is set, the whole file
+	 * is stored in the packed inode. The rest bits keeps z_fragmentoff.
+	 */
+	if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+		vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+		vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+		vi->z_tailextent_headlcn = 0;
+		goto unmap_done;
+	}
 	vi->z_advise = le16_to_cpu(h->h_advise);
 	vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
 	vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
@@ -123,6 +129,20 @@ unmap_done:
 		if (err < 0)
 			goto out_unlock;
 	}
+
+	if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+	    !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+		struct erofs_map_blocks map = {
+			.buf = __EROFS_BUF_INITIALIZER
+		};
+
+		vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+		err = z_erofs_do_map_blocks(inode, &map,
+					    EROFS_GET_BLOCKS_FINDTAIL);
+		erofs_put_metabuf(&map.buf);
+		if (err < 0)
+			goto out_unlock;
+	}
 	/* paired with smp_mb() at the beginning of the function */
 	smp_mb();
 	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -598,6 +618,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
 {
 	struct erofs_inode *const vi = EROFS_I(inode);
 	bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+	bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
 	struct z_erofs_maprecorder m = {
 		.inode = inode,
 		.map = map,
@@ -666,12 +687,19 @@ static int z_erofs_do_map_blocks(struct inode *inode,
 
 	map->m_llen = end - map->m_la;
 
-	if (flags & EROFS_GET_BLOCKS_FINDTAIL)
+	if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
 		vi->z_tailextent_headlcn = m.lcn;
+		/* for non-compact indexes, fragmentoff is 64 bits */
+		if (fragment &&
+		    vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+			vi->z_fragmentoff |= (u64)m.pblk << 32;
+	}
 	if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
 		map->m_flags |= EROFS_MAP_META;
 		map->m_pa = vi->z_idataoff;
 		map->m_plen = vi->z_idata_size;
+	} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
+		map->m_flags |= EROFS_MAP_FRAGMENT;
 	} else {
 		map->m_pa = blknr_to_addr(m.pblk);
 		err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
@@ -715,6 +743,7 @@ int z_erofs_map_blocks_iter(struct inode *inode,
 			    struct erofs_map_blocks *map,
 			    int flags)
 {
+	struct erofs_inode *const vi = EROFS_I(inode);
 	int err = 0;
 
 	trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
@@ -731,6 +760,15 @@ int z_erofs_map_blocks_iter(struct inode *inode,
 	if (err)
 		goto out;
 
+	if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+	    !vi->z_tailextent_headlcn) {
+		map->m_la = 0;
+		map->m_llen = inode->i_size;
+		map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
+				EROFS_MAP_FRAGMENT;
+		goto out;
+	}
+
 	err = z_erofs_do_map_blocks(inode, map, flags);
 out:
 	trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
@@ -757,7 +795,8 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
 	iomap->length = map.m_llen;
 	if (map.m_flags & EROFS_MAP_MAPPED) {
 		iomap->type = IOMAP_MAPPED;
-		iomap->addr = map.m_pa;
+		iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+			      IOMAP_NULL_ADDR : map.m_pa;
 	} else {
 		iomap->type = IOMAP_HOLE;
 		iomap->addr = IOMAP_NULL_ADDR;
-- 
cgit v1.2.3


From 5c2a64252c5dc4cfe78e5b2a531c118894e3d155 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Fri, 23 Sep 2022 09:49:15 +0800
Subject: erofs: introduce partial-referenced pclusters

Due to deduplication for compressed data, pclusters can be partially
referenced with their prefixes.

Together with the user-space implementation, it enables EROFS
variable-length global compressed data deduplication with rolling
hash.

Link: https://lore.kernel.org/r/20220923014915.4362-1-hsiangkao@linux.alibaba.com
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/decompressor_lzma.c | 3 +++
 fs/erofs/erofs_fs.h          | 7 ++++++-
 fs/erofs/internal.h          | 4 ++++
 fs/erofs/super.c             | 2 ++
 fs/erofs/sysfs.c             | 2 ++
 fs/erofs/zdata.c             | 1 +
 fs/erofs/zmap.c              | 6 +++++-
 7 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 5e59b3f523eb..091fd5adf818 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -217,6 +217,9 @@ again:
 			strm->buf.out_size = min_t(u32, outlen,
 						   PAGE_SIZE - pageofs);
 			outlen -= strm->buf.out_size;
+			if (!rq->out[no] && rq->fillgaps)	/* deduped */
+				rq->out[no] = erofs_allocpage(pagepool,
+						GFP_KERNEL | __GFP_NOFAIL);
 			if (rq->out[no])
 				strm->buf.out = kmap(rq->out[no]) + pageofs;
 			pageofs = 0;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index b5d763aa8ff0..dbcd24371002 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -26,6 +26,7 @@
 #define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2	0x00000008
 #define EROFS_FEATURE_INCOMPAT_ZTAILPACKING	0x00000010
 #define EROFS_FEATURE_INCOMPAT_FRAGMENTS	0x00000020
+#define EROFS_FEATURE_INCOMPAT_DEDUPE		0x00000020
 #define EROFS_ALL_FEATURE_INCOMPAT		\
 	(EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
 	 EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
@@ -34,7 +35,8 @@
 	 EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
 	 EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
 	 EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
-	 EROFS_FEATURE_INCOMPAT_FRAGMENTS)
+	 EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
+	 EROFS_FEATURE_INCOMPAT_DEDUPE)
 
 #define EROFS_SB_EXTSLOT_SIZE	16
 
@@ -371,6 +373,9 @@ enum {
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS        2
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT         0
 
+/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
+#define Z_EROFS_VLE_DI_PARTIAL_REF		(1 << 15)
+
 /*
  * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
  * compressed block count of a compressed extent (in logical clusters, aka.
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 9f89c1da6229..a6333c283e3d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -291,6 +291,7 @@ EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
 EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
 EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
 EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
+EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
 EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
 
 /* atomic flag definitions */
@@ -392,6 +393,7 @@ enum {
 	BH_Encoded = BH_PrivateStart,
 	BH_FullMapped,
 	BH_Fragment,
+	BH_Partialref,
 };
 
 /* Has a disk mapping */
@@ -404,6 +406,8 @@ enum {
 #define EROFS_MAP_FULL_MAPPED	(1 << BH_FullMapped)
 /* Located in the special packed inode */
 #define EROFS_MAP_FRAGMENT	(1 << BH_Fragment)
+/* The extent refers to partial decompressed data */
+#define EROFS_MAP_PARTIAL_REF	(1 << BH_Partialref)
 
 struct erofs_map_blocks {
 	struct erofs_buf buf;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ce20562ca91f..8040534ae5c0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -424,6 +424,8 @@ static int erofs_read_superblock(struct super_block *sb)
 		erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
 	if (erofs_sb_has_fragments(sbi))
 		erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
+	if (erofs_sb_has_dedupe(sbi))
+		erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
 out:
 	erofs_put_metabuf(&buf);
 	return ret;
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index dd6eb7eccf9a..783bb7b21b51 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -77,6 +77,7 @@ EROFS_ATTR_FEATURE(compr_head2);
 EROFS_ATTR_FEATURE(sb_chksum);
 EROFS_ATTR_FEATURE(ztailpacking);
 EROFS_ATTR_FEATURE(fragments);
+EROFS_ATTR_FEATURE(dedupe);
 
 static struct attribute *erofs_feat_attrs[] = {
 	ATTR_LIST(zero_padding),
@@ -88,6 +89,7 @@ static struct attribute *erofs_feat_attrs[] = {
 	ATTR_LIST(sb_chksum),
 	ATTR_LIST(ztailpacking),
 	ATTR_LIST(fragments),
+	ATTR_LIST(dedupe),
 	NULL,
 };
 ATTRIBUTE_GROUPS(erofs_feat);
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index c92a72f5bca6..cce56dde135c 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -814,6 +814,7 @@ retry:
 		fe->pcl->multibases = true;
 
 	if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+	    !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
 	    fe->pcl->length == map->m_llen)
 		fe->pcl->partial = false;
 	if (fe->pcl->length < offset + end - map->m_la) {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index d1723910251c..ccdddb755be8 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -163,6 +163,7 @@ struct z_erofs_maprecorder {
 	u16 delta[2];
 	erofs_blk_t pblk, compressedblks;
 	erofs_off_t nextpackoff;
+	bool partialref;
 };
 
 static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
@@ -221,6 +222,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
 	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
 	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+		if (advise & Z_EROFS_VLE_DI_PARTIAL_REF)
+			m->partialref = true;
 		m->clusterofs = le16_to_cpu(di->di_clusterofs);
 		m->pblk = le32_to_cpu(di->di_u.blkaddr);
 		break;
@@ -684,7 +687,8 @@ static int z_erofs_do_map_blocks(struct inode *inode,
 		err = -EOPNOTSUPP;
 		goto unmap_out;
 	}
-
+	if (m.partialref)
+		map->m_flags |= EROFS_MAP_PARTIAL_REF;
 	map->m_llen = end - map->m_la;
 
 	if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
-- 
cgit v1.2.3


From f143ff397a3f991e8b48542f77aad900845f436e Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:43 -0700
Subject: treewide: Filter out CC_FLAGS_CFI

In preparation for removing CC_FLAGS_CFI from CC_FLAGS_LTO, explicitly
filter out CC_FLAGS_CFI in all the makefiles where we currently filter
out CC_FLAGS_LTO.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-2-samitolvanen@google.com
---
 arch/arm64/kernel/vdso/Makefile       | 3 ++-
 arch/x86/entry/vdso/Makefile          | 3 ++-
 drivers/firmware/efi/libstub/Makefile | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index bafbf78fab77..619e2dc7ee14 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -40,7 +40,8 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 # kernel with CONFIG_WERROR enabled.
 CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) \
 				$(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) \
-				$(CC_FLAGS_LTO) -Wmissing-prototypes -Wmissing-declarations
+				$(CC_FLAGS_LTO) $(CC_FLAGS_CFI) \
+				-Wmissing-prototypes -Wmissing-declarations
 KASAN_SANITIZE			:= n
 KCSAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 12f6c4d714cd..381d3333b996 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
 endif
 endif
 
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
 $(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
 
 #
@@ -153,6 +153,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
 KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
 KBUILD_CFLAGS_32 += -fno-stack-protector
 KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index d0537573501e..234fb2910622 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -39,6 +39,8 @@ KBUILD_CFLAGS			:= $(cflags-y) -Os -DDISABLE_BRANCH_PROFILING \
 
 # remove SCS flags from all objects in this directory
 KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS))
+# disable CFI
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_CFI), $(KBUILD_CFLAGS))
 # disable LTO
 KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS))
 
-- 
cgit v1.2.3


From d0f9562ee43a135b941715d9e5e607de88898aca Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:44 -0700
Subject: scripts/kallsyms: Ignore __kcfi_typeid_

The compiler generates __kcfi_typeid_ symbols for annotating assembly
functions with type information. These are constants that can be
referenced in assembly code and are resolved by the linker. Ignore
them in kallsyms.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-3-samitolvanen@google.com
---
 scripts/kallsyms.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index f18e6dfc68c5..ccdf0c897f31 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -119,6 +119,7 @@ static bool is_ignored_symbol(const char *name, char type)
 		"__ThumbV7PILongThunk_",
 		"__LA25Thunk_",		/* mips lld */
 		"__microLA25Thunk_",
+		"__kcfi_typeid_",	/* CFI type identifiers */
 		NULL
 	};
 
-- 
cgit v1.2.3


From 9fca7115827b2e5f48d84e50bceb4edfd4cb6375 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:45 -0700
Subject: cfi: Remove CONFIG_CFI_CLANG_SHADOW

In preparation to switching to -fsanitize=kcfi, remove support for the
CFI module shadow that will no longer be needed.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-4-samitolvanen@google.com
---
 arch/Kconfig         |  10 ---
 include/linux/cfi.h  |  12 ---
 kernel/cfi.c         | 236 +--------------------------------------------------
 kernel/module/main.c |  15 ----
 4 files changed, 1 insertion(+), 272 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 5dbf11a5ba4e..5fd875e18c99 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -754,16 +754,6 @@ config CFI_CLANG
 
 	    https://clang.llvm.org/docs/ControlFlowIntegrity.html
 
-config CFI_CLANG_SHADOW
-	bool "Use CFI shadow to speed up cross-module checks"
-	default y
-	depends on CFI_CLANG && MODULES
-	help
-	  If you select this option, the kernel builds a fast look-up table of
-	  CFI check functions in loaded modules to reduce performance overhead.
-
-	  If unsure, say Y.
-
 config CFI_PERMISSIVE
 	bool "Use CFI in permissive mode"
 	depends on CFI_CLANG
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index c6dfc1ed0626..4ab51c067007 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -20,18 +20,6 @@ extern void __cfi_check(uint64_t id, void *ptr, void *diag);
 #define __CFI_ADDRESSABLE(fn, __attr) \
 	const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn
 
-#ifdef CONFIG_CFI_CLANG_SHADOW
-
-extern void cfi_module_add(struct module *mod, unsigned long base_addr);
-extern void cfi_module_remove(struct module *mod, unsigned long base_addr);
-
-#else
-
-static inline void cfi_module_add(struct module *mod, unsigned long base_addr) {}
-static inline void cfi_module_remove(struct module *mod, unsigned long base_addr) {}
-
-#endif /* CONFIG_CFI_CLANG_SHADOW */
-
 #else /* !CONFIG_CFI_CLANG */
 
 #ifdef CONFIG_X86_KERNEL_IBT
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 2046276ee234..e8bc1b370edc 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -32,237 +32,6 @@ static inline void handle_cfi_failure(void *ptr)
 }
 
 #ifdef CONFIG_MODULES
-#ifdef CONFIG_CFI_CLANG_SHADOW
-/*
- * Index type. A 16-bit index can address at most (2^16)-2 pages (taking
- * into account SHADOW_INVALID), i.e. ~256M with 4k pages.
- */
-typedef u16 shadow_t;
-#define SHADOW_INVALID		((shadow_t)~0UL)
-
-struct cfi_shadow {
-	/* Page index for the beginning of the shadow */
-	unsigned long base;
-	/* An array of __cfi_check locations (as indices to the shadow) */
-	shadow_t shadow[1];
-} __packed;
-
-/*
- * The shadow covers ~128M from the beginning of the module region. If
- * the region is larger, we fall back to __module_address for the rest.
- */
-#define __SHADOW_RANGE		(_UL(SZ_128M) >> PAGE_SHIFT)
-
-/* The in-memory size of struct cfi_shadow, always at least one page */
-#define __SHADOW_PAGES		((__SHADOW_RANGE * sizeof(shadow_t)) >> PAGE_SHIFT)
-#define SHADOW_PAGES		max(1UL, __SHADOW_PAGES)
-#define SHADOW_SIZE		(SHADOW_PAGES << PAGE_SHIFT)
-
-/* The actual size of the shadow array, minus metadata */
-#define SHADOW_ARR_SIZE		(SHADOW_SIZE - offsetof(struct cfi_shadow, shadow))
-#define SHADOW_ARR_SLOTS	(SHADOW_ARR_SIZE / sizeof(shadow_t))
-
-static DEFINE_MUTEX(shadow_update_lock);
-static struct cfi_shadow __rcu *cfi_shadow __read_mostly;
-
-/* Returns the index in the shadow for the given address */
-static inline int ptr_to_shadow(const struct cfi_shadow *s, unsigned long ptr)
-{
-	unsigned long index;
-	unsigned long page = ptr >> PAGE_SHIFT;
-
-	if (unlikely(page < s->base))
-		return -1; /* Outside of module area */
-
-	index = page - s->base;
-
-	if (index >= SHADOW_ARR_SLOTS)
-		return -1; /* Cannot be addressed with shadow */
-
-	return (int)index;
-}
-
-/* Returns the page address for an index in the shadow */
-static inline unsigned long shadow_to_ptr(const struct cfi_shadow *s,
-	int index)
-{
-	if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
-		return 0;
-
-	return (s->base + index) << PAGE_SHIFT;
-}
-
-/* Returns the __cfi_check function address for the given shadow location */
-static inline unsigned long shadow_to_check_fn(const struct cfi_shadow *s,
-	int index)
-{
-	if (unlikely(index < 0 || index >= SHADOW_ARR_SLOTS))
-		return 0;
-
-	if (unlikely(s->shadow[index] == SHADOW_INVALID))
-		return 0;
-
-	/* __cfi_check is always page aligned */
-	return (s->base + s->shadow[index]) << PAGE_SHIFT;
-}
-
-static void prepare_next_shadow(const struct cfi_shadow __rcu *prev,
-		struct cfi_shadow *next)
-{
-	int i, index, check;
-
-	/* Mark everything invalid */
-	memset(next->shadow, 0xFF, SHADOW_ARR_SIZE);
-
-	if (!prev)
-		return; /* No previous shadow */
-
-	/* If the base address didn't change, an update is not needed */
-	if (prev->base == next->base) {
-		memcpy(next->shadow, prev->shadow, SHADOW_ARR_SIZE);
-		return;
-	}
-
-	/* Convert the previous shadow to the new address range */
-	for (i = 0; i < SHADOW_ARR_SLOTS; ++i) {
-		if (prev->shadow[i] == SHADOW_INVALID)
-			continue;
-
-		index = ptr_to_shadow(next, shadow_to_ptr(prev, i));
-		if (index < 0)
-			continue;
-
-		check = ptr_to_shadow(next,
-				shadow_to_check_fn(prev, prev->shadow[i]));
-		if (check < 0)
-			continue;
-
-		next->shadow[index] = (shadow_t)check;
-	}
-}
-
-static void add_module_to_shadow(struct cfi_shadow *s, struct module *mod,
-			unsigned long min_addr, unsigned long max_addr)
-{
-	int check_index;
-	unsigned long check = (unsigned long)mod->cfi_check;
-	unsigned long ptr;
-
-	if (unlikely(!PAGE_ALIGNED(check))) {
-		pr_warn("cfi: not using shadow for module %s\n", mod->name);
-		return;
-	}
-
-	check_index = ptr_to_shadow(s, check);
-	if (check_index < 0)
-		return; /* Module not addressable with shadow */
-
-	/* For each page, store the check function index in the shadow */
-	for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
-		int index = ptr_to_shadow(s, ptr);
-
-		if (index >= 0) {
-			/* Each page must only contain one module */
-			WARN_ON_ONCE(s->shadow[index] != SHADOW_INVALID);
-			s->shadow[index] = (shadow_t)check_index;
-		}
-	}
-}
-
-static void remove_module_from_shadow(struct cfi_shadow *s, struct module *mod,
-		unsigned long min_addr, unsigned long max_addr)
-{
-	unsigned long ptr;
-
-	for (ptr = min_addr; ptr <= max_addr; ptr += PAGE_SIZE) {
-		int index = ptr_to_shadow(s, ptr);
-
-		if (index >= 0)
-			s->shadow[index] = SHADOW_INVALID;
-	}
-}
-
-typedef void (*update_shadow_fn)(struct cfi_shadow *, struct module *,
-			unsigned long min_addr, unsigned long max_addr);
-
-static void update_shadow(struct module *mod, unsigned long base_addr,
-		update_shadow_fn fn)
-{
-	struct cfi_shadow *prev;
-	struct cfi_shadow *next;
-	unsigned long min_addr, max_addr;
-
-	next = vmalloc(SHADOW_SIZE);
-
-	mutex_lock(&shadow_update_lock);
-	prev = rcu_dereference_protected(cfi_shadow,
-					 mutex_is_locked(&shadow_update_lock));
-
-	if (next) {
-		next->base = base_addr >> PAGE_SHIFT;
-		prepare_next_shadow(prev, next);
-
-		min_addr = (unsigned long)mod->core_layout.base;
-		max_addr = min_addr + mod->core_layout.text_size;
-		fn(next, mod, min_addr & PAGE_MASK, max_addr & PAGE_MASK);
-
-		set_memory_ro((unsigned long)next, SHADOW_PAGES);
-	}
-
-	rcu_assign_pointer(cfi_shadow, next);
-	mutex_unlock(&shadow_update_lock);
-	synchronize_rcu();
-
-	if (prev) {
-		set_memory_rw((unsigned long)prev, SHADOW_PAGES);
-		vfree(prev);
-	}
-}
-
-void cfi_module_add(struct module *mod, unsigned long base_addr)
-{
-	update_shadow(mod, base_addr, add_module_to_shadow);
-}
-
-void cfi_module_remove(struct module *mod, unsigned long base_addr)
-{
-	update_shadow(mod, base_addr, remove_module_from_shadow);
-}
-
-static inline cfi_check_fn ptr_to_check_fn(const struct cfi_shadow __rcu *s,
-	unsigned long ptr)
-{
-	int index;
-
-	if (unlikely(!s))
-		return NULL; /* No shadow available */
-
-	index = ptr_to_shadow(s, ptr);
-	if (index < 0)
-		return NULL; /* Cannot be addressed with shadow */
-
-	return (cfi_check_fn)shadow_to_check_fn(s, index);
-}
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
-	cfi_check_fn fn;
-
-	rcu_read_lock_sched_notrace();
-	fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
-	rcu_read_unlock_sched_notrace();
-
-	return fn;
-}
-
-#else /* !CONFIG_CFI_CLANG_SHADOW */
-
-static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
-{
-	return NULL;
-}
-
-#endif /* CONFIG_CFI_CLANG_SHADOW */
 
 static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
 {
@@ -298,10 +67,7 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr)
 		ct_irq_enter();
 	}
 
-	if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW))
-		fn = find_shadow_check_fn(ptr);
-	if (!fn)
-		fn = find_module_check_fn(ptr);
+	fn = find_module_check_fn(ptr);
 
 	if (rcu_idle) {
 		ct_irq_exit();
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a4e4d84b6f4e..0228f44b58e5 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1144,8 +1144,6 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
-static void cfi_cleanup(struct module *mod);
-
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
@@ -1190,9 +1188,6 @@ static void free_module(struct module *mod)
 		       mod->name);
 	mutex_unlock(&module_mutex);
 
-	/* Clean up CFI for the module. */
-	cfi_cleanup(mod);
-
 	/* This may be empty, but that's OK */
 	module_arch_freeing_init(mod);
 	module_memfree(mod->init_layout.base);
@@ -2875,7 +2870,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	synchronize_rcu();
 	kfree(mod->args);
  free_arch_cleanup:
-	cfi_cleanup(mod);
 	module_arch_cleanup(mod);
  free_modinfo:
 	free_modinfo(mod);
@@ -2984,15 +2978,6 @@ static void cfi_init(struct module *mod)
 		mod->exit = *exit;
 #endif
 	rcu_read_unlock_sched();
-
-	cfi_module_add(mod, mod_tree.addr_min);
-#endif
-}
-
-static void cfi_cleanup(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
-	cfi_module_remove(mod, mod_tree.addr_min);
 #endif
 }
 
-- 
cgit v1.2.3


From 92efda8eb15295a07f450828b2db14485bfc09c2 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:46 -0700
Subject: cfi: Drop __CFI_ADDRESSABLE

The __CFI_ADDRESSABLE macro is used for init_module and cleanup_module
to ensure we have the address of the CFI jump table, and with
CONFIG_X86_KERNEL_IBT to ensure LTO won't optimize away the symbols.
As __CFI_ADDRESSABLE is no longer necessary with -fsanitize=kcfi, add
a more flexible version of the __ADDRESSABLE macro and always ensure
these symbols won't be dropped.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-5-samitolvanen@google.com
---
 include/linux/cfi.h      | 20 --------------------
 include/linux/compiler.h |  6 ++++--
 include/linux/module.h   |  4 ++--
 3 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 4ab51c067007..2cdbc0fbd0ab 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -13,26 +13,6 @@ typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag);
 /* Compiler-generated function in each module, and the kernel */
 extern void __cfi_check(uint64_t id, void *ptr, void *diag);
 
-/*
- * Force the compiler to generate a CFI jump table entry for a function
- * and store the jump table address to __cfi_jt_<function>.
- */
-#define __CFI_ADDRESSABLE(fn, __attr) \
-	const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn
-
-#else /* !CONFIG_CFI_CLANG */
-
-#ifdef CONFIG_X86_KERNEL_IBT
-
-#define __CFI_ADDRESSABLE(fn, __attr) \
-	const void *__cfi_jt_ ## fn __visible __attr = (void *)&fn
-
-#endif /* CONFIG_X86_KERNEL_IBT */
-
 #endif /* CONFIG_CFI_CLANG */
 
-#ifndef __CFI_ADDRESSABLE
-#define __CFI_ADDRESSABLE(fn, __attr)
-#endif
-
 #endif /* _LINUX_CFI_H */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 7713d7bcdaea..7bfafc69172a 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -221,9 +221,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
  * otherwise, or eliminated entirely due to lack of references that are
  * visible to the compiler.
  */
-#define __ADDRESSABLE(sym) \
-	static void * __section(".discard.addressable") __used \
+#define ___ADDRESSABLE(sym, __attrs) \
+	static void * __used __attrs \
 		__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
+#define __ADDRESSABLE(sym) \
+	___ADDRESSABLE(sym, __section(".discard.addressable"))
 
 /**
  * offset_to_ptr - convert a relative memory offset to an absolute pointer
diff --git a/include/linux/module.h b/include/linux/module.h
index 518296ea7f73..8937b020ec04 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -132,7 +132,7 @@ extern void cleanup_module(void);
 	{ return initfn; }					\
 	int init_module(void) __copy(initfn)			\
 		__attribute__((alias(#initfn)));		\
-	__CFI_ADDRESSABLE(init_module, __initdata);
+	___ADDRESSABLE(init_module, __initdata);
 
 /* This is only required if you want to be unloadable. */
 #define module_exit(exitfn)					\
@@ -140,7 +140,7 @@ extern void cleanup_module(void);
 	{ return exitfn; }					\
 	void cleanup_module(void) __copy(exitfn)		\
 		__attribute__((alias(#exitfn)));		\
-	__CFI_ADDRESSABLE(cleanup_module, __exitdata);
+	___ADDRESSABLE(cleanup_module, __exitdata);
 
 #endif
 
-- 
cgit v1.2.3


From 89245600941e4e0f87d77f60ee269b5e61ef4e49 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:47 -0700
Subject: cfi: Switch to -fsanitize=kcfi

Switch from Clang's original forward-edge control-flow integrity
implementation to -fsanitize=kcfi, which is better suited for the
kernel, as it doesn't require LTO, doesn't use a jump table that
requires altering function references, and won't break cross-module
function address equality.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-6-samitolvanen@google.com
---
 Makefile                          |  13 +---
 arch/Kconfig                      |   8 ++-
 include/asm-generic/vmlinux.lds.h |  37 +++++-----
 include/linux/cfi.h               |  29 ++++++--
 include/linux/compiler-clang.h    |  14 +---
 include/linux/module.h            |   6 +-
 kernel/cfi.c                      | 144 ++++++++++++++++++--------------------
 kernel/module/main.c              |  35 +--------
 scripts/module.lds.S              |  23 ++----
 9 files changed, 133 insertions(+), 176 deletions(-)

diff --git a/Makefile b/Makefile
index a4f71076cacb..43e08c9f95e9 100644
--- a/Makefile
+++ b/Makefile
@@ -921,18 +921,7 @@ export CC_FLAGS_LTO
 endif
 
 ifdef CONFIG_CFI_CLANG
-CC_FLAGS_CFI	:= -fsanitize=cfi \
-		   -fsanitize-cfi-cross-dso \
-		   -fno-sanitize-cfi-canonical-jump-tables \
-		   -fno-sanitize-trap=cfi \
-		   -fno-sanitize-blacklist
-
-ifdef CONFIG_CFI_PERMISSIVE
-CC_FLAGS_CFI	+= -fsanitize-recover=cfi
-endif
-
-# If LTO flags are filtered out, we must also filter out CFI.
-CC_FLAGS_LTO	+= $(CC_FLAGS_CFI)
+CC_FLAGS_CFI	:= -fsanitize=kcfi
 KBUILD_CFLAGS	+= $(CC_FLAGS_CFI)
 export CC_FLAGS_CFI
 endif
diff --git a/arch/Kconfig b/arch/Kconfig
index 5fd875e18c99..1c1eca0c0019 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -738,11 +738,13 @@ config ARCH_SUPPORTS_CFI_CLANG
 	  An architecture should select this option if it can support Clang's
 	  Control-Flow Integrity (CFI) checking.
 
+config ARCH_USES_CFI_TRAPS
+	bool
+
 config CFI_CLANG
 	bool "Use Clang's Control Flow Integrity (CFI)"
-	depends on LTO_CLANG && ARCH_SUPPORTS_CFI_CLANG
-	depends on CLANG_VERSION >= 140000
-	select KALLSYMS
+	depends on ARCH_SUPPORTS_CFI_CLANG
+	depends on $(cc-option,-fsanitize=kcfi)
 	help
 	  This option enables Clang’s forward-edge Control Flow Integrity
 	  (CFI) checking, where the compiler injects a runtime check to each
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7515a465ec03..7501edfce11e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -421,6 +421,22 @@
 	__end_ro_after_init = .;
 #endif
 
+/*
+ * .kcfi_traps contains a list KCFI trap locations.
+ */
+#ifndef KCFI_TRAPS
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+#define KCFI_TRAPS							\
+	__kcfi_traps : AT(ADDR(__kcfi_traps) - LOAD_OFFSET) {		\
+		__start___kcfi_traps = .;				\
+		KEEP(*(.kcfi_traps))					\
+		__stop___kcfi_traps = .;				\
+	}
+#else
+#define KCFI_TRAPS
+#endif
+#endif
+
 /*
  * Read only Data
  */
@@ -529,6 +545,8 @@
 		__stop___modver = .;					\
 	}								\
 									\
+	KCFI_TRAPS							\
+									\
 	RO_EXCEPTION_TABLE						\
 	NOTES								\
 	BTF								\
@@ -537,21 +555,6 @@
 	__end_rodata = .;
 
 
-/*
- * .text..L.cfi.jumptable.* contain Control-Flow Integrity (CFI)
- * jump table entries.
- */
-#ifdef CONFIG_CFI_CLANG
-#define TEXT_CFI_JT							\
-		. = ALIGN(PMD_SIZE);					\
-		__cfi_jt_start = .;					\
-		*(.text..L.cfi.jumptable .text..L.cfi.jumptable.*)	\
-		. = ALIGN(PMD_SIZE);					\
-		__cfi_jt_end = .;
-#else
-#define TEXT_CFI_JT
-#endif
-
 /*
  * Non-instrumentable text section
  */
@@ -579,7 +582,6 @@
 		*(.text..refcount)					\
 		*(.ref.text)						\
 		*(.text.asan.* .text.tsan.*)				\
-		TEXT_CFI_JT						\
 	MEM_KEEP(init.text*)						\
 	MEM_KEEP(exit.text*)						\
 
@@ -1008,8 +1010,7 @@
  * keep any .init_array.* sections.
  * https://bugs.llvm.org/show_bug.cgi?id=46478
  */
-#if defined(CONFIG_GCOV_KERNEL) || defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN) || \
-	defined(CONFIG_CFI_CLANG)
+#if defined(CONFIG_GCOV_KERNEL) || defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KCSAN)
 # ifdef CONFIG_CONSTRUCTORS
 #  define SANITIZER_DISCARDS						\
 	*(.eh_frame)
diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 2cdbc0fbd0ab..5e134f4ce8b7 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -2,17 +2,38 @@
 /*
  * Clang Control Flow Integrity (CFI) support.
  *
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
  */
 #ifndef _LINUX_CFI_H
 #define _LINUX_CFI_H
 
+#include <linux/bug.h>
+#include <linux/module.h>
+
 #ifdef CONFIG_CFI_CLANG
-typedef void (*cfi_check_fn)(uint64_t id, void *ptr, void *diag);
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+				      unsigned long *target, u32 type);
 
-/* Compiler-generated function in each module, and the kernel */
-extern void __cfi_check(uint64_t id, void *ptr, void *diag);
+static inline enum bug_trap_type report_cfi_failure_noaddr(struct pt_regs *regs,
+							   unsigned long addr)
+{
+	return report_cfi_failure(regs, addr, NULL, 0);
+}
 
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+bool is_cfi_trap(unsigned long addr);
+#endif
 #endif /* CONFIG_CFI_CLANG */
 
+#ifdef CONFIG_MODULES
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+			 struct module *mod);
+#else
+static inline void module_cfi_finalize(const Elf_Ehdr *hdr,
+				       const Elf_Shdr *sechdrs,
+				       struct module *mod) {}
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
+#endif /* CONFIG_MODULES */
+
 #endif /* _LINUX_CFI_H */
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index c84fec767445..42e55579d649 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -66,17 +66,9 @@
 # define __noscs	__attribute__((__no_sanitize__("shadow-call-stack")))
 #endif
 
-#define __nocfi		__attribute__((__no_sanitize__("cfi")))
-#define __cficanonical	__attribute__((__cfi_canonical_jump_table__))
-
-#if defined(CONFIG_CFI_CLANG)
-/*
- * With CONFIG_CFI_CLANG, the compiler replaces function address
- * references with the address of the function's CFI jump table
- * entry. The function_nocfi macro always returns the address of the
- * actual function instead.
- */
-#define function_nocfi(x)	__builtin_function_start(x)
+#if __has_feature(kcfi)
+/* Disable CFI checking inside a function. */
+#define __nocfi		__attribute__((__no_sanitize__("kcfi")))
 #endif
 
 /*
diff --git a/include/linux/module.h b/include/linux/module.h
index 8937b020ec04..ec61fb53979a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -27,7 +27,6 @@
 #include <linux/tracepoint-defs.h>
 #include <linux/srcu.h>
 #include <linux/static_call_types.h>
-#include <linux/cfi.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -387,8 +386,9 @@ struct module {
 	const s32 *crcs;
 	unsigned int num_syms;
 
-#ifdef CONFIG_CFI_CLANG
-	cfi_check_fn cfi_check;
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+	s32 *kcfi_traps;
+	s32 *kcfi_traps_end;
 #endif
 
 	/* Kernel parameters. */
diff --git a/kernel/cfi.c b/kernel/cfi.c
index e8bc1b370edc..08caad776717 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -1,105 +1,101 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Clang Control Flow Integrity (CFI) error and slowpath handling.
+ * Clang Control Flow Integrity (CFI) error handling.
  *
- * Copyright (C) 2021 Google LLC
+ * Copyright (C) 2022 Google LLC
  */
 
-#include <linux/hardirq.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/printk.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate.h>
-#include <linux/vmalloc.h>
-#include <asm/cacheflush.h>
-#include <asm/set_memory.h>
-
-/* Compiler-defined handler names */
-#ifdef CONFIG_CFI_PERMISSIVE
-#define cfi_failure_handler	__ubsan_handle_cfi_check_fail
-#else
-#define cfi_failure_handler	__ubsan_handle_cfi_check_fail_abort
-#endif
-
-static inline void handle_cfi_failure(void *ptr)
+#include <linux/cfi.h>
+
+enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
+				      unsigned long *target, u32 type)
 {
-	if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
-		WARN_RATELIMIT(1, "CFI failure (target: %pS):\n", ptr);
+	if (target)
+		pr_err("CFI failure at %pS (target: %pS; expected type: 0x%08x)\n",
+		       (void *)addr, (void *)*target, type);
 	else
-		panic("CFI failure (target: %pS)\n", ptr);
+		pr_err("CFI failure at %pS (no target information)\n",
+		       (void *)addr);
+
+	if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+		__warn(NULL, 0, (void *)addr, 0, regs, NULL);
+		return BUG_TRAP_TYPE_WARN;
+	}
+
+	return BUG_TRAP_TYPE_BUG;
 }
 
-#ifdef CONFIG_MODULES
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+static inline unsigned long trap_address(s32 *p)
+{
+	return (unsigned long)((long)p + (long)*p);
+}
 
-static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
+static bool is_trap(unsigned long addr, s32 *start, s32 *end)
 {
-	cfi_check_fn fn = NULL;
-	struct module *mod;
+	s32 *p;
 
-	rcu_read_lock_sched_notrace();
-	mod = __module_address(ptr);
-	if (mod)
-		fn = mod->cfi_check;
-	rcu_read_unlock_sched_notrace();
+	for (p = start; p < end; ++p) {
+		if (trap_address(p) == addr)
+			return true;
+	}
 
-	return fn;
+	return false;
 }
 
-static inline cfi_check_fn find_check_fn(unsigned long ptr)
+#ifdef CONFIG_MODULES
+/* Populates `kcfi_trap(_end)?` fields in `struct module`. */
+void module_cfi_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs,
+			 struct module *mod)
 {
-	cfi_check_fn fn = NULL;
-	unsigned long flags;
-	bool rcu_idle;
-
-	if (is_kernel_text(ptr))
-		return __cfi_check;
-
-	/*
-	 * Indirect call checks can happen when RCU is not watching. Both
-	 * the shadow and __module_address use RCU, so we need to wake it
-	 * up if necessary.
-	 */
-	rcu_idle = !rcu_is_watching();
-	if (rcu_idle) {
-		local_irq_save(flags);
-		ct_irq_enter();
-	}
+	char *secstrings;
+	unsigned int i;
 
-	fn = find_module_check_fn(ptr);
+	mod->kcfi_traps = NULL;
+	mod->kcfi_traps_end = NULL;
 
-	if (rcu_idle) {
-		ct_irq_exit();
-		local_irq_restore(flags);
-	}
+	secstrings = (char *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (strcmp(secstrings + sechdrs[i].sh_name, "__kcfi_traps"))
+			continue;
 
-	return fn;
+		mod->kcfi_traps = (s32 *)sechdrs[i].sh_addr;
+		mod->kcfi_traps_end = (s32 *)(sechdrs[i].sh_addr + sechdrs[i].sh_size);
+		break;
+	}
 }
 
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+static bool is_module_cfi_trap(unsigned long addr)
 {
-	cfi_check_fn fn = find_check_fn((unsigned long)ptr);
+	struct module *mod;
+	bool found = false;
 
-	if (likely(fn))
-		fn(id, ptr, diag);
-	else /* Don't allow unchecked modules */
-		handle_cfi_failure(ptr);
-}
-EXPORT_SYMBOL(__cfi_slowpath_diag);
+	rcu_read_lock_sched_notrace();
 
-#else /* !CONFIG_MODULES */
+	mod = __module_address(addr);
+	if (mod)
+		found = is_trap(addr, mod->kcfi_traps, mod->kcfi_traps_end);
 
-void __cfi_slowpath_diag(uint64_t id, void *ptr, void *diag)
+	rcu_read_unlock_sched_notrace();
+
+	return found;
+}
+#else /* CONFIG_MODULES */
+static inline bool is_module_cfi_trap(unsigned long addr)
 {
-	handle_cfi_failure(ptr); /* No modules */
+	return false;
 }
-EXPORT_SYMBOL(__cfi_slowpath_diag);
-
 #endif /* CONFIG_MODULES */
 
-void cfi_failure_handler(void *data, void *ptr, void *vtable)
+extern s32 __start___kcfi_traps[];
+extern s32 __stop___kcfi_traps[];
+
+bool is_cfi_trap(unsigned long addr)
 {
-	handle_cfi_failure(ptr);
+	if (is_trap(addr, __start___kcfi_traps, __stop___kcfi_traps))
+		return true;
+
+	return is_module_cfi_trap(addr);
 }
-EXPORT_SYMBOL(cfi_failure_handler);
+#endif /* CONFIG_ARCH_USES_CFI_TRAPS */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 0228f44b58e5..70c0b2c6fef8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -53,6 +53,7 @@
 #include <linux/bsearch.h>
 #include <linux/dynamic_debug.h>
 #include <linux/audit.h>
+#include <linux/cfi.h>
 #include <uapi/linux/module.h>
 #include "internal.h"
 
@@ -2597,8 +2598,9 @@ static int complete_formation(struct module *mod, struct load_info *info)
 	if (err < 0)
 		goto out;
 
-	/* This relies on module_mutex for list integrity. */
+	/* These rely on module_mutex for list integrity. */
 	module_bug_finalize(info->hdr, info->sechdrs, mod);
+	module_cfi_finalize(info->hdr, info->sechdrs, mod);
 
 	if (module_check_misalignment(mod))
 		goto out_misaligned;
@@ -2660,8 +2662,6 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
 	return 0;
 }
 
-static void cfi_init(struct module *mod);
-
 /*
  * Allocate and load the module: note that size of section 0 is always
  * zero, and we rely on this for optional sections.
@@ -2791,9 +2791,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	flush_module_icache(mod);
 
-	/* Setup CFI for the module. */
-	cfi_init(mod);
-
 	/* Now copy in args */
 	mod->args = strndup_user(uargs, ~0UL >> 1);
 	if (IS_ERR(mod->args)) {
@@ -2955,32 +2952,6 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
 	return ((void *)addr >= start && (void *)addr < start + size);
 }
 
-static void cfi_init(struct module *mod)
-{
-#ifdef CONFIG_CFI_CLANG
-	initcall_t *init;
-#ifdef CONFIG_MODULE_UNLOAD
-	exitcall_t *exit;
-#endif
-
-	rcu_read_lock_sched();
-	mod->cfi_check = (cfi_check_fn)
-		find_kallsyms_symbol_value(mod, "__cfi_check");
-	init = (initcall_t *)
-		find_kallsyms_symbol_value(mod, "__cfi_jt_init_module");
-	/* Fix init/exit functions to point to the CFI jump table */
-	if (init)
-		mod->init = *init;
-#ifdef CONFIG_MODULE_UNLOAD
-	exit = (exitcall_t *)
-		find_kallsyms_symbol_value(mod, "__cfi_jt_cleanup_module");
-	if (exit)
-		mod->exit = *exit;
-#endif
-	rcu_read_unlock_sched();
-#endif
-}
-
 /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
 char *module_flags(struct module *mod, char *buf, bool show_state)
 {
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index 3a3aa2354ed8..da4bddd26171 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -3,20 +3,10 @@
  * Archs are free to supply their own linker scripts.  ld will
  * combine them automatically.
  */
-#ifdef CONFIG_CFI_CLANG
-# include <asm/page.h>
-# define ALIGN_CFI 		ALIGN(PAGE_SIZE)
-# define SANITIZER_DISCARDS	*(.eh_frame)
-#else
-# define ALIGN_CFI
-# define SANITIZER_DISCARDS
-#endif
-
 SECTIONS {
 	/DISCARD/ : {
 		*(.discard)
 		*(.discard.*)
-		SANITIZER_DISCARDS
 	}
 
 	__ksymtab		0 : { *(SORT(___ksymtab+*)) }
@@ -33,6 +23,10 @@ SECTIONS {
 
 	__patchable_function_entries : { *(__patchable_function_entries) }
 
+#ifdef CONFIG_ARCH_USES_CFI_TRAPS
+	__kcfi_traps 		: { KEEP(*(.kcfi_traps)) }
+#endif
+
 #ifdef CONFIG_LTO_CLANG
 	/*
 	 * With CONFIG_LTO_CLANG, LLD always enables -fdata-sections and
@@ -53,15 +47,6 @@ SECTIONS {
 		*(.rodata .rodata.[0-9a-zA-Z_]*)
 		*(.rodata..L*)
 	}
-
-	/*
-	 * With CONFIG_CFI_CLANG, we assume __cfi_check is at the beginning
-	 * of the .text section, and is aligned to PAGE_SIZE.
-	 */
-	.text : ALIGN_CFI {
-		*(.text.__cfi_check)
-		*(.text .text.[0-9a-zA-Z_]* .text..L.cfi*)
-	}
 #endif
 }
 
-- 
cgit v1.2.3


From e84e008e7b02c015047e76261726da1550130a59 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:48 -0700
Subject: cfi: Add type helper macros

With CONFIG_CFI_CLANG, assembly functions called indirectly
from C code must be annotated with type identifiers to pass CFI
checking.  In order to make this easier, the compiler emits a
__kcfi_typeid_<function> symbol for each address-taken function
declaration in C, which contains the expected type identifier that
we can refer to in assembly code.

Add a typed version of SYM_FUNC_START, which emits the type
identifier before the function. Architectures that support KCFI can
define their own __CFI_TYPE macro to override the default preamble
format.

As an example, for the x86_64 blowfish_dec_blk function, the
compiler emits the following type symbol:

$ readelf -sW vmlinux | grep __kcfi_typeid_blowfish_dec_blk
120204: 00000000ef478db5     0 NOTYPE  WEAK   DEFAULT  ABS
	__kcfi_typeid_blowfish_dec_blk

And SYM_TYPED_FUNC_START will generate the following preamble based
on the __CFI_TYPE definition for the architecture:

$ objdump -dr arch/x86/crypto/blowfish-x86_64-asm_64.o
     ...
0000000000000400 <__cfi_blowfish_dec_blk>:
     ...
     40b:       b8 00 00 00 00          mov    $0x0,%eax
                   40c: R_X86_64_32 __kcfi_typeid_blowfish_dec_blk

0000000000000410 <blowfish_dec_blk>:
     ...

Note that the address of all assembly functions annotated with
SYM_TYPED_FUNC_START must be taken in C code that's linked into the
binary or the missing __kcfi_typeid_ symbol will result in a linker
error with CONFIG_CFI_CLANG. If the code that contains the indirect
call is not always compiled in, __ADDRESSABLE(functionname) can be
used to ensure that the __kcfi_typeid_ symbol is emitted.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-7-samitolvanen@google.com
---
 include/linux/cfi_types.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 include/linux/cfi_types.h

diff --git a/include/linux/cfi_types.h b/include/linux/cfi_types.h
new file mode 100644
index 000000000000..6b8713675765
--- /dev/null
+++ b/include/linux/cfi_types.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Clang Control Flow Integrity (CFI) type definitions.
+ */
+#ifndef _LINUX_CFI_TYPES_H
+#define _LINUX_CFI_TYPES_H
+
+#ifdef __ASSEMBLY__
+#include <linux/linkage.h>
+
+#ifdef CONFIG_CFI_CLANG
+/*
+ * Use the __kcfi_typeid_<function> type identifier symbol to
+ * annotate indirectly called assembly functions. The compiler emits
+ * these symbols for all address-taken function declarations in C
+ * code.
+ */
+#ifndef __CFI_TYPE
+#define __CFI_TYPE(name)				\
+	.4byte __kcfi_typeid_##name
+#endif
+
+#define SYM_TYPED_ENTRY(name, linkage, align...)	\
+	linkage(name) ASM_NL				\
+	align ASM_NL					\
+	__CFI_TYPE(name) ASM_NL				\
+	name:
+
+#define SYM_TYPED_START(name, linkage, align...)	\
+	SYM_TYPED_ENTRY(name, linkage, align)
+
+#else /* CONFIG_CFI_CLANG */
+
+#define SYM_TYPED_START(name, linkage, align...)	\
+	SYM_START(name, linkage, align)
+
+#endif /* CONFIG_CFI_CLANG */
+
+#ifndef SYM_TYPED_FUNC_START
+#define SYM_TYPED_FUNC_START(name) 			\
+	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _LINUX_CFI_TYPES_H */
-- 
cgit v1.2.3


From cf90d0383560de12330de8cf3f831b14cdd45914 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:49 -0700
Subject: lkdtm: Emit an indirect call for CFI tests

Clang can convert the indirect calls in lkdtm_CFI_FORWARD_PROTO into
direct calls. Move the call into a noinline function that accepts the
target address as an argument to ensure the compiler actually emits an
indirect call instead.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-8-samitolvanen@google.com
---
 drivers/misc/lkdtm/cfi.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/lkdtm/cfi.c b/drivers/misc/lkdtm/cfi.c
index 71483cb1e422..5245cf6013c9 100644
--- a/drivers/misc/lkdtm/cfi.c
+++ b/drivers/misc/lkdtm/cfi.c
@@ -20,6 +20,13 @@ static noinline int lkdtm_increment_int(int *counter)
 
 	return *counter;
 }
+
+/* Don't allow the compiler to inline the calls. */
+static noinline void lkdtm_indirect_call(void (*func)(int *))
+{
+	func(&called_count);
+}
+
 /*
  * This tries to call an indirect function with a mismatched prototype.
  */
@@ -29,15 +36,11 @@ static void lkdtm_CFI_FORWARD_PROTO(void)
 	 * Matches lkdtm_increment_void()'s prototype, but not
 	 * lkdtm_increment_int()'s prototype.
 	 */
-	void (*func)(int *);
-
 	pr_info("Calling matched prototype ...\n");
-	func = lkdtm_increment_void;
-	func(&called_count);
+	lkdtm_indirect_call(lkdtm_increment_void);
 
 	pr_info("Calling mismatched prototype ...\n");
-	func = (void *)lkdtm_increment_int;
-	func(&called_count);
+	lkdtm_indirect_call((void *)lkdtm_increment_int);
 
 	pr_err("FAIL: survived mismatched prototype function call!\n");
 	pr_expected_config(CONFIG_CFI_CLANG);
-- 
cgit v1.2.3


From 44f665b69c67f0a17a0c8748030ed30205532149 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:50 -0700
Subject: psci: Fix the function type for psci_initcall_t

Functions called through a psci_initcall_t pointer all have
non-const arguments. Fix the type definition to avoid tripping
indirect call checks with CFI_CLANG.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-9-samitolvanen@google.com
---
 drivers/firmware/psci/psci.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index cfb448eabdaa..75ef784a3789 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -274,7 +274,7 @@ static void set_conduit(enum arm_smccc_conduit conduit)
 	psci_conduit = conduit;
 }
 
-static int get_set_conduit_method(struct device_node *np)
+static int get_set_conduit_method(const struct device_node *np)
 {
 	const char *method;
 
@@ -528,7 +528,7 @@ typedef int (*psci_initcall_t)(const struct device_node *);
  *
  * Probe based on PSCI PSCI_VERSION function
  */
-static int __init psci_0_2_init(struct device_node *np)
+static int __init psci_0_2_init(const struct device_node *np)
 {
 	int err;
 
@@ -549,7 +549,7 @@ static int __init psci_0_2_init(struct device_node *np)
 /*
  * PSCI < v0.2 get PSCI Function IDs via DT.
  */
-static int __init psci_0_1_init(struct device_node *np)
+static int __init psci_0_1_init(const struct device_node *np)
 {
 	u32 id;
 	int err;
@@ -585,7 +585,7 @@ static int __init psci_0_1_init(struct device_node *np)
 	return 0;
 }
 
-static int __init psci_1_0_init(struct device_node *np)
+static int __init psci_1_0_init(const struct device_node *np)
 {
 	int err;
 
-- 
cgit v1.2.3


From c50d32859e70f6dbccb7d151408eb10afbbb7965 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:51 -0700
Subject: arm64: Add types to indirect called assembly functions

With CONFIG_CFI_CLANG, assembly functions indirectly called from C
code must be annotated with type identifiers to pass CFI checking. Use
SYM_TYPED_FUNC_START for the indirectly called functions, and ensure
we emit `bti c` also with SYM_TYPED_FUNC_START.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-10-samitolvanen@google.com
---
 arch/arm64/crypto/ghash-ce-core.S | 5 +++--
 arch/arm64/crypto/sm3-ce-core.S   | 3 ++-
 arch/arm64/include/asm/linkage.h  | 4 ++++
 arch/arm64/kernel/cpu-reset.S     | 5 +++--
 arch/arm64/mm/proc.S              | 5 +++--
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index 7868330dd54e..ebe5558929b7 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -6,6 +6,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/assembler.h>
 
 	SHASH		.req	v0
@@ -350,11 +351,11 @@ CPU_LE(	rev64		T1.16b, T1.16b	)
 	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 	 *			   struct ghash_key const *k, const char *head)
 	 */
-SYM_FUNC_START(pmull_ghash_update_p64)
+SYM_TYPED_FUNC_START(pmull_ghash_update_p64)
 	__pmull_ghash	p64
 SYM_FUNC_END(pmull_ghash_update_p64)
 
-SYM_FUNC_START(pmull_ghash_update_p8)
+SYM_TYPED_FUNC_START(pmull_ghash_update_p8)
 	__pmull_ghash	p8
 SYM_FUNC_END(pmull_ghash_update_p8)
 
diff --git a/arch/arm64/crypto/sm3-ce-core.S b/arch/arm64/crypto/sm3-ce-core.S
index ef97d3187cb7..ca70cfacd0d0 100644
--- a/arch/arm64/crypto/sm3-ce-core.S
+++ b/arch/arm64/crypto/sm3-ce-core.S
@@ -6,6 +6,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/assembler.h>
 
 	.irp		b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
@@ -73,7 +74,7 @@
 	 *                       int blocks)
 	 */
 	.text
-SYM_FUNC_START(sm3_ce_transform)
+SYM_TYPED_FUNC_START(sm3_ce_transform)
 	/* load state */
 	ld1		{v8.4s-v9.4s}, [x0]
 	rev64		v8.4s, v8.4s
diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index 43f8c25b3fda..1436fa1cde24 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -39,4 +39,8 @@
 	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
 	bti c ;
 
+#define SYM_TYPED_FUNC_START(name)				\
+	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
+	bti c ;
+
 #endif
diff --git a/arch/arm64/kernel/cpu-reset.S b/arch/arm64/kernel/cpu-reset.S
index 48a8af97faa9..6b752fe89745 100644
--- a/arch/arm64/kernel/cpu-reset.S
+++ b/arch/arm64/kernel/cpu-reset.S
@@ -8,6 +8,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/assembler.h>
 #include <asm/sysreg.h>
 #include <asm/virt.h>
@@ -28,7 +29,7 @@
  * branch to what would be the reset vector. It must be executed with the
  * flat identity mapping.
  */
-SYM_CODE_START(cpu_soft_restart)
+SYM_TYPED_FUNC_START(cpu_soft_restart)
 	mov_q	x12, INIT_SCTLR_EL1_MMU_OFF
 	pre_disable_mmu_workaround
 	/*
@@ -47,6 +48,6 @@ SYM_CODE_START(cpu_soft_restart)
 	mov	x1, x3				// arg1
 	mov	x2, x4				// arg2
 	br	x8
-SYM_CODE_END(cpu_soft_restart)
+SYM_FUNC_END(cpu_soft_restart)
 
 .popsection
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 7837a69524c5..8b9f419fcad9 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -10,6 +10,7 @@
 #include <linux/init.h>
 #include <linux/linkage.h>
 #include <linux/pgtable.h>
+#include <linux/cfi_types.h>
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/asm_pointer_auth.h>
@@ -185,7 +186,7 @@ SYM_FUNC_END(cpu_do_resume)
  * This is the low-level counterpart to cpu_replace_ttbr1, and should not be
  * called by anything else. It can only be executed from a TTBR0 mapping.
  */
-SYM_FUNC_START(idmap_cpu_replace_ttbr1)
+SYM_TYPED_FUNC_START(idmap_cpu_replace_ttbr1)
 	save_and_disable_daif flags=x2
 
 	__idmap_cpu_set_reserved_ttbr1 x1, x3
@@ -253,7 +254,7 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
 SYM_DATA(__idmap_kpti_flag, .long 1)
 	.popsection
 
-SYM_FUNC_START(idmap_kpti_install_ng_mappings)
+SYM_TYPED_FUNC_START(idmap_kpti_install_ng_mappings)
 	cpu		.req	w0
 	temp_pte	.req	x0
 	num_cpus	.req	w1
-- 
cgit v1.2.3


From b26e484b8bb3a992ef30e851d771973a3dd2336b Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:52 -0700
Subject: arm64: Add CFI error handling

With -fsanitize=kcfi, CFI always traps. Add arm64 support for handling CFI
failures. The registers containing the target address and the expected type
are encoded in the first ten bits of the ESR as follows:

 - 0-4: n, where the register Xn contains the target address
 - 5-9: m, where the register Wm contains the type hash

This produces the following oops on CFI failure (generated using lkdtm):

[   21.885179] CFI failure at lkdtm_indirect_call+0x2c/0x44 [lkdtm]
(target: lkdtm_increment_int+0x0/0x1c [lkdtm]; expected type: 0x7e0c52a)
[   21.886593] Internal error: Oops - CFI: 0 [#1] PREEMPT SMP
[   21.891060] Modules linked in: lkdtm
[   21.893363] CPU: 0 PID: 151 Comm: sh Not tainted
5.19.0-rc1-00021-g852f4e48dbab #1
[   21.895560] Hardware name: linux,dummy-virt (DT)
[   21.896543] pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[   21.897583] pc : lkdtm_indirect_call+0x2c/0x44 [lkdtm]
[   21.898551] lr : lkdtm_CFI_FORWARD_PROTO+0x3c/0x6c [lkdtm]
[   21.899520] sp : ffff8000083a3c50
[   21.900191] x29: ffff8000083a3c50 x28: ffff0000027e0ec0 x27: 0000000000000000
[   21.902453] x26: 0000000000000000 x25: ffffc2aa3d07e7b0 x24: 0000000000000002
[   21.903736] x23: ffffc2aa3d079088 x22: ffffc2aa3d07e7b0 x21: ffff000003379000
[   21.905062] x20: ffff8000083a3dc0 x19: 0000000000000012 x18: 0000000000000000
[   21.906371] x17: 000000007e0c52a5 x16: 000000003ad55aca x15: ffffc2aa60d92138
[   21.907662] x14: ffffffffffffffff x13: 2e2e2e2065707974 x12: 0000000000000018
[   21.909775] x11: ffffc2aa62322b88 x10: ffffc2aa62322aa0 x9 : c7e305fb5195d200
[   21.911898] x8 : ffffc2aa3d077e20 x7 : 6d20676e696c6c61 x6 : 43203a6d74646b6c
[   21.913108] x5 : ffffc2aa6266c9df x4 : ffffc2aa6266c9e1 x3 : ffff8000083a3968
[   21.914358] x2 : 80000000fffff122 x1 : 00000000fffff122 x0 : ffffc2aa3d07e8f8
[   21.915827] Call trace:
[   21.916375]  lkdtm_indirect_call+0x2c/0x44 [lkdtm]
[   21.918060]  lkdtm_CFI_FORWARD_PROTO+0x3c/0x6c [lkdtm]
[   21.919030]  lkdtm_do_action+0x34/0x4c [lkdtm]
[   21.919920]  direct_entry+0x170/0x1ac [lkdtm]
[   21.920772]  full_proxy_write+0x84/0x104
[   21.921759]  vfs_write+0x188/0x3d8
[   21.922387]  ksys_write+0x78/0xe8
[   21.922986]  __arm64_sys_write+0x1c/0x2c
[   21.923696]  invoke_syscall+0x58/0x134
[   21.924554]  el0_svc_common+0xb4/0xf4
[   21.925603]  do_el0_svc+0x2c/0xb4
[   21.926563]  el0_svc+0x2c/0x7c
[   21.927147]  el0t_64_sync_handler+0x84/0xf0
[   21.927985]  el0t_64_sync+0x18c/0x190
[   21.929133] Code: 728a54b1 72afc191 6b11021f 54000040 (d4304500)
[   21.930690] ---[ end trace 0000000000000000 ]---
[   21.930971] Kernel panic - not syncing: Oops - CFI: Fatal exception

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-11-samitolvanen@google.com
---
 arch/arm64/include/asm/brk-imm.h |  6 +++++
 arch/arm64/kernel/traps.c        | 47 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h
index ec7720dbe2c8..6e000113e508 100644
--- a/arch/arm64/include/asm/brk-imm.h
+++ b/arch/arm64/include/asm/brk-imm.h
@@ -17,6 +17,7 @@
  * 0x401: for compile time BRK instruction
  * 0x800: kernel-mode BUG() and WARN() traps
  * 0x9xx: tag-based KASAN trap (allowed values 0x900 - 0x9ff)
+ * 0x8xxx: Control-Flow Integrity traps
  */
 #define KPROBES_BRK_IMM			0x004
 #define UPROBES_BRK_IMM			0x005
@@ -28,4 +29,9 @@
 #define KASAN_BRK_IMM			0x900
 #define KASAN_BRK_MASK			0x0ff
 
+#define CFI_BRK_IMM_TARGET		GENMASK(4, 0)
+#define CFI_BRK_IMM_TYPE		GENMASK(9, 5)
+#define CFI_BRK_IMM_BASE		0x8000
+#define CFI_BRK_IMM_MASK		(CFI_BRK_IMM_TARGET | CFI_BRK_IMM_TYPE)
+
 #endif
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index b7fed33981f7..3c026da95bbc 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -26,6 +26,7 @@
 #include <linux/syscalls.h>
 #include <linux/mm_types.h>
 #include <linux/kasan.h>
+#include <linux/cfi.h>
 
 #include <asm/atomic.h>
 #include <asm/bug.h>
@@ -991,6 +992,38 @@ static struct break_hook bug_break_hook = {
 	.imm = BUG_BRK_IMM,
 };
 
+#ifdef CONFIG_CFI_CLANG
+static int cfi_handler(struct pt_regs *regs, unsigned long esr)
+{
+	unsigned long target;
+	u32 type;
+
+	target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr));
+	type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr));
+
+	switch (report_cfi_failure(regs, regs->pc, &target, type)) {
+	case BUG_TRAP_TYPE_BUG:
+		die("Oops - CFI", regs, 0);
+		break;
+
+	case BUG_TRAP_TYPE_WARN:
+		break;
+
+	default:
+		return DBG_HOOK_ERROR;
+	}
+
+	arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
+	return DBG_HOOK_HANDLED;
+}
+
+static struct break_hook cfi_break_hook = {
+	.fn = cfi_handler,
+	.imm = CFI_BRK_IMM_BASE,
+	.mask = CFI_BRK_IMM_MASK,
+};
+#endif /* CONFIG_CFI_CLANG */
+
 static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr)
 {
 	pr_err("%s generated an invalid instruction at %pS!\n",
@@ -1052,6 +1085,9 @@ static struct break_hook kasan_break_hook = {
 };
 #endif
 
+
+#define esr_comment(esr) ((esr) & ESR_ELx_BRK64_ISS_COMMENT_MASK)
+
 /*
  * Initial handler for AArch64 BRK exceptions
  * This handler only used until debug_traps_init().
@@ -1059,10 +1095,12 @@ static struct break_hook kasan_break_hook = {
 int __init early_brk64(unsigned long addr, unsigned long esr,
 		struct pt_regs *regs)
 {
+#ifdef CONFIG_CFI_CLANG
+	if ((esr_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE)
+		return cfi_handler(regs, esr) != DBG_HOOK_HANDLED;
+#endif
 #ifdef CONFIG_KASAN_SW_TAGS
-	unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;
-
-	if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
+	if ((esr_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM)
 		return kasan_handler(regs, esr) != DBG_HOOK_HANDLED;
 #endif
 	return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
@@ -1071,6 +1109,9 @@ int __init early_brk64(unsigned long addr, unsigned long esr,
 void __init trap_init(void)
 {
 	register_kernel_break_hook(&bug_break_hook);
+#ifdef CONFIG_CFI_CLANG
+	register_kernel_break_hook(&cfi_break_hook);
+#endif
 	register_kernel_break_hook(&fault_break_hook);
 #ifdef CONFIG_KASAN_SW_TAGS
 	register_kernel_break_hook(&kasan_break_hook);
-- 
cgit v1.2.3


From 5f20997c194e8b74254cbdb113b2b09bc1c0c734 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:53 -0700
Subject: arm64: Drop unneeded __nocfi attributes

With -fsanitize=kcfi, CONFIG_CFI_CLANG no longer has issues
with address space confusion in functions that switch to linear
mapping. Now that the indirectly called assembly functions have
type annotations, drop the __nocfi attributes.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-12-samitolvanen@google.com
---
 arch/arm64/include/asm/mmu_context.h | 2 +-
 arch/arm64/kernel/alternative.c      | 2 +-
 arch/arm64/kernel/cpufeature.c       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index c7ccd82db1d2..bba0e630c8bc 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -147,7 +147,7 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz)
  * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
  * avoiding the possibility of conflicting TLB entries being allocated.
  */
-static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
+static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
 {
 	typedef void (ttbr_replace_func)(phys_addr_t);
 	extern ttbr_replace_func idmap_cpu_replace_ttbr1;
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 9bcaa5eacf16..d2c66507398d 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -133,7 +133,7 @@ static void clean_dcache_range_nopatch(u64 start, u64 end)
 	} while (cur += d_size, cur < end);
 }
 
-static void __nocfi __apply_alternatives(struct alt_region *region, bool is_module,
+static void __apply_alternatives(struct alt_region *region, bool is_module,
 				 unsigned long *feature_mask)
 {
 	struct alt_instr *alt;
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index af4de817d712..ca6e5ca7104e 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1685,7 +1685,7 @@ static phys_addr_t kpti_ng_pgd_alloc(int shift)
 	return kpti_ng_temp_alloc;
 }
 
-static void __nocfi
+static void
 kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 {
 	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
-- 
cgit v1.2.3


From 5dbbb3eaa2a784342f3206b77381b516181d089c Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:54 -0700
Subject: init: Drop __nocfi from __init

It's no longer necessary to disable CFI checking for all __init
functions. Drop the __nocfi attribute from __init.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-13-samitolvanen@google.com
---
 include/linux/init.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/init.h b/include/linux/init.h
index baf0b29a7010..88f2964097f5 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -47,7 +47,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(".init.text") __cold  __latent_entropy __noinitretpoline __nocfi
+#define __init		__section(".init.text") __cold  __latent_entropy __noinitretpoline
 #define __initdata	__section(".init.data")
 #define __initconst	__section(".init.rodata")
 #define __exitdata	__section(".exit.data")
-- 
cgit v1.2.3


From 607289a7cd7a3ca42b8a6877fcb6072e6eb20c34 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:55 -0700
Subject: treewide: Drop function_nocfi

With -fsanitize=kcfi, we no longer need function_nocfi() as
the compiler won't change function references to point to a
jump table. Remove all implementations and uses of the macro.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-14-samitolvanen@google.com
---
 arch/arm64/include/asm/ftrace.h           |  2 +-
 arch/arm64/include/asm/mmu_context.h      |  2 +-
 arch/arm64/kernel/acpi_parking_protocol.c |  2 +-
 arch/arm64/kernel/cpufeature.c            |  2 +-
 arch/arm64/kernel/ftrace.c                |  2 +-
 arch/arm64/kernel/machine_kexec.c         |  2 +-
 arch/arm64/kernel/psci.c                  |  2 +-
 arch/arm64/kernel/smp_spin_table.c        |  2 +-
 drivers/firmware/psci/psci.c              |  4 ++--
 drivers/misc/lkdtm/usercopy.c             |  2 +-
 include/linux/compiler.h                  | 10 ----------
 11 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index dbc45a4157fa..329dbbd4d50b 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -26,7 +26,7 @@
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #else
-#define MCOUNT_ADDR		((unsigned long)function_nocfi(_mcount))
+#define MCOUNT_ADDR		((unsigned long)_mcount)
 #endif
 
 /* The BL at the callsite's adjusted rec->ip */
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index bba0e630c8bc..d3f8b5df0c1f 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -168,7 +168,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
 		ttbr1 |= TTBR_CNP_BIT;
 	}
 
-	replace_phys = (void *)__pa_symbol(function_nocfi(idmap_cpu_replace_ttbr1));
+	replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
 
 	__cpu_install_idmap(idmap);
 	replace_phys(ttbr1);
diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c
index bfeeb5319abf..b1990e38aed0 100644
--- a/arch/arm64/kernel/acpi_parking_protocol.c
+++ b/arch/arm64/kernel/acpi_parking_protocol.c
@@ -99,7 +99,7 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu)
 	 * that read this address need to convert this address to the
 	 * Boot-Loader's endianness before jumping.
 	 */
-	writeq_relaxed(__pa_symbol(function_nocfi(secondary_entry)),
+	writeq_relaxed(__pa_symbol(secondary_entry),
 		       &mailbox->entry_point);
 	writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id);
 
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index ca6e5ca7104e..d8361691efeb 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1713,7 +1713,7 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
 	if (arm64_use_ng_mappings)
 		return;
 
-	remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
+	remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);
 
 	if (!cpu) {
 		alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index ea5dc7c90f46..26789865748c 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -56,7 +56,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	unsigned long pc;
 	u32 new;
 
-	pc = (unsigned long)function_nocfi(ftrace_call);
+	pc = (unsigned long)ftrace_call;
 	new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func,
 					  AARCH64_INSN_BRANCH_LINK);
 
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 19c2d487cb08..ce3d40120f72 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -204,7 +204,7 @@ void machine_kexec(struct kimage *kimage)
 		typeof(cpu_soft_restart) *restart;
 
 		cpu_install_idmap();
-		restart = (void *)__pa_symbol(function_nocfi(cpu_soft_restart));
+		restart = (void *)__pa_symbol(cpu_soft_restart);
 		restart(is_hyp_nvhe(), kimage->start, kimage->arch.dtb_mem,
 			0, 0);
 	} else {
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index ab7f4c476104..29a8e444db83 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -38,7 +38,7 @@ static int __init cpu_psci_cpu_prepare(unsigned int cpu)
 
 static int cpu_psci_cpu_boot(unsigned int cpu)
 {
-	phys_addr_t pa_secondary_entry = __pa_symbol(function_nocfi(secondary_entry));
+	phys_addr_t pa_secondary_entry = __pa_symbol(secondary_entry);
 	int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry);
 	if (err)
 		pr_err("failed to boot CPU%d (%d)\n", cpu, err);
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 7e1624ecab3c..49029eace3ad 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -66,7 +66,7 @@ static int smp_spin_table_cpu_init(unsigned int cpu)
 static int smp_spin_table_cpu_prepare(unsigned int cpu)
 {
 	__le64 __iomem *release_addr;
-	phys_addr_t pa_holding_pen = __pa_symbol(function_nocfi(secondary_holding_pen));
+	phys_addr_t pa_holding_pen = __pa_symbol(secondary_holding_pen);
 
 	if (!cpu_release_addr[cpu])
 		return -ENODEV;
diff --git a/drivers/firmware/psci/psci.c b/drivers/firmware/psci/psci.c
index 75ef784a3789..bc6b5a12bf74 100644
--- a/drivers/firmware/psci/psci.c
+++ b/drivers/firmware/psci/psci.c
@@ -334,7 +334,7 @@ static int __init psci_features(u32 psci_func_id)
 static int psci_suspend_finisher(unsigned long state)
 {
 	u32 power_state = state;
-	phys_addr_t pa_cpu_resume = __pa_symbol(function_nocfi(cpu_resume));
+	phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume);
 
 	return psci_ops.cpu_suspend(power_state, pa_cpu_resume);
 }
@@ -359,7 +359,7 @@ int psci_cpu_suspend_enter(u32 state)
 
 static int psci_system_suspend(unsigned long unused)
 {
-	phys_addr_t pa_cpu_resume = __pa_symbol(function_nocfi(cpu_resume));
+	phys_addr_t pa_cpu_resume = __pa_symbol(cpu_resume);
 
 	return invoke_psci_fn(PSCI_FN_NATIVE(1_0, SYSTEM_SUSPEND),
 			      pa_cpu_resume, 0, 0);
diff --git a/drivers/misc/lkdtm/usercopy.c b/drivers/misc/lkdtm/usercopy.c
index 6215ec995cd3..67db57249a34 100644
--- a/drivers/misc/lkdtm/usercopy.c
+++ b/drivers/misc/lkdtm/usercopy.c
@@ -330,7 +330,7 @@ static void lkdtm_USERCOPY_KERNEL(void)
 
 	pr_info("attempting bad copy_to_user from kernel text: %px\n",
 		vm_mmap);
-	if (copy_to_user((void __user *)user_addr, function_nocfi(vm_mmap),
+	if (copy_to_user((void __user *)user_addr, vm_mmap,
 			 unconst + PAGE_SIZE)) {
 		pr_warn("copy_to_user failed, but lacked Oops\n");
 		goto free_user;
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 7bfafc69172a..973a1bfd7ef5 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -203,16 +203,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	__v;								\
 })
 
-/*
- * With CONFIG_CFI_CLANG, the compiler replaces function addresses in
- * instrumented C code with jump table addresses. Architectures that
- * support CFI can define this macro to return the actual function address
- * when needed.
- */
-#ifndef function_nocfi
-#define function_nocfi(x) (x)
-#endif
-
 #endif /* __KERNEL__ */
 
 /*
-- 
cgit v1.2.3


From 4b24356312fbe1bace72f9905d529b14fc34c1c3 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:56 -0700
Subject: treewide: Drop WARN_ON_FUNCTION_MISMATCH

CONFIG_CFI_CLANG no longer breaks cross-module function address
equality, which makes WARN_ON_FUNCTION_MISMATCH unnecessary. Remove
the definition and switch back to WARN_ON_ONCE.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-15-samitolvanen@google.com
---
 include/asm-generic/bug.h | 16 ----------------
 kernel/kthread.c          |  3 +--
 kernel/workqueue.c        |  2 +-
 3 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index ba1f860af38b..4050b191e1a9 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -220,22 +220,6 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 # define WARN_ON_SMP(x)			({0;})
 #endif
 
-/*
- * WARN_ON_FUNCTION_MISMATCH() warns if a value doesn't match a
- * function address, and can be useful for catching issues with
- * callback functions, for example.
- *
- * With CONFIG_CFI_CLANG, the warning is disabled because the
- * compiler replaces function addresses taken in C code with
- * local jump table addresses, which breaks cross-module function
- * address equality.
- */
-#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_MODULES)
-# define WARN_ON_FUNCTION_MISMATCH(x, fn) ({ 0; })
-#else
-# define WARN_ON_FUNCTION_MISMATCH(x, fn) WARN_ON_ONCE((x) != (fn))
-#endif
-
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3c677918d8f2..28a6b7ab4a0f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1050,8 +1050,7 @@ static void __kthread_queue_delayed_work(struct kthread_worker *worker,
 	struct timer_list *timer = &dwork->timer;
 	struct kthread_work *work = &dwork->work;
 
-	WARN_ON_FUNCTION_MISMATCH(timer->function,
-				  kthread_delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
 
 	/*
 	 * If @delay is 0, queue @dwork->work immediately.  This is for
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aeea9731ef80..16df315d2a3d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1651,7 +1651,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	WARN_ON_ONCE(!wq);
-	WARN_ON_FUNCTION_MISMATCH(timer->function, delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
 	WARN_ON_ONCE(timer_pending(timer));
 	WARN_ON_ONCE(!list_empty(&work->entry));
 
-- 
cgit v1.2.3


From 5659b598b4dcb352b1a567c55fc5a658bc80076c Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:57 -0700
Subject: treewide: Drop __cficanonical

CONFIG_CFI_CLANG doesn't use a jump table anymore and therefore,
won't change function references to point elsewhere. Remove the
__cficanonical attribute and all uses of it.

Note that the Clang definition of the attribute was removed earlier,
just clean up the no-op definition and users.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-16-samitolvanen@google.com
---
 include/linux/compiler_types.h | 4 ----
 include/linux/init.h           | 4 ++--
 include/linux/pci.h            | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 4f2a819fd60a..6f2ec0976e2d 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -265,10 +265,6 @@ struct ftrace_likely_data {
 # define __nocfi
 #endif
 
-#ifndef __cficanonical
-# define __cficanonical
-#endif
-
 /*
  * Any place that could be marked with the "alloc_size" attribute is also
  * a place to be marked with the "malloc" attribute. Do this as part of the
diff --git a/include/linux/init.h b/include/linux/init.h
index 88f2964097f5..a0a90cd73ebe 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -220,8 +220,8 @@ extern bool initcall_debug;
 	__initcall_name(initstub, __iid, id)
 
 #define __define_initcall_stub(__stub, fn)			\
-	int __init __cficanonical __stub(void);			\
-	int __init __cficanonical __stub(void)			\
+	int __init __stub(void);				\
+	int __init __stub(void)					\
 	{ 							\
 		return fn();					\
 	}							\
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 060af91bafcd..5da0846aa3c1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2019,8 +2019,8 @@ enum pci_fixup_pass {
 #ifdef CONFIG_LTO_CLANG
 #define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class,	\
 				  class_shift, hook, stub)		\
-	void __cficanonical stub(struct pci_dev *dev);			\
-	void __cficanonical stub(struct pci_dev *dev)			\
+	void stub(struct pci_dev *dev);					\
+	void stub(struct pci_dev *dev)					\
 	{ 								\
 		hook(dev); 						\
 	}								\
-- 
cgit v1.2.3


From 5141d3a06b2da1731ac82091298b766a1f95d3d8 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:58 -0700
Subject: objtool: Preserve special st_shndx indexes in elf_update_symbol

elf_update_symbol fails to preserve the special st_shndx values
between [SHN_LORESERVE, SHN_HIRESERVE], which results in it
converting SHN_ABS entries into SHN_UNDEF, for example. Explicitly
check for the special indexes and ensure these symbols are not
marked undefined.

Fixes: ead165fa1042 ("objtool: Fix symbol creation")
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-17-samitolvanen@google.com
---
 tools/objtool/elf.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index c25e957c1e52..7e24b09b1163 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -619,6 +619,11 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab,
 	Elf64_Xword entsize = symtab->sh.sh_entsize;
 	int max_idx, idx = sym->idx;
 	Elf_Scn *s, *t = NULL;
+	bool is_special_shndx = sym->sym.st_shndx >= SHN_LORESERVE &&
+				sym->sym.st_shndx != SHN_XINDEX;
+
+	if (is_special_shndx)
+		shndx = sym->sym.st_shndx;
 
 	s = elf_getscn(elf->elf, symtab->idx);
 	if (!s) {
@@ -704,7 +709,7 @@ static int elf_update_symbol(struct elf *elf, struct section *symtab,
 	}
 
 	/* setup extended section index magic and write the symbol */
-	if (shndx >= SHN_UNDEF && shndx < SHN_LORESERVE) {
+	if ((shndx >= SHN_UNDEF && shndx < SHN_LORESERVE) || is_special_shndx) {
 		sym->sym.st_shndx = shndx;
 		if (!shndx_data)
 			shndx = 0;
-- 
cgit v1.2.3


From 3c68a92d17add767109441f4040391b9e8a14a98 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:54:59 -0700
Subject: objtool: Disable CFI warnings

The __cfi_ preambles contain a mov instruction that embeds the KCFI
type identifier in the following format:

  ; type preamble
  __cfi_function:
    mov <id>, %eax
  function:
    ...

While the preamble symbols are STT_FUNC and contain valid
instructions, they are never executed and always fall through. Skip
the warning for them.

.kcfi_traps sections point to CFI traps in text sections. Also skip
the warning about them referencing !ENDBR instructions.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-18-samitolvanen@google.com
---
 tools/objtool/check.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index e55fdf952a3a..48e18737a2d1 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -3316,6 +3316,10 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 		next_insn = next_insn_to_validate(file, insn);
 
 		if (func && insn->func && func != insn->func->pfunc) {
+			/* Ignore KCFI type preambles, which always fall through */
+			if (!strncmp(func->name, "__cfi_", 6))
+				return 0;
+
 			WARN("%s() falls through to next function %s()",
 			     func->name, insn->func->name);
 			return 1;
@@ -4113,7 +4117,8 @@ static int validate_ibt(struct objtool_file *file)
 		    !strcmp(sec->name, "__bug_table")			||
 		    !strcmp(sec->name, "__ex_table")			||
 		    !strcmp(sec->name, "__jump_table")			||
-		    !strcmp(sec->name, "__mcount_loc"))
+		    !strcmp(sec->name, "__mcount_loc")			||
+		    !strcmp(sec->name, ".kcfi_traps"))
 			continue;
 
 		list_for_each_entry(reloc, &sec->reloc->reloc_list, list)
-- 
cgit v1.2.3


From dfb352ab1162f73b8c6dc98150fa32cf5aa2f623 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:55:00 -0700
Subject: kallsyms: Drop CONFIG_CFI_CLANG workarounds

With -fsanitize=kcfi, the compiler no longer renames static
functions with CONFIG_CFI_CLANG + ThinLTO. Drop the code that cleans
up the ThinLTO hash from the function names.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-19-samitolvanen@google.com
---
 kernel/kallsyms.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3e7e2c2ad2f7..b27e6ea31f8b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -159,7 +159,6 @@ static bool cleanup_symbol_name(char *s)
 	 * character in an identifier in C. Suffixes observed:
 	 * - foo.llvm.[0-9a-f]+
 	 * - foo.[0-9a-f]+
-	 * - foo.[0-9a-f]+.cfi_jt
 	 */
 	res = strchr(s, '.');
 	if (res) {
@@ -167,22 +166,6 @@ static bool cleanup_symbol_name(char *s)
 		return true;
 	}
 
-	if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
-	    !IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
-	    CONFIG_CLANG_VERSION >= 130000)
-		return false;
-
-	/*
-	 * Prior to LLVM 13, the following suffixes were observed when thinLTO
-	 * and CFI are both enabled:
-	 * - foo$[0-9]+
-	 */
-	res = strrchr(s, '$');
-	if (res) {
-		*res = '\0';
-		return true;
-	}
-
 	return false;
 }
 
-- 
cgit v1.2.3


From ca7e10bff196f69a450b9072a7b757713d3bb2dd Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:55:01 -0700
Subject: x86/tools/relocs: Ignore __kcfi_typeid_ relocations

The compiler generates __kcfi_typeid_ symbols for annotating assembly
functions with type information. These are constants that can be
referenced in assembly code and are resolved by the linker. Ignore
them in relocs.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-20-samitolvanen@google.com
---
 arch/x86/tools/relocs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index e2c5b296120d..2925074b9a58 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -56,6 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
 	"^(xen_irq_disable_direct_reloc$|"
 	"xen_save_fl_direct_reloc$|"
 	"VDSO|"
+	"__kcfi_typeid_|"
 	"__crc_)",
 
 /*
-- 
cgit v1.2.3


From ccace936eec7b805e1ab9268a6d163a00047b3a9 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:55:02 -0700
Subject: x86: Add types to indirectly called assembly functions

With CONFIG_CFI_CLANG, assembly functions indirectly called
from C code must be annotated with type identifiers to pass CFI
checking. Define the __CFI_TYPE helper macro to match the compiler
generated function preamble, and ensure SYM_TYPED_FUNC_START also
emits ENDBR with IBT.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-21-samitolvanen@google.com
---
 arch/x86/crypto/blowfish-x86_64-asm_64.S |  5 +++--
 arch/x86/include/asm/linkage.h           | 12 ++++++++++++
 arch/x86/lib/memcpy_64.S                 |  3 ++-
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 802d71582689..4a43e072d2d1 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -6,6 +6,7 @@
  */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 
 .file "blowfish-x86_64-asm.S"
 .text
@@ -141,7 +142,7 @@ SYM_FUNC_START(__blowfish_enc_blk)
 	RET;
 SYM_FUNC_END(__blowfish_enc_blk)
 
-SYM_FUNC_START(blowfish_dec_blk)
+SYM_TYPED_FUNC_START(blowfish_dec_blk)
 	/* input:
 	 *	%rdi: ctx
 	 *	%rsi: dst
@@ -332,7 +333,7 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
 	RET;
 SYM_FUNC_END(__blowfish_enc_blk_4way)
 
-SYM_FUNC_START(blowfish_dec_blk_4way)
+SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
 	/* input:
 	 *	%rdi: ctx
 	 *	%rsi: dst
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 73ca20049835..f484d656d34e 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -43,6 +43,18 @@
 
 #endif /* __ASSEMBLY__ */
 
+#define __CFI_TYPE(name)					\
+	SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE)	\
+	.fill 11, 1, 0x90 ASM_NL				\
+	.byte 0xb8 ASM_NL					\
+	.long __kcfi_typeid_##name ASM_NL			\
+	SYM_FUNC_END(__cfi_##name)
+
+/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
+#define SYM_TYPED_FUNC_START(name)				\
+	SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
+	ENDBR
+
 /* SYM_FUNC_START -- use for global functions */
 #define SYM_FUNC_START(name)				\
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index d0d7b9bc6cad..dd8cd8831251 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -2,6 +2,7 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
+#include <linux/cfi_types.h>
 #include <asm/errno.h>
 #include <asm/cpufeatures.h>
 #include <asm/alternative.h>
@@ -27,7 +28,7 @@
  * Output:
  * rax original destination
  */
-SYM_FUNC_START(__memcpy)
+SYM_TYPED_FUNC_START(__memcpy)
 	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
 		      "jmp memcpy_erms", X86_FEATURE_ERMS
 
-- 
cgit v1.2.3


From a4b7a12c5594fe5e6ab2a5aa514a9ae3c0b85573 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:55:03 -0700
Subject: x86/purgatory: Disable CFI

Disable CONFIG_CFI_CLANG for the stand-alone purgatory.ro.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-22-samitolvanen@google.com
---
 arch/x86/purgatory/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 31c634a22818..58a200dc762d 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -55,6 +55,10 @@ ifdef CONFIG_RETPOLINE
 PURGATORY_CFLAGS_REMOVE		+= $(RETPOLINE_CFLAGS)
 endif
 
+ifdef CONFIG_CFI_CLANG
+PURGATORY_CFLAGS_REMOVE		+= $(CC_FLAGS_CFI)
+endif
+
 CFLAGS_REMOVE_purgatory.o	+= $(PURGATORY_CFLAGS_REMOVE)
 CFLAGS_purgatory.o		+= $(PURGATORY_CFLAGS)
 
-- 
cgit v1.2.3


From 3c516f89e17e56b4738f05588e51267e295b5e63 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Thu, 8 Sep 2022 14:55:04 -0700
Subject: x86: Add support for CONFIG_CFI_CLANG

With CONFIG_CFI_CLANG, the compiler injects a type preamble immediately
before each function and a check to validate the target function type
before indirect calls:

  ; type preamble
  __cfi_function:
    mov <id>, %eax
  function:
    ...
  ; indirect call check
    mov     -<id>,%r10d
    add     -0x4(%r11),%r10d
    je      .Ltmp1
    ud2
  .Ltmp1:
    call    __x86_indirect_thunk_r11

Add error handling code for the ud2 traps emitted for the checks, and
allow CONFIG_CFI_CLANG to be selected on x86_64.

This produces the following oops on CFI failure (generated using lkdtm):

[   21.441706] CFI failure at lkdtm_indirect_call+0x16/0x20 [lkdtm]
(target: lkdtm_increment_int+0x0/0x10 [lkdtm]; expected type: 0x7e0c52a)
[   21.444579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[   21.445296] CPU: 0 PID: 132 Comm: sh Not tainted
5.19.0-rc8-00020-g9f27360e674c #1
[   21.445296] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[   21.445296] RIP: 0010:lkdtm_indirect_call+0x16/0x20 [lkdtm]
[   21.445296] Code: 52 1c c0 48 c7 c1 c5 50 1c c0 e9 25 48 2a cc 0f 1f
44 00 00 49 89 fb 48 c7 c7 50 b4 1c c0 41 ba 5b ad f3 81 45 03 53 f8
[   21.445296] RSP: 0018:ffffa9f9c02ffdc0 EFLAGS: 00000292
[   21.445296] RAX: 0000000000000027 RBX: ffffffffc01cb300 RCX: 385cbbd2e070a700
[   21.445296] RDX: 0000000000000000 RSI: c0000000ffffdfff RDI: ffffffffc01cb450
[   21.445296] RBP: 0000000000000006 R08: 0000000000000000 R09: ffffffff8d081610
[   21.445296] R10: 00000000bcc90825 R11: ffffffffc01c2fc0 R12: 0000000000000000
[   21.445296] R13: ffffa31b827a6000 R14: 0000000000000000 R15: 0000000000000002
[   21.445296] FS:  00007f08b42216a0(0000) GS:ffffa31b9f400000(0000)
knlGS:0000000000000000
[   21.445296] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   21.445296] CR2: 0000000000c76678 CR3: 0000000001940000 CR4: 00000000000006f0
[   21.445296] Call Trace:
[   21.445296]  <TASK>
[   21.445296]  lkdtm_CFI_FORWARD_PROTO+0x30/0x50 [lkdtm]
[   21.445296]  direct_entry+0x12d/0x140 [lkdtm]
[   21.445296]  full_proxy_write+0x5d/0xb0
[   21.445296]  vfs_write+0x144/0x460
[   21.445296]  ? __x64_sys_wait4+0x5a/0xc0
[   21.445296]  ksys_write+0x69/0xd0
[   21.445296]  do_syscall_64+0x51/0xa0
[   21.445296]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   21.445296] RIP: 0033:0x7f08b41a6fe1
[   21.445296] Code: be 07 00 00 00 41 89 c0 e8 7e ff ff ff 44 89 c7 89
04 24 e8 91 c6 02 00 8b 04 24 48 83 c4 68 c3 48 63 ff b8 01 00 00 03
[   21.445296] RSP: 002b:00007ffcdf65c2e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   21.445296] RAX: ffffffffffffffda RBX: 00007f08b4221690 RCX: 00007f08b41a6fe1
[   21.445296] RDX: 0000000000000012 RSI: 0000000000c738f0 RDI: 0000000000000001
[   21.445296] RBP: 0000000000000001 R08: fefefefefefefeff R09: fefefefeffc5ff4e
[   21.445296] R10: 00007f08b42222b0 R11: 0000000000000246 R12: 0000000000c738f0
[   21.445296] R13: 0000000000000012 R14: 00007ffcdf65c401 R15: 0000000000c70450
[   21.445296]  </TASK>
[   21.445296] Modules linked in: lkdtm
[   21.445296] Dumping ftrace buffer:
[   21.445296]    (ftrace buffer empty)
[   21.471442] ---[ end trace 0000000000000000 ]---
[   21.471811] RIP: 0010:lkdtm_indirect_call+0x16/0x20 [lkdtm]
[   21.472467] Code: 52 1c c0 48 c7 c1 c5 50 1c c0 e9 25 48 2a cc 0f 1f
44 00 00 49 89 fb 48 c7 c7 50 b4 1c c0 41 ba 5b ad f3 81 45 03 53 f8
[   21.474400] RSP: 0018:ffffa9f9c02ffdc0 EFLAGS: 00000292
[   21.474735] RAX: 0000000000000027 RBX: ffffffffc01cb300 RCX: 385cbbd2e070a700
[   21.475664] RDX: 0000000000000000 RSI: c0000000ffffdfff RDI: ffffffffc01cb450
[   21.476471] RBP: 0000000000000006 R08: 0000000000000000 R09: ffffffff8d081610
[   21.477127] R10: 00000000bcc90825 R11: ffffffffc01c2fc0 R12: 0000000000000000
[   21.477959] R13: ffffa31b827a6000 R14: 0000000000000000 R15: 0000000000000002
[   21.478657] FS:  00007f08b42216a0(0000) GS:ffffa31b9f400000(0000)
knlGS:0000000000000000
[   21.479577] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   21.480307] CR2: 0000000000c76678 CR3: 0000000001940000 CR4: 00000000000006f0
[   21.481460] Kernel panic - not syncing: Fatal exception

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220908215504.3686827-23-samitolvanen@google.com
---
 arch/x86/Kconfig           |  2 ++
 arch/x86/include/asm/cfi.h | 22 ++++++++++++
 arch/x86/kernel/Makefile   |  2 ++
 arch/x86/kernel/cfi.c      | 86 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c    |  4 ++-
 5 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/cfi.h
 create mode 100644 arch/x86/kernel/cfi.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..1fe6a83dac05 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,8 @@ config X86
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK	if X86_64
 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
 	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
+	select ARCH_SUPPORTS_CFI_CLANG		if X86_64
+	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG_THIN
 	select ARCH_USE_BUILTIN_BSWAP
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
new file mode 100644
index 000000000000..58dacd90daef
--- /dev/null
+++ b/arch/x86/include/asm/cfi.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CFI_H
+#define _ASM_X86_CFI_H
+
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+
+#include <linux/cfi.h>
+
+#ifdef CONFIG_CFI_CLANG
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#else
+static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+	return BUG_TRAP_TYPE_NONE;
+}
+#endif /* CONFIG_CFI_CLANG */
+
+#endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a20a5ebfacd7..1286a73ebdbc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -139,6 +139,8 @@ obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)		+= sev.o
 
+obj-$(CONFIG_CFI_CLANG)			+= cfi.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
new file mode 100644
index 000000000000..8674a5c0c031
--- /dev/null
+++ b/arch/x86/kernel/cfi.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <asm/cfi.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/string.h>
+
+/*
+ * Returns the target address and the expected type when regs->ip points
+ * to a compiler-generated CFI trap.
+ */
+static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
+			    u32 *type)
+{
+	char buffer[MAX_INSN_SIZE];
+	struct insn insn;
+	int offset = 0;
+
+	*target = *type = 0;
+
+	/*
+	 * The compiler generates the following instruction sequence
+	 * for indirect call checks:
+	 *
+	 *   movl    -<id>, %r10d       ; 6 bytes
+	 *   addl    -4(%reg), %r10d    ; 4 bytes
+	 *   je      .Ltmp1             ; 2 bytes
+	 *   ud2                        ; <- regs->ip
+	 *   .Ltmp1:
+	 *
+	 * We can decode the expected type and the target address from the
+	 * movl/addl instructions.
+	 */
+	if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE))
+		return false;
+	if (insn_decode_kernel(&insn, &buffer[offset]))
+		return false;
+	if (insn.opcode.value != 0xBA)
+		return false;
+
+	*type = -(u32)insn.immediate.value;
+
+	if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE))
+		return false;
+	if (insn_decode_kernel(&insn, &buffer[offset]))
+		return false;
+	if (insn.opcode.value != 0x3)
+		return false;
+
+	/* Read the target address from the register. */
+	offset = insn_get_modrm_rm_off(&insn, regs);
+	if (offset < 0)
+		return false;
+
+	*target = *(unsigned long *)((void *)regs + offset);
+
+	return true;
+}
+
+/*
+ * Checks if a ud2 trap is because of a CFI failure, and handles the trap
+ * if needed. Returns a bug_trap_type value similarly to report_bug.
+ */
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+	unsigned long target;
+	u32 type;
+
+	if (!is_cfi_trap(regs->ip))
+		return BUG_TRAP_TYPE_NONE;
+
+	if (!decode_cfi_insn(regs, &target, &type))
+		return report_cfi_failure_noaddr(regs, regs->ip);
+
+	return report_cfi_failure(regs, regs->ip, &target, type);
+}
+
+/*
+ * Ensure that __kcfi_typeid_ symbols are emitted for functions that may
+ * not be indirectly called with all configurations.
+ */
+__ADDRESSABLE(__memcpy)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d62b2cb85cea..178015a820f0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -63,6 +63,7 @@
 #include <asm/insn-eval.h>
 #include <asm/vdso.h>
 #include <asm/tdx.h>
+#include <asm/cfi.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -313,7 +314,8 @@ static noinstr bool handle_bug(struct pt_regs *regs)
 	 */
 	if (regs->flags & X86_EFLAGS_IF)
 		raw_local_irq_enable();
-	if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+	if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+	    handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
 		regs->ip += LEN_UD2;
 		handled = true;
 	}
-- 
cgit v1.2.3


From 7cae596bc31f900bb72492ff40c7f5addf72fa19 Mon Sep 17 00:00:00 2001
From: Dylan Yudaken <dylany@fb.com>
Date: Mon, 26 Sep 2022 10:09:25 -0700
Subject: io_uring: register single issuer task at creation

Instead of picking the task from the first submitter task, rather use the
creator task or in the case of disabled (IORING_SETUP_R_DISABLED) the
enabling task.

This approach allows a lot of simplification of the logic here. This
removes init logic from the submission path, which can always be a bit
confusing, but also removes the need for locking to write (or read) the
submitter_task.

Users that want to move a ring before submitting can create the ring
disabled and then enable it on the submitting task.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Fixes: 97bbdc06a444 ("io_uring: add IORING_SETUP_SINGLE_ISSUER")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/io_uring.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 2965b354efc8..242d896c00f3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3357,6 +3357,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 		goto err;
 	}
 
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER
+	    && !(ctx->flags & IORING_SETUP_R_DISABLED))
+		ctx->submitter_task = get_task_struct(current);
+
 	file = io_uring_get_file(ctx);
 	if (IS_ERR(file)) {
 		ret = PTR_ERR(file);
@@ -3548,6 +3552,9 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx)
 	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EBADFD;
 
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task)
+		ctx->submitter_task = get_task_struct(current);
+
 	if (ctx->restrictions.registered)
 		ctx->restricted = 1;
 
-- 
cgit v1.2.3


From 3e1730842f142add55dc658929221521a9ea62b6 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 19 Sep 2022 19:45:14 -0700
Subject: x86/entry: Work around Clang __bdos() bug

Clang produces a false positive when building with CONFIG_FORTIFY_SOURCE=y
and CONFIG_UBSAN_BOUNDS=y when operating on an array with a dynamic
offset. Work around this by using a direct assignment of an empty
instance. Avoids this warning:

../include/linux/fortify-string.h:309:4: warning: call to __write_overflow_field declared with 'warn
ing' attribute: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wat
tribute-warning]
                        __write_overflow_field(p_size_field, size);
                        ^

which was isolated to the memset() call in xen_load_idt().

Note that this looks very much like another bug that was worked around:
https://github.com/ClangBuiltLinux/linux/issues/1592

Cc: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: xen-devel@lists.xenproject.org
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Link: https://lore.kernel.org/lkml/41527d69-e8ab-3f86-ff37-6b298c01d5bc@oracle.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/x86/xen/enlighten_pv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 0ed2e487a693..9b1a58dda935 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -765,6 +765,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
 {
 	static DEFINE_SPINLOCK(lock);
 	static struct trap_info traps[257];
+	static const struct trap_info zero = { };
 	unsigned out;
 
 	trace_xen_cpu_load_idt(desc);
@@ -774,7 +775,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
 	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
 
 	out = xen_convert_trap_info(desc, traps, false);
-	memset(&traps[out], 0, sizeof(traps[0]));
+	traps[out] = zero;
 
 	xen_mc_flush();
 	if (HYPERVISOR_set_trap_table(traps))
-- 
cgit v1.2.3


From fa35198f39571bbdae53c5b321020021eaad6bd2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 19 Sep 2022 16:33:33 -0700
Subject: fortify: Explicitly check bounds are compile-time constants

In preparation for replacing __builtin_object_size() with
__builtin_dynamic_object_size(), all the compile-time size checks
need to check that the bounds comparisons are, in fact, known at
compile-time. Enforce what was guaranteed with __bos(). In other words,
since all uses of __bos() were constant expressions, it was not required
to test for this. When these change to __bdos(), they _may_ be constant
expressions, and the checks are only valid when the prior condition
holds. This results in no binary differences.

Cc: linux-hardening@vger.kernel.org
Link: https://lore.kernel.org/lkml/20220920192202.190793-3-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 49 ++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index ff879efe94ed..1c582224c525 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -80,6 +80,11 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 #define POS	__pass_object_size(1)
 #define POS0	__pass_object_size(0)
 
+#define __compiletime_lessthan(bounds, length)	(	\
+	__builtin_constant_p((bounds) < (length)) &&	\
+	(bounds) < (length)				\
+)
+
 /**
  * strncpy - Copy a string to memory with non-guaranteed NUL padding
  *
@@ -117,7 +122,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size)
 {
 	size_t p_size = __builtin_object_size(p, 1);
 
-	if (__builtin_constant_p(size) && p_size < size)
+	if (__compiletime_lessthan(p_size, size))
 		__write_overflow();
 	if (p_size < size)
 		fortify_panic(__func__);
@@ -224,7 +229,7 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s
 	 * If size can be known at compile time and is greater than
 	 * p_size, generate a compile time write overflow error.
 	 */
-	if (__builtin_constant_p(size) && size > p_size)
+	if (__compiletime_lessthan(p_size, size))
 		__write_overflow();
 
 	/*
@@ -281,15 +286,16 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size,
 		/*
 		 * Length argument is a constant expression, so we
 		 * can perform compile-time bounds checking where
-		 * buffer sizes are known.
+		 * buffer sizes are also known at compile time.
 		 */
 
 		/* Error when size is larger than enclosing struct. */
-		if (p_size > p_size_field && p_size < size)
+		if (__compiletime_lessthan(p_size_field, p_size) &&
+		    __compiletime_lessthan(p_size, size))
 			__write_overflow();
 
 		/* Warn when write size is larger than dest field. */
-		if (p_size_field < size)
+		if (__compiletime_lessthan(p_size_field, size))
 			__write_overflow_field(p_size_field, size);
 	}
 	/*
@@ -365,25 +371,28 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 		/*
 		 * Length argument is a constant expression, so we
 		 * can perform compile-time bounds checking where
-		 * buffer sizes are known.
+		 * buffer sizes are also known at compile time.
 		 */
 
 		/* Error when size is larger than enclosing struct. */
-		if (p_size > p_size_field && p_size < size)
+		if (__compiletime_lessthan(p_size_field, p_size) &&
+		    __compiletime_lessthan(p_size, size))
 			__write_overflow();
-		if (q_size > q_size_field && q_size < size)
+		if (__compiletime_lessthan(q_size_field, q_size) &&
+		    __compiletime_lessthan(q_size, size))
 			__read_overflow2();
 
 		/* Warn when write size argument larger than dest field. */
-		if (p_size_field < size)
+		if (__compiletime_lessthan(p_size_field, size))
 			__write_overflow_field(p_size_field, size);
 		/*
 		 * Warn for source field over-read when building with W=1
 		 * or when an over-write happened, so both can be fixed at
 		 * the same time.
 		 */
-		if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || p_size_field < size) &&
-		    q_size_field < size)
+		if ((IS_ENABLED(KBUILD_EXTRA_WARN1) ||
+		     __compiletime_lessthan(p_size_field, size)) &&
+		    __compiletime_lessthan(q_size_field, size))
 			__read_overflow2_field(q_size_field, size);
 	}
 	/*
@@ -494,7 +503,7 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size)
 {
 	size_t p_size = __builtin_object_size(p, 0);
 
-	if (__builtin_constant_p(size) && p_size < size)
+	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
 	if (p_size < size)
 		fortify_panic(__func__);
@@ -508,9 +517,9 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t
 	size_t q_size = __builtin_object_size(q, 0);
 
 	if (__builtin_constant_p(size)) {
-		if (p_size < size)
+		if (__compiletime_lessthan(p_size, size))
 			__read_overflow();
-		if (q_size < size)
+		if (__compiletime_lessthan(q_size, size))
 			__read_overflow2();
 	}
 	if (p_size < size || q_size < size)
@@ -523,7 +532,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size)
 {
 	size_t p_size = __builtin_object_size(p, 0);
 
-	if (__builtin_constant_p(size) && p_size < size)
+	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
 	if (p_size < size)
 		fortify_panic(__func__);
@@ -535,7 +544,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
 {
 	size_t p_size = __builtin_object_size(p, 0);
 
-	if (__builtin_constant_p(size) && p_size < size)
+	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
 	if (p_size < size)
 		fortify_panic(__func__);
@@ -547,7 +556,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp
 {
 	size_t p_size = __builtin_object_size(p, 0);
 
-	if (__builtin_constant_p(size) && p_size < size)
+	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
 	if (p_size < size)
 		fortify_panic(__func__);
@@ -563,11 +572,13 @@ char *strcpy(char * const POS p, const char * const POS q)
 	size_t size;
 
 	/* If neither buffer size is known, immediately give up. */
-	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
+	if (__builtin_constant_p(p_size) &&
+	    __builtin_constant_p(q_size) &&
+	    p_size == SIZE_MAX && q_size == SIZE_MAX)
 		return __underlying_strcpy(p, q);
 	size = strlen(q) + 1;
 	/* Compile-time check for const size overflow. */
-	if (__builtin_constant_p(size) && p_size < size)
+	if (__compiletime_lessthan(p_size, size))
 		__write_overflow();
 	/* Run-time check for dynamic size overflow. */
 	if (p_size < size)
-- 
cgit v1.2.3


From 9f7d69c5cd23904a29178a7ecc4eee9c1cfba04b Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 19 Sep 2022 19:50:32 -0700
Subject: fortify: Convert to struct vs member helpers

In preparation for adding support for __builtin_dynamic_object_size(),
wrap each instance of __builtin_object_size(p, N) with either the new
__struct_size(p) as __bos(p, 0), or __member_size(p) as __bos(p, 1).
This will allow us to replace the definitions with __bdos() next.
There are no binary differences from this change.

Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Tom Rix <trix@redhat.com>
Cc: linux-hardening@vger.kernel.org
Cc: llvm@lists.linux.dev
Link: https://lore.kernel.org/lkml/20220920192202.190793-4-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 68 ++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 33 deletions(-)

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 1c582224c525..b62c90cfafaf 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -20,7 +20,7 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("
 ({								\
 	unsigned char *__p = (unsigned char *)(p);		\
 	size_t __ret = SIZE_MAX;				\
-	size_t __p_size = __builtin_object_size(p, 1);		\
+	size_t __p_size = __member_size(p);			\
 	if (__p_size != SIZE_MAX &&				\
 	    __builtin_constant_p(*__p)) {			\
 		size_t __p_len = __p_size - 1;			\
@@ -72,13 +72,15 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 	__underlying_memcpy(dst, src, bytes)
 
 /*
- * Clang's use of __builtin_object_size() within inlines needs hinting via
- * __pass_object_size(). The preference is to only ever use type 1 (member
+ * Clang's use of __builtin_*object_size() within inlines needs hinting via
+ * __pass_*object_size(). The preference is to only ever use type 1 (member
  * size, rather than struct size), but there remain some stragglers using
  * type 0 that will be converted in the future.
  */
-#define POS	__pass_object_size(1)
-#define POS0	__pass_object_size(0)
+#define POS			__pass_object_size(1)
+#define POS0			__pass_object_size(0)
+#define __struct_size(p)	__builtin_object_size(p, 0)
+#define __member_size(p)	__builtin_object_size(p, 1)
 
 #define __compiletime_lessthan(bounds, length)	(	\
 	__builtin_constant_p((bounds) < (length)) &&	\
@@ -120,7 +122,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3)
 char *strncpy(char * const POS p, const char *q, __kernel_size_t size)
 {
-	size_t p_size = __builtin_object_size(p, 1);
+	size_t p_size = __member_size(p);
 
 	if (__compiletime_lessthan(p_size, size))
 		__write_overflow();
@@ -132,7 +134,7 @@ char *strncpy(char * const POS p, const char *q, __kernel_size_t size)
 __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2)
 char *strcat(char * const POS p, const char *q)
 {
-	size_t p_size = __builtin_object_size(p, 1);
+	size_t p_size = __member_size(p);
 
 	if (p_size == SIZE_MAX)
 		return __underlying_strcat(p, q);
@@ -144,7 +146,7 @@ char *strcat(char * const POS p, const char *q)
 extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
 __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen)
 {
-	size_t p_size = __builtin_object_size(p, 1);
+	size_t p_size = __member_size(p);
 	size_t p_len = __compiletime_strlen(p);
 	size_t ret;
 
@@ -174,7 +176,7 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1)
 __kernel_size_t __fortify_strlen(const char * const POS p)
 {
 	__kernel_size_t ret;
-	size_t p_size = __builtin_object_size(p, 1);
+	size_t p_size = __member_size(p);
 
 	/* Give up if we don't know how large p is. */
 	if (p_size == SIZE_MAX)
@@ -189,8 +191,8 @@ __kernel_size_t __fortify_strlen(const char * const POS p)
 extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy);
 __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size)
 {
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
+	size_t p_size = __member_size(p);
+	size_t q_size = __member_size(q);
 	size_t q_len;	/* Full count of source string length. */
 	size_t len;	/* Count of characters going into destination. */
 
@@ -218,8 +220,8 @@ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, s
 {
 	size_t len;
 	/* Use string size rather than possible enclosing struct size. */
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
+	size_t p_size = __member_size(p);
+	size_t q_size = __member_size(q);
 
 	/* If we cannot get size of p and q default to call strscpy. */
 	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
@@ -264,8 +266,8 @@ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3)
 char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count)
 {
 	size_t p_len, copy_len;
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
+	size_t p_size = __member_size(p);
+	size_t q_size = __member_size(q);
 
 	if (p_size == SIZE_MAX && q_size == SIZE_MAX)
 		return __underlying_strncat(p, q, count);
@@ -323,11 +325,11 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size,
 })
 
 /*
- * __builtin_object_size() must be captured here to avoid evaluating argument
- * side-effects further into the macro layers.
+ * __struct_size() vs __member_size() must be captured here to avoid
+ * evaluating argument side-effects further into the macro layers.
  */
 #define memset(p, c, s) __fortify_memset_chk(p, c, s,			\
-		__builtin_object_size(p, 0), __builtin_object_size(p, 1))
+		__struct_size(p), __member_size(p))
 
 /*
  * To make sure the compiler can enforce protection against buffer overflows,
@@ -420,7 +422,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 	 * fake flexible arrays, until they are all converted to
 	 * proper flexible arrays.
 	 *
-	 * The implementation of __builtin_object_size() behaves
+	 * The implementation of __builtin_*object_size() behaves
 	 * like sizeof() when not directly referencing a flexible
 	 * array member, which means there will be many bounds checks
 	 * that will appear at run-time, without a way for them to be
@@ -486,22 +488,22 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
  */
 
 /*
- * __builtin_object_size() must be captured here to avoid evaluating argument
- * side-effects further into the macro layers.
+ * __struct_size() vs __member_size() must be captured here to avoid
+ * evaluating argument side-effects further into the macro layers.
  */
 #define memcpy(p, q, s)  __fortify_memcpy_chk(p, q, s,			\
-		__builtin_object_size(p, 0), __builtin_object_size(q, 0), \
-		__builtin_object_size(p, 1), __builtin_object_size(q, 1), \
+		__struct_size(p), __struct_size(q),			\
+		__member_size(p), __member_size(q),			\
 		memcpy)
 #define memmove(p, q, s)  __fortify_memcpy_chk(p, q, s,			\
-		__builtin_object_size(p, 0), __builtin_object_size(q, 0), \
-		__builtin_object_size(p, 1), __builtin_object_size(q, 1), \
+		__struct_size(p), __struct_size(q),			\
+		__member_size(p), __member_size(q),			\
 		memmove)
 
 extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
 __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size)
 {
-	size_t p_size = __builtin_object_size(p, 0);
+	size_t p_size = __struct_size(p);
 
 	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
@@ -513,8 +515,8 @@ __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size)
 __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3)
 int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size)
 {
-	size_t p_size = __builtin_object_size(p, 0);
-	size_t q_size = __builtin_object_size(q, 0);
+	size_t p_size = __struct_size(p);
+	size_t q_size = __struct_size(q);
 
 	if (__builtin_constant_p(size)) {
 		if (__compiletime_lessthan(p_size, size))
@@ -530,7 +532,7 @@ int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t
 __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3)
 void *memchr(const void * const POS0 p, int c, __kernel_size_t size)
 {
-	size_t p_size = __builtin_object_size(p, 0);
+	size_t p_size = __struct_size(p);
 
 	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
@@ -542,7 +544,7 @@ void *memchr(const void * const POS0 p, int c, __kernel_size_t size)
 void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
 __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
 {
-	size_t p_size = __builtin_object_size(p, 0);
+	size_t p_size = __struct_size(p);
 
 	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
@@ -554,7 +556,7 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
 extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup);
 __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp)
 {
-	size_t p_size = __builtin_object_size(p, 0);
+	size_t p_size = __struct_size(p);
 
 	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
@@ -567,8 +569,8 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp
 __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2)
 char *strcpy(char * const POS p, const char * const POS q)
 {
-	size_t p_size = __builtin_object_size(p, 1);
-	size_t q_size = __builtin_object_size(q, 1);
+	size_t p_size = __member_size(p);
+	size_t q_size = __member_size(q);
 	size_t size;
 
 	/* If neither buffer size is known, immediately give up. */
-- 
cgit v1.2.3


From 72f78ae00a8e5d7abe13abac8305a300f6afd74b Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:01:14 +0200
Subject: NFSD: move from strlcpy with unused retval to strscpy

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4idmap.c | 8 ++++----
 fs/nfsd/nfs4proc.c  | 2 +-
 fs/nfsd/nfssvc.c    | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index f92161ce1f97..e70a1a2999b7 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -82,8 +82,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm)
 	new->id = itm->id;
 	new->type = itm->type;
 
-	strlcpy(new->name, itm->name, sizeof(new->name));
-	strlcpy(new->authname, itm->authname, sizeof(new->authname));
+	strscpy(new->name, itm->name, sizeof(new->name));
+	strscpy(new->authname, itm->authname, sizeof(new->authname));
 }
 
 static void
@@ -548,7 +548,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
 		return nfserr_badowner;
 	memcpy(key.name, name, namelen);
 	key.name[namelen] = '\0';
-	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+	strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item);
 	if (ret == -ENOENT)
 		return nfserr_badowner;
@@ -584,7 +584,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
 	int ret;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
-	strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+	strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
 	ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
 	if (ret == -ENOENT)
 		return encode_ascii_id(xdr, id);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a72ab97f77ef..0437210b9898 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1343,7 +1343,7 @@ try_again:
 		return 0;
 	}
 	if (work) {
-		strlcpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
+		strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
 		refcount_set(&work->nsui_refcnt, 2);
 		work->nsui_busy = true;
 		list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 4bb5baa17040..bfbd9f672f59 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -799,7 +799,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
 	if (nrservs == 0 && nn->nfsd_serv == NULL)
 		goto out;
 
-	strlcpy(nn->nfsd_name, utsname()->nodename,
+	strscpy(nn->nfsd_name, utsname()->nodename,
 		sizeof(nn->nfsd_name));
 
 	error = nfsd_create_serv(net);
-- 
cgit v1.2.3


From 97f8e62572555f8ad578d7b1739ba64d5d2cac0f Mon Sep 17 00:00:00 2001
From: Wolfram Sang <wsa+renesas@sang-engineering.com>
Date: Thu, 18 Aug 2022 23:01:16 +0200
Subject: lockd: move from strlcpy with unused retval to strscpy

Follow the advice of the below link and prefer 'strscpy' in this
subsystem. Conversion is 1:1 because the return value is not used.
Generated by a coccinelle script.

Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index f802223e71ab..cdc8e12cdac4 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -164,7 +164,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
 	host->h_addrbuf    = nsm->sm_addrbuf;
 	host->net	   = ni->net;
 	host->h_cred	   = get_cred(ni->cred);
-	strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
+	strscpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
 
 out:
 	return host;
-- 
cgit v1.2.3


From 754035ff79a14886e68c0c9f6fa80adb21f12b53 Mon Sep 17 00:00:00 2001
From: Olga Kornievskaia <kolga@netapp.com>
Date: Fri, 19 Aug 2022 15:16:36 -0400
Subject: NFSD enforce filehandle check for source file in COPY

If the passed in filehandle for the source file in the COPY operation
is not a regular file, the server MUST return NFS4ERR_WRONG_TYPE.

Signed-off-by: Olga Kornievskaia <kolga@netapp.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0437210b9898..757d8959f992 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1768,7 +1768,13 @@ static int nfsd4_do_async_copy(void *data)
 		filp = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
 				      &copy->stateid);
 		if (IS_ERR(filp)) {
-			nfserr = nfserr_offload_denied;
+			switch (PTR_ERR(filp)) {
+			case -EBADF:
+				nfserr = nfserr_wrong_type;
+				break;
+			default:
+				nfserr = nfserr_offload_denied;
+			}
 			nfsd4_interssc_disconnect(copy->ss_mnt);
 			goto do_callback;
 		}
-- 
cgit v1.2.3


From 4ab3442ca384a02abf8b1f2b3449a6c547851873 Mon Sep 17 00:00:00 2001
From: Jinpeng Cui <cui.jinpeng2@zte.com.cn>
Date: Wed, 31 Aug 2022 14:20:02 +0000
Subject: NFSD: remove redundant variable status

Return value directly from fh_verify() do_open_permission()
exp_pseudoroot() instead of getting value from
redundant variable status.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Jinpeng Cui <cui.jinpeng2@zte.com.cn>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 757d8959f992..7055e1c91d0e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -141,7 +141,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
 static __be32
 do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
 {
-	__be32 status;
 
 	if (open->op_truncate &&
 		!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
@@ -156,9 +155,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 	if (open->op_share_deny & NFS4_SHARE_DENY_READ)
 		accmode |= NFSD_MAY_WRITE;
 
-	status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
-
-	return status;
+	return fh_verify(rqstp, current_fh, S_IFREG, accmode);
 }
 
 static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
@@ -454,7 +451,6 @@ static __be32
 do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
 {
 	struct svc_fh *current_fh = &cstate->current_fh;
-	__be32 status;
 	int accmode = 0;
 
 	/* We don't know the target directory, and therefore can not
@@ -479,9 +475,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str
 	if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
 		accmode = NFSD_MAY_OWNER_OVERRIDE;
 
-	status = do_open_permission(rqstp, current_fh, open, accmode);
-
-	return status;
+	return do_open_permission(rqstp, current_fh, open, accmode);
 }
 
 static void
@@ -668,11 +662,9 @@ static __be32
 nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		union nfsd4_op_u *u)
 {
-	__be32 status;
-
 	fh_put(&cstate->current_fh);
-	status = exp_pseudoroot(rqstp, &cstate->current_fh);
-	return status;
+
+	return exp_pseudoroot(rqstp, &cstate->current_fh);
 }
 
 static __be32
-- 
cgit v1.2.3


From fd1ef88049de09bc70d60b549992524cfc0e66ff Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Sep 2022 07:27:04 +0200
Subject: nfsd: Fix a memory leak in an error handling path

If this memdup_user() call fails, the memory allocated in a previous call
a few lines above should be freed. Otherwise it leaks.

Fixes: 6ee95d1c8991 ("nfsd: add support for upcall version 2")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4recover.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index c634483d85d2..8f24485e0f04 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -815,8 +815,10 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 				princhash.data = memdup_user(
 						&ci->cc_princhash.cp_data,
 						princhashlen);
-				if (IS_ERR_OR_NULL(princhash.data))
+				if (IS_ERR_OR_NULL(princhash.data)) {
+					kfree(name.data);
 					return -EFAULT;
+				}
 				princhash.len = princhashlen;
 			} else
 				princhash.len = 0;
-- 
cgit v1.2.3


From d44899b8bb0b919f923186c616a84f0e70e04772 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Sep 2022 07:27:11 +0200
Subject: nfsd: Avoid some useless tests

memdup_user() can't return NULL, so there is no point for checking for it.

Simplify some tests accordingly.

Suggested-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4recover.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 8f24485e0f04..4edfc9580641 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -807,7 +807,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 			if (get_user(namelen, &ci->cc_name.cn_len))
 				return -EFAULT;
 			name.data = memdup_user(&ci->cc_name.cn_id, namelen);
-			if (IS_ERR_OR_NULL(name.data))
+			if (IS_ERR(name.data))
 				return -EFAULT;
 			name.len = namelen;
 			get_user(princhashlen, &ci->cc_princhash.cp_len);
@@ -815,7 +815,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 				princhash.data = memdup_user(
 						&ci->cc_princhash.cp_data,
 						princhashlen);
-				if (IS_ERR_OR_NULL(princhash.data)) {
+				if (IS_ERR(princhash.data)) {
 					kfree(name.data);
 					return -EFAULT;
 				}
@@ -829,7 +829,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 			if (get_user(namelen, &cnm->cn_len))
 				return -EFAULT;
 			name.data = memdup_user(&cnm->cn_id, namelen);
-			if (IS_ERR_OR_NULL(name.data))
+			if (IS_ERR(name.data))
 				return -EFAULT;
 			name.len = namelen;
 		}
-- 
cgit v1.2.3


From 30a30fcc3fc1ad4c5d017c9fcb75dc8f59e7bdad Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Thu, 1 Sep 2022 07:27:19 +0200
Subject: nfsd: Propagate some error code returned by memdup_user()

Propagate the error code returned by memdup_user() instead of a hard coded
-EFAULT.

Suggested-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4recover.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 4edfc9580641..5d680045fa2c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -808,7 +808,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 				return -EFAULT;
 			name.data = memdup_user(&ci->cc_name.cn_id, namelen);
 			if (IS_ERR(name.data))
-				return -EFAULT;
+				return PTR_ERR(name.data);
 			name.len = namelen;
 			get_user(princhashlen, &ci->cc_princhash.cp_len);
 			if (princhashlen > 0) {
@@ -817,7 +817,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 						princhashlen);
 				if (IS_ERR(princhash.data)) {
 					kfree(name.data);
-					return -EFAULT;
+					return PTR_ERR(princhash.data);
 				}
 				princhash.len = princhashlen;
 			} else
@@ -830,7 +830,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 				return -EFAULT;
 			name.data = memdup_user(&cnm->cn_id, namelen);
 			if (IS_ERR(name.data))
-				return -EFAULT;
+				return PTR_ERR(name.data);
 			name.len = namelen;
 		}
 		if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
-- 
cgit v1.2.3


From 80e591ce636f3ae6855a0ca26963da1fdd6d4508 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 2 Sep 2022 18:18:16 -0400
Subject: NFSD: Increase NFSD_MAX_OPS_PER_COMPOUND

When attempting an NFSv4 mount, a Solaris NFSv4 client builds a
single large COMPOUND that chains a series of LOOKUPs to get to the
pseudo filesystem root directory that is to be mounted. The Linux
NFS server's current maximum of 16 operations per NFSv4 COMPOUND is
not large enough to ensure that this works for paths that are more
than a few components deep.

Since NFSD_MAX_OPS_PER_COMPOUND is mostly a sanity check, and most
NFSv4 COMPOUNDS are between 3 and 6 operations (thus they do not
trigger any re-allocation of the operation array on the server),
increasing this maximum should result in little to no impact.

The ops array can get large now, so allocate it via vmalloc() to
help ensure memory fragmentation won't cause an allocation failure.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=216383
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 7 ++++---
 fs/nfsd/state.h   | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e9690a061ec..4b69e86240eb 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -42,6 +42,8 @@
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/addr.h>
 #include <linux/xattr.h>
+#include <linux/vmalloc.h>
+
 #include <uapi/linux/xattr.h>
 
 #include "idmap.h"
@@ -2369,10 +2371,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 		return true;
 
 	if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
-		argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
+		argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
 		if (!argp->ops) {
 			argp->ops = argp->iops;
-			dprintk("nfsd: couldn't allocate room for COMPOUND\n");
 			return false;
 		}
 	}
@@ -5394,7 +5395,7 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
 	struct nfsd4_compoundargs *args = rqstp->rq_argp;
 
 	if (args->ops != args->iops) {
-		kfree(args->ops);
+		vfree(args->ops);
 		args->ops = args->iops;
 	}
 	while (args->to_free) {
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ae596dbf8667..5d28beb290fe 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -175,7 +175,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
 /* Maximum number of slots per session. 160 is useful for long haul TCP */
 #define NFSD_MAX_SLOTS_PER_SESSION     160
 /* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND	16
+#define NFSD_MAX_OPS_PER_COMPOUND	50
 /* Maximum  session per slot cache size */
 #define NFSD_SLOT_CACHE_SIZE		2048
 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
-- 
cgit v1.2.3


From 90bfc37b5ab91c1a6165e3e5cfc49bf04571b762 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:09:53 -0400
Subject: SUNRPC: Fix svcxdr_init_decode's end-of-buffer calculation

Ensure that stream-based argument decoding can't go past the actual
end of the receive buffer. xdr_init_decode's calculation of the
value of xdr->end over-estimates the end of the buffer because the
Linux kernel RPC server code does not remove the size of the RPC
header from rqstp->rq_arg before calling the upper layer's
dispatcher.

The server-side still uses the svc_getnl() macros to decode the
RPC call header. These macros reduce the length of the head iov
but do not update the total length of the message in the buffer
(buf->len).

A proper fix for this would be to replace the use of svc_getnl() and
friends in the RPC header decoder, but that would be a large and
invasive change that would be difficult to backport.

Fixes: 5191955d6fc6 ("SUNRPC: Prepare for xdr_stream-style decoding on the server-side")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index daecb009c05b..5a830b66f059 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -544,16 +544,27 @@ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space)
 }
 
 /**
- * svcxdr_init_decode - Prepare an xdr_stream for svc Call decoding
+ * svcxdr_init_decode - Prepare an xdr_stream for Call decoding
  * @rqstp: controlling server RPC transaction context
  *
+ * This function currently assumes the RPC header in rq_arg has
+ * already been decoded. Upon return, xdr->p points to the
+ * location of the upper layer header.
  */
 static inline void svcxdr_init_decode(struct svc_rqst *rqstp)
 {
 	struct xdr_stream *xdr = &rqstp->rq_arg_stream;
-	struct kvec *argv = rqstp->rq_arg.head;
+	struct xdr_buf *buf = &rqstp->rq_arg;
+	struct kvec *argv = buf->head;
 
-	xdr_init_decode(xdr, &rqstp->rq_arg, argv->iov_base, NULL);
+	/*
+	 * svc_getnl() and friends do not keep the xdr_buf's ::len
+	 * field up to date. Refresh that field before initializing
+	 * the argument decoding stream.
+	 */
+	buf->len = buf->head->iov_len + buf->page_len + buf->tail->iov_len;
+
+	xdr_init_decode(xdr, buf, argv->iov_base, NULL);
 	xdr_set_scratch_page(xdr, rqstp->rq_scratch_page);
 }
 
-- 
cgit v1.2.3


From 1242a87da0d8cd2a428e96ca68e7ea899b0f4624 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:09:59 -0400
Subject: SUNRPC: Fix svcxdr_init_encode's buflen calculation

Commit 2825a7f90753 ("nfsd4: allow encoding across page boundaries")
added an explicit computation of the remaining length in the rq_res
XDR buffer.

The computation appears to suffer from an "off-by-one" bug. Because
buflen is too large by one page, XDR encoding can run off the end of
the send buffer by eventually trying to use the struct page address
in rq_page_end, which always contains NULL.

Fixes: bddfdbcddbe2 ("NFSD: Extract the svcxdr_init_encode() helper")
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 5a830b66f059..0ca8a8ffb47e 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -587,7 +587,7 @@ static inline void svcxdr_init_encode(struct svc_rqst *rqstp)
 	xdr->end = resv->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
 	buf->len = resv->iov_len;
 	xdr->page_ptr = buf->pages - 1;
-	buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages);
+	buf->buflen = PAGE_SIZE * (rqstp->rq_page_end - buf->pages);
 	buf->buflen -= rqstp->rq_auth_slack;
 	xdr->rqst = NULL;
 }
-- 
cgit v1.2.3


From 00b4492686e0497fdb924a9d4c8f6f99377e176c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:10:05 -0400
Subject: NFSD: Protect against send buffer overflow in NFSv2 READDIR

Restore the previous limit on the @count argument to prevent a
buffer overflow attack.

Fixes: 53b1119a6e50 ("NFSD: Fix READDIR buffer overflow")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsproc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 7381972f1677..ddb1902c0a18 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -567,12 +567,11 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
 	struct xdr_buf *buf = &resp->dirlist;
 	struct xdr_stream *xdr = &resp->xdr;
 
-	count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
-
 	memset(buf, 0, sizeof(*buf));
 
 	/* Reserve room for the NULL ptr & eof flag (-2 words) */
-	buf->buflen = count - XDR_UNIT * 2;
+	buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE);
+	buf->buflen -= XDR_UNIT * 2;
 	buf->pages = rqstp->rq_next_page;
 	rqstp->rq_next_page++;
 
-- 
cgit v1.2.3


From 640f87c190e0d1b2a0fcb2ecf6d2cd53b1c41991 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:10:12 -0400
Subject: NFSD: Protect against send buffer overflow in NFSv3 READDIR

Since before the git era, NFSD has conserved the number of pages
held by each nfsd thread by combining the RPC receive and send
buffers into a single array of pages. This works because there are
no cases where an operation needs a large RPC Call message and a
large RPC Reply message at the same time.

Once an RPC Call has been received, svc_process() updates
svc_rqst::rq_res to describe the part of rq_pages that can be
used for constructing the Reply. This means that the send buffer
(rq_res) shrinks when the received RPC record containing the RPC
Call is large.

A client can force this shrinkage on TCP by sending a correctly-
formed RPC Call header contained in an RPC record that is
excessively large. The full maximum payload size cannot be
constructed in that case.

Thanks to Aleksi Illikainen and Kari Hulkko for uncovering this
issue.

Reported-by: Ben Ronallo <Benjamin.Ronallo@synopsys.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3proc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a41cca619338..7a159785499a 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -563,13 +563,14 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
 {
 	struct xdr_buf *buf = &resp->dirlist;
 	struct xdr_stream *xdr = &resp->xdr;
-
-	count = clamp(count, (u32)(XDR_UNIT * 2), svc_max_payload(rqstp));
+	unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen,
+				     svc_max_payload(rqstp));
 
 	memset(buf, 0, sizeof(*buf));
 
 	/* Reserve room for the NULL ptr & eof flag (-2 words) */
-	buf->buflen = count - XDR_UNIT * 2;
+	buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf);
+	buf->buflen -= XDR_UNIT * 2;
 	buf->pages = rqstp->rq_next_page;
 	rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-- 
cgit v1.2.3


From 401bc1f90874280a80b93f23be33a0e7e2d1f912 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:10:18 -0400
Subject: NFSD: Protect against send buffer overflow in NFSv2 READ

Since before the git era, NFSD has conserved the number of pages
held by each nfsd thread by combining the RPC receive and send
buffers into a single array of pages. This works because there are
no cases where an operation needs a large RPC Call message and a
large RPC Reply at the same time.

Once an RPC Call has been received, svc_process() updates
svc_rqst::rq_res to describe the part of rq_pages that can be
used for constructing the Reply. This means that the send buffer
(rq_res) shrinks when the received RPC record containing the RPC
Call is large.

A client can force this shrinkage on TCP by sending a correctly-
formed RPC Call header contained in an RPC record that is
excessively large. The full maximum payload size cannot be
constructed in that case.

Cc: <stable@vger.kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsproc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index ddb1902c0a18..4b19cc727ea5 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -185,6 +185,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
 		argp->count, argp->offset);
 
 	argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+	argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
 
 	v = 0;
 	len = argp->count;
-- 
cgit v1.2.3


From fa6be9cc6e80ec79892ddf08a8c10cabab9baf38 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:10:24 -0400
Subject: NFSD: Protect against send buffer overflow in NFSv3 READ

Since before the git era, NFSD has conserved the number of pages
held by each nfsd thread by combining the RPC receive and send
buffers into a single array of pages. This works because there are
no cases where an operation needs a large RPC Call message and a
large RPC Reply at the same time.

Once an RPC Call has been received, svc_process() updates
svc_rqst::rq_res to describe the part of rq_pages that can be
used for constructing the Reply. This means that the send buffer
(rq_res) shrinks when the received RPC record containing the RPC
Call is large.

A client can force this shrinkage on TCP by sending a correctly-
formed RPC Call header contained in an RPC record that is
excessively large. The full maximum payload size cannot be
constructed in that case.

Cc: <stable@vger.kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3proc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 7a159785499a..5b1e771238b3 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -150,7 +150,6 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 {
 	struct nfsd3_readargs *argp = rqstp->rq_argp;
 	struct nfsd3_readres *resp = rqstp->rq_resp;
-	u32 max_blocksize = svc_max_payload(rqstp);
 	unsigned int len;
 	int v;
 
@@ -159,7 +158,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
 				(unsigned long) argp->count,
 				(unsigned long long) argp->offset);
 
-	argp->count = min_t(u32, argp->count, max_blocksize);
+	argp->count = min_t(u32, argp->count, svc_max_payload(rqstp));
+	argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
 	if (argp->offset > (u64)OFFSET_MAX)
 		argp->offset = (u64)OFFSET_MAX;
 	if (argp->offset + argp->count > (u64)OFFSET_MAX)
-- 
cgit v1.2.3


From 9558f9304ca1903090fa5d995a3269a8e82804b4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 6 Sep 2022 10:42:19 +1000
Subject: NFSD: drop fname and flen args from nfsd_create_locked()

nfsd_create_locked() does not use the "fname" and "flen" arguments, so
drop them from declaration and all callers.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsproc.c | 5 ++---
 fs/nfsd/vfs.c     | 5 ++---
 fs/nfsd/vfs.h     | 4 ++--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 4b19cc727ea5..ee02ede74bf5 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -391,9 +391,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
 	resp->status = nfs_ok;
 	if (!inode) {
 		/* File doesn't exist. Create it and set attrs */
-		resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
-						  argp->len, &attrs, type, rdev,
-						  newfhp);
+		resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type,
+						  rdev, newfhp);
 	} else if (type == S_IFREG) {
 		dprintk("nfsd:   existing %s, valid=%x, size=%ld\n",
 			argp->name, attr->ia_valid, (long) attr->ia_size);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fc17b0ac8729..712fbfe30f53 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,7 +1255,7 @@ nfsd_check_ignore_resizing(struct iattr *iap)
 /* The parent directory should already be locked: */
 __be32
 nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
-		   char *fname, int flen, struct nfsd_attrs *attrs,
+		   struct nfsd_attrs *attrs,
 		   int type, dev_t rdev, struct svc_fh *resfhp)
 {
 	struct dentry	*dentry, *dchild;
@@ -1382,8 +1382,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (err)
 		goto out_unlock;
 	fh_fill_pre_attrs(fhp);
-	err = nfsd_create_locked(rqstp, fhp, fname, flen, attrs, type,
-				 rdev, resfhp);
+	err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
 	fh_fill_post_attrs(fhp);
 out_unlock:
 	inode_unlock(dentry->d_inode);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index c95cd414b4bb..120521bc7b24 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -79,8 +79,8 @@ __be32		nfsd4_clone_file_range(struct svc_rqst *rqstp,
 				       u64 count, bool sync);
 #endif /* CONFIG_NFSD_V4 */
 __be32		nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
-				char *name, int len, struct nfsd_attrs *attrs,
-				int type, dev_t rdev, struct svc_fh *res);
+				struct nfsd_attrs *attrs, int type, dev_t rdev,
+				struct svc_fh *res);
 __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct nfsd_attrs *attrs,
 				int type, dev_t rdev, struct svc_fh *res);
-- 
cgit v1.2.3


From 7518a3dc5ea249d4112156ce71b8b184eb786151 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 5 Sep 2022 15:33:32 -0400
Subject: NFSD: Fix handling of oversized NFSv4 COMPOUND requests

If an NFS server returns NFS4ERR_RESOURCE on the first operation in
an NFSv4 COMPOUND, there's no way for a client to know where the
problem is and then simplify the compound to make forward progress.

So instead, make NFSD process as many operations in an oversized
COMPOUND as it can and then return NFS4ERR_RESOURCE on the first
operation it did not process.

pynfs NFSv4.0 COMP6 exercises this case, but checks only for the
COMPOUND status code, not whether the server has processed any
of the operations.

pynfs NFSv4.1 SEQ6 and SEQ7 exercise the NFSv4.1 case, which detects
too many operations per COMPOUND by checking against the limits
negotiated when the session was created.

Suggested-by: Bruce Fields <bfields@fieldses.org>
Fixes: 0078117c6d91 ("nfsd: return RESOURCE not GARBAGE_ARGS on too many ops")
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c | 19 +++++++++++++------
 fs/nfsd/nfs4xdr.c  | 12 +++---------
 fs/nfsd/xdr4.h     |  3 ++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7055e1c91d0e..6ef0795fefb4 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2631,9 +2631,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 	status = nfserr_minor_vers_mismatch;
 	if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0)
 		goto out;
-	status = nfserr_resource;
-	if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
-		goto out;
 
 	status = nfs41_check_op_ordering(args);
 	if (status) {
@@ -2646,10 +2643,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 
 	rqstp->rq_lease_breaker = (void **)&cstate->clp;
 
-	trace_nfsd_compound(rqstp, args->opcnt);
+	trace_nfsd_compound(rqstp, args->client_opcnt);
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
+		if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
+			/* If there are still more operations to process,
+			 * stop here and report NFS4ERR_RESOURCE. */
+			if (cstate->minorversion == 0 &&
+			    args->client_opcnt > resp->opcnt) {
+				op->status = nfserr_resource;
+				goto encode_op;
+			}
+		}
+
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -2725,8 +2732,8 @@ encode_op:
 			status = op->status;
 		}
 
-		trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
-					   nfsd4_op_name(op->opnum));
+		trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
+					   status, nfsd4_op_name(op->opnum));
 
 		nfsd4_cstate_clear_replay(cstate);
 		nfsd4_increment_op_stats(op->opnum);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 4b69e86240eb..09519b4a860a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2359,16 +2359,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 
 	if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
 		return false;
-	if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
+	if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
 		return false;
-
-	/*
-	 * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
-	 * here, so we return success at the xdr level so that
-	 * nfsd4_proc can handle this is an NFS-level error.
-	 */
-	if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
-		return true;
+	argp->opcnt = min_t(u32, argp->client_opcnt,
+			    NFSD_MAX_OPS_PER_COMPOUND);
 
 	if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
 		argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 96267258e629..466e2786fc97 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -717,9 +717,10 @@ struct nfsd4_compoundargs {
 	struct svcxdr_tmpbuf		*to_free;
 	struct svc_rqst			*rqstp;
 
-	u32				taglen;
 	char *				tag;
+	u32				taglen;
 	u32				minorversion;
+	u32				client_opcnt;
 	u32				opcnt;
 	struct nfsd4_op			*ops;
 	struct nfsd4_op			iops[8];
-- 
cgit v1.2.3


From 6106d9119b6599fa23dc556b429d887b4c2d9f62 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Thu, 8 Sep 2022 12:31:07 -0400
Subject: nfsd: clean up mounted_on_fileid handling

We only need the inode number for this, not a full rack of attributes.
Rename this function make it take a pointer to a u64 instead of
struct kstat, and change it to just request STATX_INO.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
[ cel: renamed get_mounted_on_ino() ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 09519b4a860a..30d4897e62ab 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2769,9 +2769,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32
 }
 
 
-static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
 {
 	struct path path = exp->ex_path;
+	struct kstat stat;
 	int err;
 
 	path_get(&path);
@@ -2779,8 +2780,10 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
 		if (path.dentry != path.mnt->mnt_root)
 			break;
 	}
-	err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+	err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
 	path_put(&path);
+	if (!err)
+		*pino = stat.ino;
 	return err;
 }
 
@@ -3277,22 +3280,21 @@ out_acl:
 		*p++ = cpu_to_be32(stat.btime.tv_nsec);
 	}
 	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
-		struct kstat parent_stat;
 		u64 ino = stat.ino;
 
 		p = xdr_reserve_space(xdr, 8);
 		if (!p)
                 	goto out_resource;
 		/*
-		 * Get parent's attributes if not ignoring crossmount
-		 * and this is the root of a cross-mounted filesystem.
+		 * Get ino of mountpoint in parent filesystem, if not ignoring
+		 * crossmount and this is the root of a cross-mounted
+		 * filesystem.
 		 */
 		if (ignore_crossmnt == 0 &&
 		    dentry == exp->ex_path.mnt->mnt_root) {
-			err = get_parent_attributes(exp, &parent_stat);
+			err = nfsd4_get_mounted_on_ino(exp, &ino);
 			if (err)
 				goto out_nfserr;
-			ino = parent_stat.ino;
 		}
 		p = xdr_encode_hyper(p, ino);
 	}
-- 
cgit v1.2.3


From 18224dc58d960c65446971930d0487fc72d00598 Mon Sep 17 00:00:00 2001
From: Gaosheng Cui <cuigaosheng1@huawei.com>
Date: Fri, 9 Sep 2022 14:59:10 +0800
Subject: nfsd: remove nfsd4_prepare_cb_recall() declaration

nfsd4_prepare_cb_recall() has been removed since
commit 0162ac2b978e ("nfsd: introduce nfsd4_callback_ops"),
so remove it.

Signed-off-by: Gaosheng Cui <cuigaosheng1@huawei.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/state.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 5d28beb290fe..4155be65d806 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -697,7 +697,6 @@ extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
-extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
 				struct xdr_netobj princhash, struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
-- 
cgit v1.2.3


From 948755efc951de75c87d4fa916d9d36b58299295 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:13:42 -0400
Subject: NFSD: Replace dprintk() call site in fh_verify()

Record permission errors in the trace log. Note that the new trace
event is conditional, so it will only record non-zero return values
from nfsd_permission().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfsfh.c |  8 +-------
 fs/nfsd/trace.h | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index a5b71526cee0..d73434200df9 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -392,13 +392,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
 skip_pseudoflavor_check:
 	/* Finally, check access permissions. */
 	error = nfsd_permission(rqstp, exp, dentry, access);
-
-	if (error) {
-		dprintk("fh_verify: %pd2 permission failure, "
-			"acc=%x, error=%d\n",
-			dentry,
-			access, ntohl(error));
-	}
+	trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
 out:
 	if (error == nfserr_stale)
 		nfsd_stats_fh_stale_inc(exp);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 9ebd67d461f9..1b9f5753f336 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -195,7 +195,7 @@ TRACE_EVENT(nfsd_fh_verify,
 		__sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
 		__field(u32, xid)
 		__field(u32, fh_hash)
-		__field(void *, inode)
+		__field(const void *, inode)
 		__field(unsigned long, type)
 		__field(unsigned long, access)
 	),
@@ -211,13 +211,55 @@ TRACE_EVENT(nfsd_fh_verify,
 		__entry->type = type;
 		__entry->access = access;
 	),
-	TP_printk("xid=0x%08x fh_hash=0x%08x inode=%p type=%s access=%s",
-		__entry->xid, __entry->fh_hash, __entry->inode,
+	TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s",
+		__entry->xid, __entry->fh_hash,
 		show_fs_file_type(__entry->type),
 		show_nfsd_may_flags(__entry->access)
 	)
 );
 
+TRACE_EVENT_CONDITION(nfsd_fh_verify_err,
+	TP_PROTO(
+		const struct svc_rqst *rqstp,
+		const struct svc_fh *fhp,
+		umode_t type,
+		int access,
+		__be32 error
+	),
+	TP_ARGS(rqstp, fhp, type, access, error),
+	TP_CONDITION(error),
+	TP_STRUCT__entry(
+		__field(unsigned int, netns_ino)
+		__sockaddr(server, rqstp->rq_xprt->xpt_remotelen)
+		__sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
+		__field(u32, xid)
+		__field(u32, fh_hash)
+		__field(const void *, inode)
+		__field(unsigned long, type)
+		__field(unsigned long, access)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+		__assign_sockaddr(server, &rqstp->rq_xprt->xpt_local,
+		       rqstp->rq_xprt->xpt_locallen);
+		__assign_sockaddr(client, &rqstp->rq_xprt->xpt_remote,
+				  rqstp->rq_xprt->xpt_remotelen);
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+		__entry->inode = d_inode(fhp->fh_dentry);
+		__entry->type = type;
+		__entry->access = access;
+		__entry->error = be32_to_cpu(error);
+	),
+	TP_printk("xid=0x%08x fh_hash=0x%08x type=%s access=%s error=%d",
+		__entry->xid, __entry->fh_hash,
+		show_fs_file_type(__entry->type),
+		show_nfsd_may_flags(__entry->access),
+		__entry->error
+	)
+);
+
 DECLARE_EVENT_CLASS(nfsd_fh_err_class,
 	TP_PROTO(struct svc_rqst *rqstp,
 		 struct svc_fh	*fhp,
-- 
cgit v1.2.3


From de29cf7e6cbbe236c3a51999c188fcd467762899 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:13:48 -0400
Subject: NFSD: Trace NFSv4 COMPOUND tags

The Linux NFSv4 client implementation does not use COMPOUND tags,
but the Solaris and MacOS implementations do, and so does pynfs.
Record these eye-catchers in the server's trace buffer to annotate
client requests while troubleshooting.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4proc.c |  2 +-
 fs/nfsd/trace.h    | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6ef0795fefb4..f14b015022ed 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2643,7 +2643,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
 
 	rqstp->rq_lease_breaker = (void **)&cstate->clp;
 
-	trace_nfsd_compound(rqstp, args->client_opcnt);
+	trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
 	while (!status && resp->opcnt < args->opcnt) {
 		op = &args->ops[resp->opcnt++];
 
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 1b9f5753f336..0c35a1a844e6 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -84,19 +84,26 @@ DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
 		{ NFSD_MAY_64BIT_COOKIE,	"64BIT_COOKIE" })
 
 TRACE_EVENT(nfsd_compound,
-	TP_PROTO(const struct svc_rqst *rqst,
-		 u32 args_opcnt),
-	TP_ARGS(rqst, args_opcnt),
+	TP_PROTO(
+		const struct svc_rqst *rqst,
+		const char *tag,
+		u32 taglen,
+		u32 opcnt
+	),
+	TP_ARGS(rqst, tag, taglen, opcnt),
 	TP_STRUCT__entry(
 		__field(u32, xid)
-		__field(u32, args_opcnt)
+		__field(u32, opcnt)
+		__string_len(tag, tag, taglen)
 	),
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
-		__entry->args_opcnt = args_opcnt;
+		__entry->opcnt = opcnt;
+		__assign_str_len(tag, tag, taglen);
 	),
-	TP_printk("xid=0x%08x opcnt=%u",
-		__entry->xid, __entry->args_opcnt)
+	TP_printk("xid=0x%08x opcnt=%u tag=%s",
+		__entry->xid, __entry->opcnt, __get_str(tag)
+	)
 )
 
 TRACE_EVENT(nfsd_compound_status,
-- 
cgit v1.2.3


From 1035d65446a018ca2dd179e29a2fcd6d29057781 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:13:54 -0400
Subject: NFSD: Add tracepoints to report NFSv4 callback completions

Wireshark has always been lousy about dissecting NFSv4 callbacks,
especially NFSv4.0 backchannel requests. Add tracepoints so we
can surgically capture these events in the trace log.

Tracepoints are time-stamped and ordered so that we can now observe
the timing relationship between a CB_RECALL Reply and the client's
DELEGRETURN Call. Example:

            nfsd-1153  [002]   211.986391: nfsd_cb_recall:       addr=192.168.1.67:45767 client 62ea82e4:fee7492a stateid 00000003:00000001

            nfsd-1153  [002]   212.095634: nfsd_compound:        xid=0x0000002c opcnt=2
            nfsd-1153  [002]   212.095647: nfsd_compound_status: op=1/2 OP_PUTFH status=0
            nfsd-1153  [002]   212.095658: nfsd_file_put:        hash=0xf72 inode=0xffff9291148c7410 ref=3 flags=HASHED|REFERENCED may=READ file=0xffff929103b3ea00
            nfsd-1153  [002]   212.095661: nfsd_compound_status: op=2/2 OP_DELEGRETURN status=0
   kworker/u25:8-148   [002]   212.096713: nfsd_cb_recall_done:  client 62ea82e4:fee7492a stateid 00000003:00000001 status=0

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4layouts.c |  2 +-
 fs/nfsd/nfs4proc.c    |  4 ++++
 fs/nfsd/nfs4state.c   |  4 ++++
 fs/nfsd/trace.h       | 39 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2c05692a9abf..3564d1c6f610 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -658,7 +658,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 	ktime_t now, cutoff;
 	const struct nfsd4_layout_ops *ops;
 
-
+	trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
 	switch (task->tk_status) {
 	case 0:
 	case -NFS4ERR_DELAY:
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f14b015022ed..1918c9ec3478 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1613,6 +1613,10 @@ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
 static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
 				 struct rpc_task *task)
 {
+	struct nfsd4_cb_offload *cbo =
+		container_of(cb, struct nfsd4_cb_offload, co_cb);
+
+	trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task);
 	return 1;
 }
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c5d199d7e6b4..561f3556b1d2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -357,6 +357,8 @@ nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
 static int
 nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
+	trace_nfsd_cb_notify_lock_done(&zero_stateid, task);
+
 	/*
 	 * Since this is just an optimization, we don't try very hard if it
 	 * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
@@ -4743,6 +4745,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
 {
 	struct nfs4_delegation *dp = cb_to_delegation(cb);
 
+	trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
+
 	if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
 	    dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
 	        return 1;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 0c35a1a844e6..ec8e08315779 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1448,6 +1448,45 @@ TRACE_EVENT(nfsd_cb_offload,
 		__entry->fh_hash, __entry->count, __entry->status)
 );
 
+DECLARE_EVENT_CLASS(nfsd_cb_done_class,
+	TP_PROTO(
+		const stateid_t *stp,
+		const struct rpc_task *task
+	),
+	TP_ARGS(stp, task),
+	TP_STRUCT__entry(
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, si_id)
+		__field(u32, si_generation)
+		__field(int, status)
+	),
+	TP_fast_assign(
+		__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+		__entry->cl_id = stp->si_opaque.so_clid.cl_id;
+		__entry->si_id = stp->si_opaque.so_id;
+		__entry->si_generation = stp->si_generation;
+		__entry->status = task->tk_status;
+	),
+	TP_printk("client %08x:%08x stateid %08x:%08x status=%d",
+		__entry->cl_boot, __entry->cl_id, __entry->si_id,
+		__entry->si_generation, __entry->status
+	)
+);
+
+#define DEFINE_NFSD_CB_DONE_EVENT(name)			\
+DEFINE_EVENT(nfsd_cb_done_class, name,			\
+	TP_PROTO(					\
+		const stateid_t *stp,			\
+		const struct rpc_task *task		\
+	),						\
+	TP_ARGS(stp, task))
+
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done);
+DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done);
+
 #endif /* _NFSD_TRACE_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3


From c035362eb935fe9381d9d1cc453bc2a37460e24c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:14:00 -0400
Subject: NFSD: Add a mechanism to wait for a DELEGRETURN

Subsequent patches will use this mechanism to wake up an operation
that is waiting for a client to return a delegation.

The new tracepoint records whether the wait timed out or was
properly awoken by the expected DELEGRETURN:

            nfsd-1155  [002] 83799.493199: nfsd_delegret_wakeup: xid=0x14b7d6ef fh_hash=0xf6826792 (timed out)

Suggested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 30 ++++++++++++++++++++++++++++++
 fs/nfsd/nfsd.h      |  7 +++++++
 fs/nfsd/trace.h     | 23 +++++++++++++++++++++++
 3 files changed, 60 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 561f3556b1d2..54bc70427ce3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4717,6 +4717,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	return ret;
 }
 
+static bool nfsd4_deleg_present(const struct inode *inode)
+{
+	struct file_lock_context *ctx = smp_load_acquire(&inode->i_flctx);
+
+	return ctx && !list_empty_careful(&ctx->flc_lease);
+}
+
+/**
+ * nfsd_wait_for_delegreturn - wait for delegations to be returned
+ * @rqstp: the RPC transaction being executed
+ * @inode: in-core inode of the file being waited for
+ *
+ * The timeout prevents deadlock if all nfsd threads happen to be
+ * tied up waiting for returning delegations.
+ *
+ * Return values:
+ *   %true: delegation was returned
+ *   %false: timed out waiting for delegreturn
+ */
+bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode)
+{
+	long __maybe_unused timeo;
+
+	timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode),
+				       NFSD_DELEGRETURN_TIMEOUT);
+	trace_nfsd_delegret_wakeup(rqstp, inode, timeo);
+	return timeo > 0;
+}
+
 static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
 {
 	struct nfs4_delegation *dp = cb_to_delegation(cb);
@@ -6779,6 +6808,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto put_stateid;
 
+	wake_up_var(d_inode(cstate->current_fh.fh_dentry));
 	destroy_delegation(dp);
 put_stateid:
 	nfs4_put_stid(&dp->dl_stid);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 57a468ed85c3..6ab4ad41ae84 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -164,6 +164,7 @@ char * nfs4_recoverydir(void);
 bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
 int nfsd4_create_laundry_wq(void);
 void nfsd4_destroy_laundry_wq(void);
+bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode);
 #else
 static inline int nfsd4_init_slabs(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
@@ -179,6 +180,11 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
 }
 static inline int nfsd4_create_laundry_wq(void) { return 0; };
 static inline void nfsd4_destroy_laundry_wq(void) {};
+static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp,
+					      struct inode *inode)
+{
+	return false;
+}
 #endif
 
 /*
@@ -343,6 +349,7 @@ void		nfsd_lockd_shutdown(void);
 #define	NFSD_COURTESY_CLIENT_TIMEOUT	(24 * 60 * 60)	/* seconds */
 #define	NFSD_CLIENT_MAX_TRIM_PER_RUN	128
 #define	NFS4_CLIENTS_PER_GB		1024
+#define NFSD_DELEGRETURN_TIMEOUT	(HZ / 34)	/* 30ms */
 
 /*
  * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index ec8e08315779..06a96e955bd0 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -538,6 +538,29 @@ DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err);
 #include "filecache.h"
 #include "vfs.h"
 
+TRACE_EVENT(nfsd_delegret_wakeup,
+	TP_PROTO(
+		const struct svc_rqst *rqstp,
+		const struct inode *inode,
+		long timeo
+	),
+	TP_ARGS(rqstp, inode, timeo),
+	TP_STRUCT__entry(
+		__field(u32, xid)
+		__field(const void *, inode)
+		__field(long, timeo)
+	),
+	TP_fast_assign(
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->inode = inode;
+		__entry->timeo = timeo;
+	),
+	TP_printk("xid=0x%08x inode=%p%s",
+		  __entry->xid, __entry->inode,
+		  __entry->timeo == 0 ? " (timed out)" : ""
+	)
+);
+
 DECLARE_EVENT_CLASS(nfsd_stateid_class,
 	TP_PROTO(stateid_t *stp),
 	TP_ARGS(stp),
-- 
cgit v1.2.3


From c0aa1913db57219e91a0a8832363cbafb3a9cf8f Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:14:07 -0400
Subject: NFSD: Refactor nfsd_setattr()

Move code that will be retried (in a subsequent patch) into a helper
function.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/vfs.c | 97 +++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 57 insertions(+), 40 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 712fbfe30f53..7c17b4a93168 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -343,8 +343,61 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	return nfserrno(get_write_access(inode));
 }
 
-/*
- * Set various file attributes.  After this call fhp needs an fh_put.
+static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
+{
+	int host_err;
+
+	if (iap->ia_valid & ATTR_SIZE) {
+		/*
+		 * RFC5661, Section 18.30.4:
+		 *   Changing the size of a file with SETATTR indirectly
+		 *   changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
+		 */
+		struct iattr size_attr = {
+			.ia_valid	= ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+			.ia_size	= iap->ia_size,
+		};
+
+		if (iap->ia_size < 0)
+			return -EFBIG;
+
+		host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
+		if (host_err)
+			return host_err;
+		iap->ia_valid &= ~ATTR_SIZE;
+
+		/*
+		 * Avoid the additional setattr call below if the only other
+		 * attribute that the client sends is the mtime, as we update
+		 * it as part of the size change above.
+		 */
+		if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+			return 0;
+	}
+
+	if (!iap->ia_valid)
+		return 0;
+
+	iap->ia_valid |= ATTR_CTIME;
+	return notify_change(&init_user_ns, dentry, iap, NULL);
+}
+
+/**
+ * nfsd_setattr - Set various file attributes.
+ * @rqstp: controlling RPC transaction
+ * @fhp: filehandle of target
+ * @attr: attributes to set
+ * @check_guard: set to 1 if guardtime is a valid timestamp
+ * @guardtime: do not act if ctime.tv_sec does not match this timestamp
+ *
+ * This call may adjust the contents of @attr (in particular, this
+ * call may change the bits in the na_iattr.ia_valid field).
+ *
+ * Returns nfs_ok on success, otherwise an NFS status code is
+ * returned. Caller must release @fhp by calling fh_put in either
+ * case.
  */
 __be32
 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
@@ -357,7 +410,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	int		accmode = NFSD_MAY_SATTR;
 	umode_t		ftype = 0;
 	__be32		err;
-	int		host_err = 0;
+	int		host_err;
 	bool		get_write_count;
 	bool		size_change = (iap->ia_valid & ATTR_SIZE);
 
@@ -414,43 +467,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 
 	inode_lock(inode);
-	if (size_change) {
-		/*
-		 * RFC5661, Section 18.30.4:
-		 *   Changing the size of a file with SETATTR indirectly
-		 *   changes the time_modify and change attributes.
-		 *
-		 * (and similar for the older RFCs)
-		 */
-		struct iattr size_attr = {
-			.ia_valid	= ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
-			.ia_size	= iap->ia_size,
-		};
-
-		host_err = -EFBIG;
-		if (iap->ia_size < 0)
-			goto out_unlock;
-
-		host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
-		if (host_err)
-			goto out_unlock;
-		iap->ia_valid &= ~ATTR_SIZE;
-
-		/*
-		 * Avoid the additional setattr call below if the only other
-		 * attribute that the client sends is the mtime, as we update
-		 * it as part of the size change above.
-		 */
-		if ((iap->ia_valid & ~ATTR_MTIME) == 0)
-			goto out_unlock;
-	}
-
-	if (iap->ia_valid) {
-		iap->ia_valid |= ATTR_CTIME;
-		host_err = notify_change(&init_user_ns, dentry, iap, NULL);
-	}
-
-out_unlock:
+	host_err = __nfsd_setattr(dentry, iap);
 	if (attr->na_seclabel && attr->na_seclabel->len)
 		attr->na_labelerr = security_inode_setsecctx(dentry,
 			attr->na_seclabel->data, attr->na_seclabel->len);
-- 
cgit v1.2.3


From 34b91dda7124fc3259e4b2ae53e0c933dedfec01 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:14:13 -0400
Subject: NFSD: Make nfsd4_setattr() wait before returning NFS4ERR_DELAY

nfsd_setattr() can kick off a CB_RECALL (via
notify_change() -> break_lease()) if a delegation is present. Before
returning NFS4ERR_DELAY, give the client holding that delegation a
chance to return it and then retry the nfsd_setattr() again, once.

Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=354
Tested-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/vfs.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7c17b4a93168..d4db595d33ea 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -413,6 +413,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	int		host_err;
 	bool		get_write_count;
 	bool		size_change = (iap->ia_valid & ATTR_SIZE);
+	int		retries;
 
 	if (iap->ia_valid & ATTR_SIZE) {
 		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -467,7 +468,13 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 
 	inode_lock(inode);
-	host_err = __nfsd_setattr(dentry, iap);
+	for (retries = 1;;) {
+		host_err = __nfsd_setattr(dentry, iap);
+		if (host_err != -EAGAIN || !retries--)
+			break;
+		if (!nfsd_wait_for_delegreturn(rqstp, inode))
+			break;
+	}
 	if (attr->na_seclabel && attr->na_seclabel->len)
 		attr->na_labelerr = security_inode_setsecctx(dentry,
 			attr->na_seclabel->data, attr->na_seclabel->len);
-- 
cgit v1.2.3


From 68c522afd0b1936b48a03a4c8b81261e7597c62d Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:14:19 -0400
Subject: NFSD: Make nfsd4_rename() wait before returning NFS4ERR_DELAY

nfsd_rename() can kick off a CB_RECALL (via
vfs_rename() -> leases_conflict()) if a delegation is present.
Before returning NFS4ERR_DELAY, give the client holding that
delegation a chance to return it and then retry the nfsd_rename()
again, once.

This version of the patch handles renaming an existing file,
but does not deal with renaming onto an existing file. That
case will still always trigger an NFS4ERR_DELAY.

Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=354
Tested-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/vfs.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d4db595d33ea..7d86ed235359 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1696,7 +1696,15 @@ retry:
 			.new_dir	= tdir,
 			.new_dentry	= ndentry,
 		};
-		host_err = vfs_rename(&rd);
+		int retries;
+
+		for (retries = 1;;) {
+			host_err = vfs_rename(&rd);
+			if (host_err != -EAGAIN || !retries--)
+				break;
+			if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry)))
+				break;
+		}
 		if (!host_err) {
 			host_err = commit_metadata(tfhp);
 			if (!host_err)
-- 
cgit v1.2.3


From 5f5f8b6d655fd947e899b1771c2f7cb581a06764 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 8 Sep 2022 18:14:25 -0400
Subject: NFSD: Make nfsd4_remove() wait before returning NFS4ERR_DELAY

nfsd_unlink() can kick off a CB_RECALL (via
vfs_unlink() -> leases_conflict()) if a delegation is present.
Before returning NFS4ERR_DELAY, give the client holding that
delegation a chance to return it and then retry the nfsd_unlink()
again, once.

Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=354
Tested-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/vfs.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7d86ed235359..83be89905cbf 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1788,9 +1788,18 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 
 	fh_fill_pre_attrs(fhp);
 	if (type != S_IFDIR) {
+		int retries;
+
 		if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
 			nfsd_close_cached_files(rdentry);
-		host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+
+		for (retries = 1;;) {
+			host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL);
+			if (host_err != -EAGAIN || !retries--)
+				break;
+			if (!nfsd_wait_for_delegreturn(rqstp, rinode))
+				break;
+		}
 	} else {
 		host_err = vfs_rmdir(&init_user_ns, dirp, rdentry);
 	}
-- 
cgit v1.2.3


From 06981d560606ac48d61e5f4fff6738b925c93173 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@Netapp.com>
Date: Tue, 13 Sep 2022 14:01:50 -0400
Subject: NFSD: Return nfserr_serverfault if splice_ok but buf->pages have data

This was discussed with Chuck as part of this patch set. Returning
nfserr_resource was decided to not be the best error message here, and
he suggested changing to nfserr_serverfault instead.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Link: https://lore.kernel.org/linux-nfs/20220907195259.926736-1-anna@kernel.org/T/#t
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 30d4897e62ab..0220e1fa214b 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3991,7 +3991,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
 	}
 	if (resp->xdr->buf->page_len && splice_ok) {
 		WARN_ON_ONCE(1);
-		return nfserr_resource;
+		return nfserr_serverfault;
 	}
 	xdr_commit_encode(xdr);
 
-- 
cgit v1.2.3


From 3a4ea23d86a317c4b68b9a69d51f7e84e1e04357 Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Wed, 14 Sep 2022 08:54:25 -0700
Subject: NFSD: keep track of the number of courtesy clients in the system

Add counter nfs4_courtesy_client_count to nfsd_net to keep track
of the number of courtesy clients in the system.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/netns.h     |  2 ++
 fs/nfsd/nfs4state.c | 17 ++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ffe17743cc74..55c7006d6109 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -192,6 +192,8 @@ struct nfsd_net {
 
 	atomic_t		nfs4_client_count;
 	int			nfs4_max_clients;
+
+	atomic_t		nfsd_courtesy_clients;
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 54bc70427ce3..a248ecfe89c8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -160,6 +160,13 @@ static bool is_client_expired(struct nfs4_client *clp)
 	return clp->cl_time == 0;
 }
 
+static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn,
+					struct nfs4_client *clp)
+{
+	if (clp->cl_state != NFSD4_ACTIVE)
+		atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0);
+}
+
 static __be32 get_client_locked(struct nfs4_client *clp)
 {
 	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -169,6 +176,7 @@ static __be32 get_client_locked(struct nfs4_client *clp)
 	if (is_client_expired(clp))
 		return nfserr_expired;
 	atomic_inc(&clp->cl_rpc_users);
+	nfsd4_dec_courtesy_client_count(nn, clp);
 	clp->cl_state = NFSD4_ACTIVE;
 	return nfs_ok;
 }
@@ -190,6 +198,7 @@ renew_client_locked(struct nfs4_client *clp)
 
 	list_move_tail(&clp->cl_lru, &nn->client_lru);
 	clp->cl_time = ktime_get_boottime_seconds();
+	nfsd4_dec_courtesy_client_count(nn, clp);
 	clp->cl_state = NFSD4_ACTIVE;
 }
 
@@ -2235,6 +2244,7 @@ __destroy_client(struct nfs4_client *clp)
 	if (clp->cl_cb_conn.cb_xprt)
 		svc_xprt_put(clp->cl_cb_conn.cb_xprt);
 	atomic_add_unless(&nn->nfs4_client_count, -1, 0);
+	nfsd4_dec_courtesy_client_count(nn, clp);
 	free_client(clp);
 	wake_up_all(&expiry_wq);
 }
@@ -4358,6 +4368,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn)
 	max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024);
 	max_clients *= NFS4_CLIENTS_PER_GB;
 	nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
+
+	atomic_set(&nn->nfsd_courtesy_clients, 0);
 }
 
 static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -5911,8 +5923,11 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
 			goto exp_client;
 		if (!state_expired(lt, clp->cl_time))
 			break;
-		if (!atomic_read(&clp->cl_rpc_users))
+		if (!atomic_read(&clp->cl_rpc_users)) {
+			if (clp->cl_state == NFSD4_ACTIVE)
+				atomic_inc(&nn->nfsd_courtesy_clients);
 			clp->cl_state = NFSD4_COURTESY;
+		}
 		if (!client_has_state(clp))
 			goto exp_client;
 		if (!nfs4_anylock_blockers(clp))
-- 
cgit v1.2.3


From 7746b32f467b3813fb61faaab3258de35806a7ac Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Wed, 14 Sep 2022 08:54:26 -0700
Subject: NFSD: add shrinker to reap courtesy clients on low memory condition

Add courtesy_client_reaper to react to low memory condition triggered
by the system memory shrinker.

The delayed_work for the courtesy_client_reaper is scheduled on
the shrinker's count callback using the laundry_wq.

The shrinker's scan callback is not used for expiring the courtesy
clients due to potential deadlocks.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/netns.h     |  2 ++
 fs/nfsd/nfs4state.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/nfsd/nfsctl.c    |  6 ++--
 fs/nfsd/nfsd.h      |  6 ++--
 4 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 55c7006d6109..8c854ba3285b 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -194,6 +194,8 @@ struct nfsd_net {
 	int			nfs4_max_clients;
 
 	atomic_t		nfsd_courtesy_clients;
+	struct shrinker		nfsd_client_shrinker;
+	struct delayed_work	nfsd_shrinker_work;
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a248ecfe89c8..56b50210da38 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4349,7 +4349,27 @@ out:
 	return -ENOMEM;
 }
 
-void nfsd4_init_leases_net(struct nfsd_net *nn)
+static unsigned long
+nfsd_courtesy_client_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	int cnt;
+	struct nfsd_net *nn = container_of(shrink,
+			struct nfsd_net, nfsd_client_shrinker);
+
+	cnt = atomic_read(&nn->nfsd_courtesy_clients);
+	if (cnt > 0)
+		mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
+	return (unsigned long)cnt;
+}
+
+static unsigned long
+nfsd_courtesy_client_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return SHRINK_STOP;
+}
+
+int
+nfsd4_init_leases_net(struct nfsd_net *nn)
 {
 	struct sysinfo si;
 	u64 max_clients;
@@ -4370,6 +4390,16 @@ void nfsd4_init_leases_net(struct nfsd_net *nn)
 	nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
 
 	atomic_set(&nn->nfsd_courtesy_clients, 0);
+	nn->nfsd_client_shrinker.scan_objects = nfsd_courtesy_client_scan;
+	nn->nfsd_client_shrinker.count_objects = nfsd_courtesy_client_count;
+	nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+	return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
+}
+
+void
+nfsd4_leases_net_shutdown(struct nfsd_net *nn)
+{
+	unregister_shrinker(&nn->nfsd_client_shrinker);
 }
 
 static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -5942,10 +5972,49 @@ exp_client:
 	spin_unlock(&nn->client_lock);
 }
 
+static void
+nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn,
+				struct list_head *reaplist)
+{
+	unsigned int maxreap = 0, reapcnt = 0;
+	struct list_head *pos, *next;
+	struct nfs4_client *clp;
+
+	maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
+	INIT_LIST_HEAD(reaplist);
+
+	spin_lock(&nn->client_lock);
+	list_for_each_safe(pos, next, &nn->client_lru) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
+		if (clp->cl_state == NFSD4_ACTIVE)
+			break;
+		if (reapcnt >= maxreap)
+			break;
+		if (!mark_client_expired_locked(clp)) {
+			list_add(&clp->cl_lru, reaplist);
+			reapcnt++;
+		}
+	}
+	spin_unlock(&nn->client_lock);
+}
+
+static void
+nfs4_process_client_reaplist(struct list_head *reaplist)
+{
+	struct list_head *pos, *next;
+	struct nfs4_client *clp;
+
+	list_for_each_safe(pos, next, reaplist) {
+		clp = list_entry(pos, struct nfs4_client, cl_lru);
+		trace_nfsd_clid_purged(&clp->cl_clientid);
+		list_del_init(&clp->cl_lru);
+		expire_client(clp);
+	}
+}
+
 static time64_t
 nfs4_laundromat(struct nfsd_net *nn)
 {
-	struct nfs4_client *clp;
 	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct nfs4_ol_stateid *stp;
@@ -5974,12 +6043,8 @@ nfs4_laundromat(struct nfsd_net *nn)
 	}
 	spin_unlock(&nn->s2s_cp_lock);
 	nfs4_get_client_reaplist(nn, &reaplist, &lt);
-	list_for_each_safe(pos, next, &reaplist) {
-		clp = list_entry(pos, struct nfs4_client, cl_lru);
-		trace_nfsd_clid_purged(&clp->cl_clientid);
-		list_del_init(&clp->cl_lru);
-		expire_client(clp);
-	}
+	nfs4_process_client_reaplist(&reaplist);
+
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
@@ -6062,6 +6127,18 @@ laundromat_main(struct work_struct *laundry)
 	queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
 }
 
+static void
+courtesy_client_reaper(struct work_struct *reaper)
+{
+	struct list_head reaplist;
+	struct delayed_work *dwork = to_delayed_work(reaper);
+	struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+					nfsd_shrinker_work);
+
+	nfs4_get_courtesy_client_reaplist(nn, &reaplist);
+	nfs4_process_client_reaplist(&reaplist);
+}
+
 static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
 {
 	if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
@@ -7879,6 +7956,7 @@ static int nfs4_state_create_net(struct net *net)
 	INIT_LIST_HEAD(&nn->blocked_locks_lru);
 
 	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+	INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, courtesy_client_reaper);
 	get_net(net);
 
 	return 0;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 917fa1892fd2..597a26ad4183 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1481,11 +1481,12 @@ static __net_init int nfsd_init_net(struct net *net)
 		goto out_idmap_error;
 	nn->nfsd_versions = NULL;
 	nn->nfsd4_minorversions = NULL;
+	retval = nfsd4_init_leases_net(nn);
+	if (retval)
+		goto out_drc_error;
 	retval = nfsd_reply_cache_init(nn);
 	if (retval)
 		goto out_drc_error;
-	nfsd4_init_leases_net(nn);
-
 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
 	seqlock_init(&nn->writeverf_lock);
 
@@ -1507,6 +1508,7 @@ static __net_exit void nfsd_exit_net(struct net *net)
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
 	nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
+	nfsd4_leases_net_shutdown(nn);
 }
 
 static struct pernet_operations nfsd_net_ops = {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 6ab4ad41ae84..09726c5b9a31 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -505,7 +505,8 @@ extern void unregister_cld_notifier(void);
 extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
 #endif
 
-extern void nfsd4_init_leases_net(struct nfsd_net *nn);
+extern int nfsd4_init_leases_net(struct nfsd_net *nn);
+extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn);
 
 #else /* CONFIG_NFSD_V4 */
 static inline int nfsd4_is_junction(struct dentry *dentry)
@@ -513,7 +514,8 @@ static inline int nfsd4_is_junction(struct dentry *dentry)
 	return 0;
 }
 
-static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {};
+static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
+static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {};
 
 #define register_cld_notifier() 0
 #define unregister_cld_notifier() do { } while(0)
-- 
cgit v1.2.3


From 81593c4dff32266102cd95cdc993576a6afe5650 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:22:33 -0400
Subject: SUNRPC: Optimize svc_process()

Move exception handling code out of the hot path, and avoid the need
for a bswap of a non-constant.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svc.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 7c9a0d0b1230..4268145490a4 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1434,8 +1434,7 @@ svc_process(struct svc_rqst *rqstp)
 {
 	struct kvec		*argv = &rqstp->rq_arg.head[0];
 	struct kvec		*resv = &rqstp->rq_res.head[0];
-	struct svc_serv		*serv = rqstp->rq_server;
-	u32			dir;
+	__be32			dir;
 
 #if IS_ENABLED(CONFIG_FAIL_SUNRPC)
 	if (!fail_sunrpc.ignore_server_disconnect &&
@@ -1450,7 +1449,7 @@ svc_process(struct svc_rqst *rqstp)
 	rqstp->rq_next_page = &rqstp->rq_respages[1];
 	resv->iov_base = page_address(rqstp->rq_respages[0]);
 	resv->iov_len = 0;
-	rqstp->rq_res.pages = rqstp->rq_respages + 1;
+	rqstp->rq_res.pages = rqstp->rq_next_page;
 	rqstp->rq_res.len = 0;
 	rqstp->rq_res.page_base = 0;
 	rqstp->rq_res.page_len = 0;
@@ -1458,18 +1457,17 @@ svc_process(struct svc_rqst *rqstp)
 	rqstp->rq_res.tail[0].iov_base = NULL;
 	rqstp->rq_res.tail[0].iov_len = 0;
 
-	dir  = svc_getnl(argv);
-	if (dir != 0) {
-		/* direction != CALL */
-		svc_printk(rqstp, "bad direction %d, dropping request\n", dir);
-		serv->sv_stats->rpcbadfmt++;
+	dir = svc_getu32(argv);
+	if (dir != rpc_call)
+		goto out_baddir;
+	if (!svc_process_common(rqstp, argv, resv))
 		goto out_drop;
-	}
-
-	/* Returns 1 for send, 0 for drop */
-	if (likely(svc_process_common(rqstp, argv, resv)))
-		return svc_send(rqstp);
+	return svc_send(rqstp);
 
+out_baddir:
+	svc_printk(rqstp, "bad direction 0x%08x, dropping request\n",
+		   be32_to_cpu(dir));
+	rqstp->rq_server->sv_stats->rpcbadfmt++;
 out_drop:
 	svc_drop(rqstp);
 	return 0;
-- 
cgit v1.2.3


From 103cc1fafee48adb91fca0e19deb869fd23e46ab Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:22:38 -0400
Subject: SUNRPC: Parametrize how much of argsize should be zeroed

Currently, SUNRPC clears the whole of .pc_argsize before processing
each incoming RPC transaction. Add an extra parameter to struct
svc_procedure to enable upper layers to reduce the amount of each
operation's argument structure that is zeroed by SUNRPC.

The size of struct nfsd4_compoundargs, in particular, is a lot to
clear on each incoming RPC Call. A subsequent patch will cut this
down to something closer to what NFSv2 and NFSv3 uses.

This patch should cause no behavior changes.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/lockd/svc4proc.c        | 24 ++++++++++++++++++++++++
 fs/lockd/svcproc.c         | 24 ++++++++++++++++++++++++
 fs/nfs/callback_xdr.c      |  1 +
 fs/nfsd/nfs2acl.c          |  5 +++++
 fs/nfsd/nfs3acl.c          |  3 +++
 fs/nfsd/nfs3proc.c         | 22 ++++++++++++++++++++++
 fs/nfsd/nfs4proc.c         |  2 ++
 fs/nfsd/nfsproc.c          | 18 ++++++++++++++++++
 include/linux/sunrpc/svc.h |  1 +
 net/sunrpc/svc.c           |  2 +-
 10 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bf274f23969b..284b019cb652 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -521,6 +521,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "NULL",
@@ -530,6 +531,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_testargs,
 		.pc_encode = nlm4svc_encode_testres,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+2+No+Rg,
 		.pc_name = "TEST",
@@ -539,6 +541,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_lockargs,
 		.pc_encode = nlm4svc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "LOCK",
@@ -548,6 +551,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_cancargs,
 		.pc_encode = nlm4svc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "CANCEL",
@@ -557,6 +561,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_unlockargs,
 		.pc_encode = nlm4svc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "UNLOCK",
@@ -566,6 +571,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_testargs,
 		.pc_encode = nlm4svc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "GRANTED",
@@ -575,6 +581,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_testargs,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "TEST_MSG",
@@ -584,6 +591,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_lockargs,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "LOCK_MSG",
@@ -593,6 +601,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_cancargs,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "CANCEL_MSG",
@@ -602,6 +611,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_unlockargs,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNLOCK_MSG",
@@ -611,6 +621,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_testargs,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "GRANTED_MSG",
@@ -620,6 +631,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "TEST_RES",
@@ -629,6 +641,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "LOCK_RES",
@@ -638,6 +651,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "CANCEL_RES",
@@ -647,6 +661,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNLOCK_RES",
@@ -656,6 +671,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_res,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "GRANTED_RES",
@@ -665,6 +681,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_reboot,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_reboot),
+		.pc_argzero = sizeof(struct nlm_reboot),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "SM_NOTIFY",
@@ -674,6 +691,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
 		.pc_name = "UNUSED",
@@ -683,6 +701,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
 		.pc_name = "UNUSED",
@@ -692,6 +711,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_void,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
 		.pc_name = "UNUSED",
@@ -701,6 +721,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_shareargs,
 		.pc_encode = nlm4svc_encode_shareres,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
 		.pc_name = "SHARE",
@@ -710,6 +731,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_shareargs,
 		.pc_encode = nlm4svc_encode_shareres,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
 		.pc_name = "UNSHARE",
@@ -719,6 +741,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_lockargs,
 		.pc_encode = nlm4svc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "NM_LOCK",
@@ -728,6 +751,7 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
 		.pc_decode = nlm4svc_decode_notify,
 		.pc_encode = nlm4svc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "FREE_ALL",
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index b09ca35b527c..e35c05e27806 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -555,6 +555,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "NULL",
@@ -564,6 +565,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_testargs,
 		.pc_encode = nlmsvc_encode_testres,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+2+No+Rg,
 		.pc_name = "TEST",
@@ -573,6 +575,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_lockargs,
 		.pc_encode = nlmsvc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "LOCK",
@@ -582,6 +585,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_cancargs,
 		.pc_encode = nlmsvc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "CANCEL",
@@ -591,6 +595,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_unlockargs,
 		.pc_encode = nlmsvc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "UNLOCK",
@@ -600,6 +605,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_testargs,
 		.pc_encode = nlmsvc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "GRANTED",
@@ -609,6 +615,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_testargs,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "TEST_MSG",
@@ -618,6 +625,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_lockargs,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "LOCK_MSG",
@@ -627,6 +635,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_cancargs,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "CANCEL_MSG",
@@ -636,6 +645,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_unlockargs,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNLOCK_MSG",
@@ -645,6 +655,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_testargs,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "GRANTED_MSG",
@@ -654,6 +665,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "TEST_RES",
@@ -663,6 +675,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "LOCK_RES",
@@ -672,6 +685,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "CANCEL_RES",
@@ -681,6 +695,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNLOCK_RES",
@@ -690,6 +705,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_res,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_res),
+		.pc_argzero = sizeof(struct nlm_res),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "GRANTED_RES",
@@ -699,6 +715,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_reboot,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_reboot),
+		.pc_argzero = sizeof(struct nlm_reboot),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "SM_NOTIFY",
@@ -708,6 +725,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNUSED",
@@ -717,6 +735,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNUSED",
@@ -726,6 +745,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_void,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_void),
+		.pc_argzero = sizeof(struct nlm_void),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = St,
 		.pc_name = "UNUSED",
@@ -735,6 +755,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_shareargs,
 		.pc_encode = nlmsvc_encode_shareres,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
 		.pc_name = "SHARE",
@@ -744,6 +765,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_shareargs,
 		.pc_encode = nlmsvc_encode_shareres,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St+1,
 		.pc_name = "UNSHARE",
@@ -753,6 +775,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_lockargs,
 		.pc_encode = nlmsvc_encode_res,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_res),
 		.pc_xdrressize = Ck+St,
 		.pc_name = "NM_LOCK",
@@ -762,6 +785,7 @@ const struct svc_procedure nlmsvc_procedures[24] = {
 		.pc_decode = nlmsvc_decode_notify,
 		.pc_encode = nlmsvc_encode_void,
 		.pc_argsize = sizeof(struct nlm_args),
+		.pc_argzero = sizeof(struct nlm_args),
 		.pc_ressize = sizeof(struct nlm_void),
 		.pc_xdrressize = 0,
 		.pc_name = "FREE_ALL",
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 8dcb08e1a885..d0cccddb7d08 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -1065,6 +1065,7 @@ static const struct svc_procedure nfs4_callback_procedures1[] = {
 		.pc_func = nfs4_callback_compound,
 		.pc_encode = nfs4_encode_void,
 		.pc_argsize = 256,
+		.pc_argzero = 256,
 		.pc_ressize = 256,
 		.pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
 		.pc_name = "COMPOUND",
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 9edd3c1a30fb..13e6e6897f6c 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -331,6 +331,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST,
@@ -342,6 +343,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_encode = nfsaclsvc_encode_getaclres,
 		.pc_release = nfsaclsvc_release_getacl,
 		.pc_argsize = sizeof(struct nfsd3_getaclargs),
+		.pc_argzero = sizeof(struct nfsd3_getaclargs),
 		.pc_ressize = sizeof(struct nfsd3_getaclres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+1+2*(1+ACL),
@@ -353,6 +355,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_encode = nfssvc_encode_attrstatres,
 		.pc_release = nfssvc_release_attrstat,
 		.pc_argsize = sizeof(struct nfsd3_setaclargs),
+		.pc_argzero = sizeof(struct nfsd3_setaclargs),
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
@@ -364,6 +367,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_encode = nfssvc_encode_attrstatres,
 		.pc_release = nfssvc_release_attrstat,
 		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_argzero = sizeof(struct nfsd_fhandle),
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
@@ -375,6 +379,7 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
 		.pc_encode = nfsaclsvc_encode_accessres,
 		.pc_release = nfsaclsvc_release_access,
 		.pc_argsize = sizeof(struct nfsd3_accessargs),
+		.pc_argzero = sizeof(struct nfsd3_accessargs),
 		.pc_ressize = sizeof(struct nfsd3_accessres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT+1,
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9446c6743664..2fb9ee356455 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -252,6 +252,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST,
@@ -263,6 +264,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
 		.pc_encode = nfs3svc_encode_getaclres,
 		.pc_release = nfs3svc_release_getacl,
 		.pc_argsize = sizeof(struct nfsd3_getaclargs),
+		.pc_argzero = sizeof(struct nfsd3_getaclargs),
 		.pc_ressize = sizeof(struct nfsd3_getaclres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+1+2*(1+ACL),
@@ -274,6 +276,7 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
 		.pc_encode = nfs3svc_encode_setaclres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_setaclargs),
+		.pc_argzero = sizeof(struct nfsd3_setaclargs),
 		.pc_ressize = sizeof(struct nfsd3_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT,
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 5b1e771238b3..58695e4e18b4 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -809,6 +809,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST,
@@ -820,6 +821,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_getattrres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_argzero = sizeof(struct nfsd_fhandle),
 		.pc_ressize = sizeof(struct nfsd3_attrstatres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
@@ -831,6 +833,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_wccstatres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_sattrargs),
+		.pc_argzero = sizeof(struct nfsd3_sattrargs),
 		.pc_ressize = sizeof(struct nfsd3_wccstatres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC,
@@ -842,6 +845,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_lookupres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_diropargs),
+		.pc_argzero = sizeof(struct nfsd3_diropargs),
 		.pc_ressize = sizeof(struct nfsd3_diropres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+FH+pAT+pAT,
@@ -853,6 +857,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_accessres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_accessargs),
+		.pc_argzero = sizeof(struct nfsd3_accessargs),
 		.pc_ressize = sizeof(struct nfsd3_accessres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+1,
@@ -864,6 +869,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_readlinkres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_argzero = sizeof(struct nfsd_fhandle),
 		.pc_ressize = sizeof(struct nfsd3_readlinkres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
@@ -875,6 +881,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_readres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_readargs),
+		.pc_argzero = sizeof(struct nfsd3_readargs),
 		.pc_ressize = sizeof(struct nfsd3_readres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
@@ -886,6 +893,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_writeres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_writeargs),
+		.pc_argzero = sizeof(struct nfsd3_writeargs),
 		.pc_ressize = sizeof(struct nfsd3_writeres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC+4,
@@ -897,6 +905,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_createres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_createargs),
+		.pc_argzero = sizeof(struct nfsd3_createargs),
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -908,6 +917,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_createres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_mkdirargs),
+		.pc_argzero = sizeof(struct nfsd3_mkdirargs),
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -919,6 +929,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_createres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_symlinkargs),
+		.pc_argzero = sizeof(struct nfsd3_symlinkargs),
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -930,6 +941,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_createres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_mknodargs),
+		.pc_argzero = sizeof(struct nfsd3_mknodargs),
 		.pc_ressize = sizeof(struct nfsd3_createres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+(1+FH+pAT)+WC,
@@ -941,6 +953,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_wccstatres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_diropargs),
+		.pc_argzero = sizeof(struct nfsd3_diropargs),
 		.pc_ressize = sizeof(struct nfsd3_wccstatres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC,
@@ -952,6 +965,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_wccstatres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_diropargs),
+		.pc_argzero = sizeof(struct nfsd3_diropargs),
 		.pc_ressize = sizeof(struct nfsd3_wccstatres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC,
@@ -963,6 +977,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_renameres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_renameargs),
+		.pc_argzero = sizeof(struct nfsd3_renameargs),
 		.pc_ressize = sizeof(struct nfsd3_renameres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+WC+WC,
@@ -974,6 +989,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_linkres,
 		.pc_release = nfs3svc_release_fhandle2,
 		.pc_argsize = sizeof(struct nfsd3_linkargs),
+		.pc_argzero = sizeof(struct nfsd3_linkargs),
 		.pc_ressize = sizeof(struct nfsd3_linkres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+pAT+WC,
@@ -985,6 +1001,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_readdirres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_readdirargs),
+		.pc_argzero = sizeof(struct nfsd3_readdirargs),
 		.pc_ressize = sizeof(struct nfsd3_readdirres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_name = "READDIR",
@@ -995,6 +1012,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_readdirres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_readdirplusargs),
+		.pc_argzero = sizeof(struct nfsd3_readdirplusargs),
 		.pc_ressize = sizeof(struct nfsd3_readdirres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_name = "READDIRPLUS",
@@ -1004,6 +1022,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_decode = nfs3svc_decode_fhandleargs,
 		.pc_encode = nfs3svc_encode_fsstatres,
 		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_argzero = sizeof(struct nfsd3_fhandleargs),
 		.pc_ressize = sizeof(struct nfsd3_fsstatres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+2*6+1,
@@ -1014,6 +1033,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_decode = nfs3svc_decode_fhandleargs,
 		.pc_encode = nfs3svc_encode_fsinfores,
 		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_argzero = sizeof(struct nfsd3_fhandleargs),
 		.pc_ressize = sizeof(struct nfsd3_fsinfores),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+12,
@@ -1024,6 +1044,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_decode = nfs3svc_decode_fhandleargs,
 		.pc_encode = nfs3svc_encode_pathconfres,
 		.pc_argsize = sizeof(struct nfsd3_fhandleargs),
+		.pc_argzero = sizeof(struct nfsd3_fhandleargs),
 		.pc_ressize = sizeof(struct nfsd3_pathconfres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+pAT+6,
@@ -1035,6 +1056,7 @@ static const struct svc_procedure nfsd_procedures3[22] = {
 		.pc_encode = nfs3svc_encode_commitres,
 		.pc_release = nfs3svc_release_fhandle,
 		.pc_argsize = sizeof(struct nfsd3_commitargs),
+		.pc_argzero = sizeof(struct nfsd3_commitargs),
 		.pc_ressize = sizeof(struct nfsd3_commitres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+WC+2,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 1918c9ec3478..66a99827c7aa 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3585,6 +3585,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 1,
@@ -3595,6 +3596,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
 		.pc_decode = nfs4svc_decode_compoundargs,
 		.pc_encode = nfs4svc_encode_compoundres,
 		.pc_argsize = sizeof(struct nfsd4_compoundargs),
+		.pc_argzero = sizeof(struct nfsd4_compoundargs),
 		.pc_ressize = sizeof(struct nfsd4_compoundres),
 		.pc_release = nfsd4_release_compoundargs,
 		.pc_cachetype = RC_NOCACHE,
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index ee02ede74bf5..49778ff410e3 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -645,6 +645,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 0,
@@ -656,6 +657,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_attrstatres,
 		.pc_release = nfssvc_release_attrstat,
 		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_argzero = sizeof(struct nfsd_fhandle),
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT,
@@ -667,6 +669,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_attrstatres,
 		.pc_release = nfssvc_release_attrstat,
 		.pc_argsize = sizeof(struct nfsd_sattrargs),
+		.pc_argzero = sizeof(struct nfsd_sattrargs),
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+AT,
@@ -677,6 +680,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 0,
@@ -688,6 +692,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_diropres,
 		.pc_release = nfssvc_release_diropres,
 		.pc_argsize = sizeof(struct nfsd_diropargs),
+		.pc_argzero = sizeof(struct nfsd_diropargs),
 		.pc_ressize = sizeof(struct nfsd_diropres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+FH+AT,
@@ -698,6 +703,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_fhandleargs,
 		.pc_encode = nfssvc_encode_readlinkres,
 		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_argzero = sizeof(struct nfsd_fhandle),
 		.pc_ressize = sizeof(struct nfsd_readlinkres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
@@ -709,6 +715,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_readres,
 		.pc_release = nfssvc_release_readres,
 		.pc_argsize = sizeof(struct nfsd_readargs),
+		.pc_argzero = sizeof(struct nfsd_readargs),
 		.pc_ressize = sizeof(struct nfsd_readres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
@@ -719,6 +726,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_voidarg,
 		.pc_encode = nfssvc_encode_voidres,
 		.pc_argsize = sizeof(struct nfsd_voidargs),
+		.pc_argzero = sizeof(struct nfsd_voidargs),
 		.pc_ressize = sizeof(struct nfsd_voidres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = 0,
@@ -730,6 +738,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_attrstatres,
 		.pc_release = nfssvc_release_attrstat,
 		.pc_argsize = sizeof(struct nfsd_writeargs),
+		.pc_argzero = sizeof(struct nfsd_writeargs),
 		.pc_ressize = sizeof(struct nfsd_attrstat),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+AT,
@@ -741,6 +750,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_diropres,
 		.pc_release = nfssvc_release_diropres,
 		.pc_argsize = sizeof(struct nfsd_createargs),
+		.pc_argzero = sizeof(struct nfsd_createargs),
 		.pc_ressize = sizeof(struct nfsd_diropres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+FH+AT,
@@ -751,6 +761,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_diropargs,
 		.pc_encode = nfssvc_encode_statres,
 		.pc_argsize = sizeof(struct nfsd_diropargs),
+		.pc_argzero = sizeof(struct nfsd_diropargs),
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
@@ -761,6 +772,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_renameargs,
 		.pc_encode = nfssvc_encode_statres,
 		.pc_argsize = sizeof(struct nfsd_renameargs),
+		.pc_argzero = sizeof(struct nfsd_renameargs),
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
@@ -771,6 +783,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_linkargs,
 		.pc_encode = nfssvc_encode_statres,
 		.pc_argsize = sizeof(struct nfsd_linkargs),
+		.pc_argzero = sizeof(struct nfsd_linkargs),
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
@@ -781,6 +794,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_symlinkargs,
 		.pc_encode = nfssvc_encode_statres,
 		.pc_argsize = sizeof(struct nfsd_symlinkargs),
+		.pc_argzero = sizeof(struct nfsd_symlinkargs),
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
@@ -792,6 +806,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_encode = nfssvc_encode_diropres,
 		.pc_release = nfssvc_release_diropres,
 		.pc_argsize = sizeof(struct nfsd_createargs),
+		.pc_argzero = sizeof(struct nfsd_createargs),
 		.pc_ressize = sizeof(struct nfsd_diropres),
 		.pc_cachetype = RC_REPLBUFF,
 		.pc_xdrressize = ST+FH+AT,
@@ -802,6 +817,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_diropargs,
 		.pc_encode = nfssvc_encode_statres,
 		.pc_argsize = sizeof(struct nfsd_diropargs),
+		.pc_argzero = sizeof(struct nfsd_diropargs),
 		.pc_ressize = sizeof(struct nfsd_stat),
 		.pc_cachetype = RC_REPLSTAT,
 		.pc_xdrressize = ST,
@@ -812,6 +828,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_readdirargs,
 		.pc_encode = nfssvc_encode_readdirres,
 		.pc_argsize = sizeof(struct nfsd_readdirargs),
+		.pc_argzero = sizeof(struct nfsd_readdirargs),
 		.pc_ressize = sizeof(struct nfsd_readdirres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_name = "READDIR",
@@ -821,6 +838,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
 		.pc_decode = nfssvc_decode_fhandleargs,
 		.pc_encode = nfssvc_encode_statfsres,
 		.pc_argsize = sizeof(struct nfsd_fhandle),
+		.pc_argzero = sizeof(struct nfsd_fhandle),
 		.pc_ressize = sizeof(struct nfsd_statfsres),
 		.pc_cachetype = RC_NOCACHE,
 		.pc_xdrressize = ST+5,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 0ca8a8ffb47e..88de45491376 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -472,6 +472,7 @@ struct svc_procedure {
 	/* XDR free result: */
 	void			(*pc_release)(struct svc_rqst *);
 	unsigned int		pc_argsize;	/* argument struct size */
+	unsigned int		pc_argzero;	/* how much of argument to clear */
 	unsigned int		pc_ressize;	/* result struct size */
 	unsigned int		pc_cachetype;	/* cache info (NFS) */
 	unsigned int		pc_xdrressize;	/* maximum size of XDR reply */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4268145490a4..32a537f852fe 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1205,7 +1205,7 @@ svc_generic_init_request(struct svc_rqst *rqstp,
 		goto err_bad_proc;
 
 	/* Initialize storage for argp and resp */
-	memset(rqstp->rq_argp, 0, procp->pc_argsize);
+	memset(rqstp->rq_argp, 0, procp->pc_argzero);
 	memset(rqstp->rq_resp, 0, procp->pc_ressize);
 
 	/* Bump per-procedure stats counter */
-- 
cgit v1.2.3


From 3fdc546462348b8a497c72bc894e0cde9f10fc40 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:22:44 -0400
Subject: NFSD: Reduce amount of struct nfsd4_compoundargs that needs clearing

Have SunRPC clear everything except for the iops array. Then have
each NFSv4 XDR decoder clear it's own argument before decoding.

Now individual operations may have a large argument struct while not
penalizing the vast majority of operations with a small struct.

And, clearing the argument structure occurs as the argument fields
are initialized, enabling the CPU to do write combining on that
memory. In some cases, clearing is not even necessary because all
of the fields in the argument structure are initialized by the
decoder.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c |  2 +-
 fs/nfsd/nfs4xdr.c  | 61 ++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 66a99827c7aa..bb22f53c7ba9 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -3596,7 +3596,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
 		.pc_decode = nfs4svc_decode_compoundargs,
 		.pc_encode = nfs4svc_encode_compoundres,
 		.pc_argsize = sizeof(struct nfsd4_compoundargs),
-		.pc_argzero = sizeof(struct nfsd4_compoundargs),
+		.pc_argzero = offsetof(struct nfsd4_compoundargs, iops),
 		.pc_ressize = sizeof(struct nfsd4_compoundres),
 		.pc_release = nfsd4_release_compoundargs,
 		.pc_cachetype = RC_NOCACHE,
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0220e1fa214b..05dddecbaed6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -793,6 +793,7 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
 		return nfserr_bad_xdr;
 	if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0)
 		return nfserr_bad_xdr;
+	memset(&commit->co_verf, 0, sizeof(commit->co_verf));
 	return nfs_ok;
 }
 
@@ -801,6 +802,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
 {
 	__be32 *p, status;
 
+	memset(create, 0, sizeof(*create));
 	if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0)
 		return nfserr_bad_xdr;
 	switch (create->cr_type) {
@@ -850,6 +852,7 @@ nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegretu
 static inline __be32
 nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
 {
+	memset(getattr, 0, sizeof(*getattr));
 	return nfsd4_decode_bitmap4(argp, getattr->ga_bmval,
 				    ARRAY_SIZE(getattr->ga_bmval));
 }
@@ -857,6 +860,7 @@ nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *geta
 static __be32
 nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
 {
+	memset(link, 0, sizeof(*link));
 	return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen);
 }
 
@@ -905,6 +909,7 @@ nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 static __be32
 nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 {
+	memset(lock, 0, sizeof(*lock));
 	if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0)
 		return nfserr_bad_xdr;
 	if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
@@ -921,6 +926,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
 static __be32
 nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
 {
+	memset(lockt, 0, sizeof(*lockt));
 	if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0)
 		return nfserr_bad_xdr;
 	if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
@@ -1142,11 +1148,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
 	__be32 status;
 	u32 dummy;
 
-	memset(open->op_bmval, 0, sizeof(open->op_bmval));
-	open->op_iattr.ia_valid = 0;
-	open->op_openowner = NULL;
+	memset(open, 0, sizeof(*open));
 
-	open->op_xdr_error = 0;
 	if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0)
 		return nfserr_bad_xdr;
 	/* deleg_want is ignored */
@@ -1181,6 +1184,8 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
 	if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0)
 		return nfserr_bad_xdr;
 
+	memset(&open_conf->oc_resp_stateid, 0,
+	       sizeof(open_conf->oc_resp_stateid));
 	return nfs_ok;
 }
 
@@ -1189,6 +1194,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
 {
 	__be32 status;
 
+	memset(open_down, 0, sizeof(*open_down));
 	status = nfsd4_decode_stateid4(argp, &open_down->od_stateid);
 	if (status)
 		return status;
@@ -1218,6 +1224,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
 	if (!putfh->pf_fhval)
 		return nfserr_jukebox;
 
+	putfh->no_verify = false;
 	return nfs_ok;
 }
 
@@ -1234,6 +1241,7 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
 {
 	__be32 status;
 
+	memset(read, 0, sizeof(*read));
 	status = nfsd4_decode_stateid4(argp, &read->rd_stateid);
 	if (status)
 		return status;
@@ -1250,6 +1258,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
 {
 	__be32 status;
 
+	memset(readdir, 0, sizeof(*readdir));
 	if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0)
 		return nfserr_bad_xdr;
 	status = nfsd4_decode_verifier4(argp, &readdir->rd_verf);
@@ -1269,6 +1278,7 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
 static __be32
 nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
 {
+	memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo));
 	return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen);
 }
 
@@ -1277,6 +1287,7 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
 {
 	__be32 status;
 
+	memset(rename, 0, sizeof(*rename));
 	status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen);
 	if (status)
 		return status;
@@ -1293,6 +1304,7 @@ static __be32
 nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
 		     struct nfsd4_secinfo *secinfo)
 {
+	secinfo->si_exp = NULL;
 	return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen);
 }
 
@@ -1301,6 +1313,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
 {
 	__be32 status;
 
+	memset(setattr, 0, sizeof(*setattr));
 	status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid);
 	if (status)
 		return status;
@@ -1315,6 +1328,8 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
 {
 	__be32 *p, status;
 
+	memset(setclientid, 0, sizeof(*setclientid));
+
 	if (argp->minorversion >= 1)
 		return nfserr_notsupp;
 
@@ -1371,6 +1386,8 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
 {
 	__be32 *p, status;
 
+	memset(verify, 0, sizeof(*verify));
+
 	status = nfsd4_decode_bitmap4(argp, verify->ve_bmval,
 				      ARRAY_SIZE(verify->ve_bmval));
 	if (status)
@@ -1410,6 +1427,9 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
 	if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen))
 		return nfserr_bad_xdr;
 
+	write->wr_bytes_written = 0;
+	write->wr_how_written = 0;
+	memset(&write->wr_verifier, 0, sizeof(write->wr_verifier));
 	return nfs_ok;
 }
 
@@ -1434,6 +1454,7 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
 
 static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
 {
+	memset(bc, 0, sizeof(*bc));
 	if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0)
 		return nfserr_bad_xdr;
 	return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
@@ -1444,6 +1465,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
 	u32 use_conn_in_rdma_mode;
 	__be32 status;
 
+	memset(bcts, 0, sizeof(*bcts));
 	status = nfsd4_decode_sessionid4(argp, &bcts->sessionid);
 	if (status)
 		return status;
@@ -1585,6 +1607,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
 {
 	__be32 status;
 
+	memset(exid, 0, sizeof(*exid));
 	status = nfsd4_decode_verifier4(argp, &exid->verifier);
 	if (status)
 		return status;
@@ -1637,6 +1660,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 {
 	__be32 status;
 
+	memset(sess, 0, sizeof(*sess));
 	status = nfsd4_decode_clientid4(argp, &sess->clientid);
 	if (status)
 		return status;
@@ -1652,11 +1676,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 		return status;
 	if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0)
 		return nfserr_bad_xdr;
-	status = nfsd4_decode_cb_sec(argp, &sess->cb_sec);
-	if (status)
-		return status;
-
-	return nfs_ok;
+	return nfsd4_decode_cb_sec(argp, &sess->cb_sec);
 }
 
 static __be32
@@ -1680,6 +1700,7 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
 {
 	__be32 status;
 
+	memset(gdev, 0, sizeof(*gdev));
 	status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid);
 	if (status)
 		return status;
@@ -1700,6 +1721,7 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
 {
 	__be32 *p, status;
 
+	memset(lcp, 0, sizeof(*lcp));
 	if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0)
 		return nfserr_bad_xdr;
 	if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0)
@@ -1735,6 +1757,7 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
 {
 	__be32 status;
 
+	memset(lgp, 0, sizeof(*lgp));
 	if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0)
 		return nfserr_bad_xdr;
 	if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0)
@@ -1760,6 +1783,7 @@ static __be32
 nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
 		struct nfsd4_layoutreturn *lrp)
 {
+	memset(lrp, 0, sizeof(*lrp));
 	if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0)
 		return nfserr_bad_xdr;
 	if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0)
@@ -1775,6 +1799,8 @@ static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
 {
 	if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0)
 		return nfserr_bad_xdr;
+
+	sin->sin_exp = NULL;
 	return nfs_ok;
 }
 
@@ -1795,6 +1821,7 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
 	seq->maxslots = be32_to_cpup(p++);
 	seq->cachethis = be32_to_cpup(p);
 
+	seq->status_flags = 0;
 	return nfs_ok;
 }
 
@@ -1805,6 +1832,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
 	__be32 status;
 	u32 i;
 
+	memset(test_stateid, 0, sizeof(*test_stateid));
 	if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0)
 		return nfserr_bad_xdr;
 
@@ -1902,6 +1930,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
 	struct nl4_server *ns_dummy;
 	__be32 status;
 
+	memset(copy, 0, sizeof(*copy));
 	status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
 	if (status)
 		return status;
@@ -1957,6 +1986,7 @@ nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
 {
 	__be32 status;
 
+	memset(cn, 0, sizeof(*cn));
 	cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src));
 	if (cn->cpn_src == NULL)
 		return nfserr_jukebox;
@@ -1974,6 +2004,8 @@ static __be32
 nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
 			    struct nfsd4_offload_status *os)
 {
+	os->count = 0;
+	os->status = 0;
 	return nfsd4_decode_stateid4(argp, &os->stateid);
 }
 
@@ -1990,6 +2022,8 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 	if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0)
 		return nfserr_bad_xdr;
 
+	seek->seek_eof = 0;
+	seek->seek_pos = 0;
 	return nfs_ok;
 }
 
@@ -2125,6 +2159,7 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
 	__be32 status;
 	u32 maxcount;
 
+	memset(getxattr, 0, sizeof(*getxattr));
 	status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name);
 	if (status)
 		return status;
@@ -2133,8 +2168,7 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
 	maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
 
 	getxattr->getxa_len = maxcount;
-
-	return status;
+	return nfs_ok;
 }
 
 static __be32
@@ -2144,6 +2178,8 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
 	u32 flags, maxcount, size;
 	__be32 status;
 
+	memset(setxattr, 0, sizeof(*setxattr));
+
 	if (xdr_stream_decode_u32(argp->xdr, &flags) < 0)
 		return nfserr_bad_xdr;
 
@@ -2182,6 +2218,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
 {
 	u32 maxcount;
 
+	memset(listxattrs, 0, sizeof(*listxattrs));
+
 	if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0)
 		return nfserr_bad_xdr;
 
@@ -2209,6 +2247,7 @@ static __be32
 nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp,
 			 struct nfsd4_removexattr *removexattr)
 {
+	memset(removexattr, 0, sizeof(*removexattr));
 	return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name);
 }
 
-- 
cgit v1.2.3


From f18d8afbf2fae507fb7246f09fe1de5297c9f11a Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:22:50 -0400
Subject: SUNRPC: Clarify comment that documents svc_max_payload()

Note the function returns a per-transport value, not a per-request
value (eg, one that is related to the size of the available send or
receive buffer space).

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/svc.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 32a537f852fe..149171774bc6 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1554,8 +1554,12 @@ out:
 EXPORT_SYMBOL_GPL(bc_svc_process);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
 
-/*
- * Return (transport-specific) limit on the rpc payload.
+/**
+ * svc_max_payload - Return transport-specific limit on the RPC payload
+ * @rqstp: RPC transaction context
+ *
+ * Returns the maximum number of payload bytes the current transport
+ * allows.
  */
 u32 svc_max_payload(const struct svc_rqst *rqstp)
 {
-- 
cgit v1.2.3


From 98124f5bd6c76699d514fbe491dd95265369cc99 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:22:56 -0400
Subject: NFSD: Refactor common code out of dirlist helpers

The dust has settled a bit and it's become obvious what code is
totally common between nfsd_init_dirlist_pages() and
nfsd3_init_dirlist_pages(). Move that common code to SUNRPC.

The new helper brackets the existing xdr_init_decode_pages() API.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3proc.c         | 10 +---------
 fs/nfsd/nfsproc.c          | 10 +---------
 include/linux/sunrpc/xdr.h |  2 ++
 net/sunrpc/xdr.c           | 22 ++++++++++++++++++++++
 4 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 58695e4e18b4..923d9a80df92 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -574,15 +574,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
 	buf->pages = rqstp->rq_next_page;
 	rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-	/* This is xdr_init_encode(), but it assumes that
-	 * the head kvec has already been consumed. */
-	xdr_set_scratch_buffer(xdr, NULL, 0);
-	xdr->buf = buf;
-	xdr->page_ptr = buf->pages;
-	xdr->iov = NULL;
-	xdr->p = page_address(*buf->pages);
-	xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
-	xdr->rqst = NULL;
+	xdr_init_encode_pages(xdr, buf, buf->pages,  NULL);
 }
 
 /*
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 49778ff410e3..82b3ddeacc33 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -575,15 +575,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
 	buf->pages = rqstp->rq_next_page;
 	rqstp->rq_next_page++;
 
-	/* This is xdr_init_encode(), but it assumes that
-	 * the head kvec has already been consumed. */
-	xdr_set_scratch_buffer(xdr, NULL, 0);
-	xdr->buf = buf;
-	xdr->page_ptr = buf->pages;
-	xdr->iov = NULL;
-	xdr->p = page_address(*buf->pages);
-	xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
-	xdr->rqst = NULL;
+	xdr_init_encode_pages(xdr, buf, buf->pages,  NULL);
 }
 
 /*
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 69175029abbb..f84e2a1358e1 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -240,6 +240,8 @@ typedef int	(*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 
 extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
 			    __be32 *p, struct rpc_rqst *rqst);
+extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+			   struct page **pages, struct rpc_rqst *rqst);
 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
 extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec,
 		size_t nbytes);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 482586c23fdd..b7a7e1467a1b 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -946,6 +946,28 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
 }
 EXPORT_SYMBOL_GPL(xdr_init_encode);
 
+/**
+ * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages
+ * @xdr: pointer to xdr_stream struct
+ * @buf: pointer to XDR buffer into which to encode data
+ * @pages: list of pages to decode into
+ * @rqst: pointer to controlling rpc_rqst, for debugging
+ *
+ */
+void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+			   struct page **pages, struct rpc_rqst *rqst)
+{
+	xdr_reset_scratch_buffer(xdr);
+
+	xdr->buf = buf;
+	xdr->page_ptr = pages;
+	xdr->iov = NULL;
+	xdr->p = page_address(*pages);
+	xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
+	xdr->rqst = rqst;
+}
+EXPORT_SYMBOL_GPL(xdr_init_encode_pages);
+
 /**
  * __xdr_commit_encode - Ensure all data is written to buffer
  * @xdr: pointer to xdr_stream
-- 
cgit v1.2.3


From c3d2a04f05c590303c125a176e6e43df4a436fdb Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:02 -0400
Subject: NFSD: Use xdr_inline_decode() to decode NFSv3 symlinks

Replace the check for buffer over/underflow with a helper that is
commonly used for this purpose. The helper also sets xdr->nwords
correctly after successfully linearizing the symlink argument into
the stream's scratch buffer.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3xdr.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 0293b8d65f10..71e32cf28885 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -616,8 +616,6 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 {
 	struct nfsd3_symlinkargs *args = rqstp->rq_argp;
 	struct kvec *head = rqstp->rq_arg.head;
-	struct kvec *tail = rqstp->rq_arg.tail;
-	size_t remaining;
 
 	if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
 		return false;
@@ -626,16 +624,10 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
 		return false;
 
-	/* request sanity */
-	remaining = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len;
-	remaining -= xdr_stream_pos(xdr);
-	if (remaining < xdr_align_size(args->tlen))
-		return false;
-
-	args->first.iov_base = xdr->p;
+	/* symlink_data */
 	args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
-
-	return true;
+	args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
+	return args->first.iov_base != NULL;
 }
 
 bool
-- 
cgit v1.2.3


From d4da5baa533215b14625458e645056baf646bb2e Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:07 -0400
Subject: NFSD: Clean up WRITE arg decoders

xdr_stream_subsegment() already returns a boolean value.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs3xdr.c | 4 +---
 fs/nfsd/nfsxdr.c  | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 71e32cf28885..3308dd671ef0 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -571,10 +571,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 		args->count = max_blocksize;
 		args->len = max_blocksize;
 	}
-	if (!xdr_stream_subsegment(xdr, &args->payload, args->count))
-		return false;
 
-	return true;
+	return xdr_stream_subsegment(xdr, &args->payload, args->count);
 }
 
 bool
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index aba8520b4b8b..caf6355b18fa 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -338,10 +338,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 		return false;
 	if (args->len > NFSSVC_MAXBLKSIZE_V2)
 		return false;
-	if (!xdr_stream_subsegment(xdr, &args->payload, args->len))
-		return false;
 
-	return true;
+	return xdr_stream_subsegment(xdr, &args->payload, args->len);
 }
 
 bool
-- 
cgit v1.2.3


From b8ab2a6f2d67a7f7b03a92348e3f29c1c767a9ca Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:13 -0400
Subject: SUNRPC: Fix typo in xdr_buf_subsegment's kdoc comment

Fix a typo.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 net/sunrpc/xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index b7a7e1467a1b..336a7c7833e4 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1597,7 +1597,7 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
  *
  * @buf and @subbuf may be pointers to the same struct xdr_buf.
  *
- * Returns -1 if base of length are out of bounds.
+ * Returns -1 if base or length are out of bounds.
  */
 int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf,
 		       unsigned int base, unsigned int len)
-- 
cgit v1.2.3


From 9993a66317fc9951322483a9edbfae95a640b210 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:19 -0400
Subject: NFSD: Clean up nfs4svc_encode_compoundres()

In today's Linux NFS server implementation, the NFS dispatcher
initializes each XDR result stream, and the NFSv4 .pc_func and
.pc_encode methods all use xdr_stream-based encoding. This keeps
rq_res.len automatically updated. There is no longer a need for
the WARN_ON_ONCE() check in nfs4svc_encode_compoundres().

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4xdr.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 05dddecbaed6..bcfeb1a922c0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -5459,12 +5459,8 @@ bool
 nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 {
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
-	struct xdr_buf *buf = xdr->buf;
 	__be32 *p;
 
-	WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
-				 buf->tail[0].iov_len);
-
 	/*
 	 * Send buffer space for the following items is reserved
 	 * at the top of nfsd4_proc_compound().
-- 
cgit v1.2.3


From 6604148cf961b57fc735e4204f8996536da9253c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:25 -0400
Subject: NFSD: Remove "inline" directives on op_rsize_bop helpers

These helpers are always invoked indirectly, so the compiler can't
inline these anyway. While we're updating the synopses of these
helpers, defensively convert their parameters to const pointers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c | 121 +++++++++++++++++++++++++++++++++--------------------
 fs/nfsd/xdr4.h     |   3 +-
 2 files changed, 77 insertions(+), 47 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index bb22f53c7ba9..8e1e560effa2 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2771,28 +2771,33 @@ out:
 
 #define op_encode_channel_attrs_maxsz	(6 + 1 + 1)
 
-static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp,
+				   const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp,
+				      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
 }
 
-static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp,
+			      const struct nfsd4_op *op)
 {
 	/* ac_supported, ac_resp_access */
 	return (op_encode_hdr_size + 2)* sizeof(__be32);
 }
 
-static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp,
+			      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp,
+			      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz
 		+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
@@ -2803,10 +2808,10 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
  * the op prematurely if the estimate is too large.  We may turn off splice
  * reads unnecessarily.
  */
-static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
-				      struct nfsd4_op *op)
+static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp,
+			       const struct nfsd4_op *op)
 {
-	u32 *bmap = op->u.getattr.ga_bmval;
+	const u32 *bmap = op->u.getattr.ga_bmval;
 	u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2];
 	u32 ret = 0;
 
@@ -2841,24 +2846,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
 	return ret;
 }
 
-static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp,
+			     const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
 }
 
-static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp,
+			    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz)
 		* sizeof(__be32);
 }
 
-static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp,
+			    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
 		* sizeof(__be32);
 }
 
-static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp,
+			    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_stateid_maxsz
 		+ op_encode_change_info_maxsz + 1
@@ -2866,7 +2875,8 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 		+ op_encode_delegation_maxsz) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp,
+			    const struct nfsd4_op *op)
 {
 	u32 maxcount = 0, rlen = 0;
 
@@ -2876,7 +2886,8 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp,
+				 const struct nfsd4_op *op)
 {
 	u32 maxcount = svc_max_payload(rqstp);
 	u32 rlen = min(op->u.read.rd_length, maxcount);
@@ -2890,7 +2901,8 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
 	return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp,
+			       const struct nfsd4_op *op)
 {
 	u32 maxcount = 0, rlen = 0;
 
@@ -2901,59 +2913,68 @@ static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
 		XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp,
+				const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
 }
 
-static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp,
+			      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz)
 		* sizeof(__be32);
 }
 
-static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp,
+			      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz
 		+ op_encode_change_info_maxsz) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
-				       struct nfsd4_op *op)
+static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp,
+				const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size
 		+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp,
+				    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
 		* sizeof(__be32);
 }
 
-static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp,
+			       const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp,
+			       const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
 		(4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp,
+				   const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
 								sizeof(__be32);
 }
 
-static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp,
+			     const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp,
+				   const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
 		1 + 1 + /* eir_flags, spr_how */\
@@ -2967,14 +2988,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o
 		0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp,
+					    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + \
 		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
 		2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp,
+				      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + \
 		XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
@@ -2983,7 +3006,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
 		op_encode_channel_attrs_maxsz) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp,
+			    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size +
 		1 /* wr_callback */ +
@@ -2995,16 +3019,16 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
 		1 /* cr_synchronous */) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
-					     struct nfsd4_op *op)
+static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp,
+				      const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size +
 		2 /* osr_count */ +
 		1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
-					struct nfsd4_op *op)
+static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
+				   const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size +
 		3 /* cnr_lease_time */ +
@@ -3019,7 +3043,8 @@ static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
 }
 
 #ifdef CONFIG_NFSD_PNFS
-static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
+				     const struct nfsd4_op *op)
 {
 	u32 maxcount = 0, rlen = 0;
 
@@ -3037,7 +3062,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4
  * so we need to define an arbitrary upper bound here.
  */
 #define MAX_LAYOUT_SIZE		128
-static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp,
+				 const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size +
 		1 /* logr_return_on_close */ +
@@ -3046,14 +3072,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op
 		MAX_LAYOUT_SIZE) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp,
+				    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size +
 		1 /* locr_newsize */ +
 		2 /* ns_size */) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp,
+				    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size +
 		1 /* lrs_stateid */ +
@@ -3062,13 +3090,14 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
 #endif /* CONFIG_NFSD_PNFS */
 
 
-static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp,
+			    const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + 3) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp,
-				       struct nfsd4_op *op)
+static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp,
+				const struct nfsd4_op *op)
 {
 	u32 maxcount, rlen;
 
@@ -3078,14 +3107,14 @@ static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp,
 	return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp,
-				       struct nfsd4_op *op)
+static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp,
+				const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz)
 		* sizeof(__be32);
 }
-static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp,
-					 struct nfsd4_op *op)
+static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp,
+				  const struct nfsd4_op *op)
 {
 	u32 maxcount, rlen;
 
@@ -3095,8 +3124,8 @@ static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp,
 	return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
 
-static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp,
-					  struct nfsd4_op *op)
+static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp,
+				   const struct nfsd4_op *op)
 {
 	return (op_encode_hdr_size + op_encode_change_info_maxsz)
 		* sizeof(__be32);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 466e2786fc97..7fcbc7a46c15 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -889,7 +889,8 @@ struct nfsd4_operation {
 	u32 op_flags;
 	char *op_name;
 	/* Try to get response size before operation */
-	u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *);
+	u32 (*op_rsize_bop)(const struct svc_rqst *rqstp,
+			const struct nfsd4_op *op);
 	void (*op_get_currentstateid)(struct nfsd4_compound_state *,
 			union nfsd4_op_u *);
 	void (*op_set_currentstateid)(struct nfsd4_compound_state *,
-- 
cgit v1.2.3


From 77e378cf2a595d8e39cddf28a31efe6afd9394a0 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:30 -0400
Subject: NFSD: Remove unused nfsd4_compoundargs::cachetype field

This field was added by commit 1091006c5eb1 ("nfsd: turn on reply
cache for NFSv4") but was never put to use.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/xdr4.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 7fcbc7a46c15..b2bc85421b50 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -724,7 +724,6 @@ struct nfsd4_compoundargs {
 	u32				opcnt;
 	struct nfsd4_op			*ops;
 	struct nfsd4_op			iops[8];
-	int				cachetype;
 };
 
 struct nfsd4_compoundres {
-- 
cgit v1.2.3


From 9f553e61bd36c1048543ac2f6945103dd2f742be Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 12 Sep 2022 17:23:36 -0400
Subject: NFSD: Pack struct nfsd4_compoundres

Remove a couple of 4-byte holes on platforms with 64-bit pointers.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/xdr4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index b2bc85421b50..0eb00105d845 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -732,8 +732,8 @@ struct nfsd4_compoundres {
 	struct svc_rqst *		rqstp;
 
 	__be32				*statusp;
-	u32				taglen;
 	char *				tag;
+	u32				taglen;
 	u32				opcnt;
 
 	struct nfsd4_compound_state	cstate;
-- 
cgit v1.2.3


From 0cfb0c4228a5c8e2ed2b58f8309b660b187cef02 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong2@huawei.com>
Date: Fri, 23 Sep 2022 00:31:52 +0800
Subject: nfsd: use DEFINE_PROC_SHOW_ATTRIBUTE to define nfsd_proc_ops

Use DEFINE_PROC_SHOW_ATTRIBUTE helper macro to simplify the code.

Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/stats.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index a8c5a02a84f0..777e24e5da33 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -32,7 +32,7 @@ struct svc_stat		nfsd_svcstats = {
 	.program	= &nfsd_program,
 };
 
-static int nfsd_proc_show(struct seq_file *seq, void *v)
+static int nfsd_show(struct seq_file *seq, void *v)
 {
 	int i;
 
@@ -72,17 +72,7 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int nfsd_proc_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nfsd_proc_show, NULL);
-}
-
-static const struct proc_ops nfsd_proc_ops = {
-	.proc_open	= nfsd_proc_open,
-	.proc_read	= seq_read,
-	.proc_lseek	= seq_lseek,
-	.proc_release	= single_release,
-};
+DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
 
 int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
 {
-- 
cgit v1.2.3


From 9beeaab8e05d353d709103cafa1941714b4d5d94 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong2@huawei.com>
Date: Fri, 23 Sep 2022 00:31:53 +0800
Subject: nfsd: use DEFINE_SHOW_ATTRIBUTE to define export_features_fops and
 supported_enctypes_fops

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
[ cel: reduce line length ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsctl.c | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 597a26ad4183..3ed0cfdb0c0b 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -185,17 +185,7 @@ static int export_features_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int export_features_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, export_features_show, NULL);
-}
-
-static const struct file_operations export_features_operations = {
-	.open		= export_features_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(export_features);
 
 #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
 static int supported_enctypes_show(struct seq_file *m, void *v)
@@ -204,17 +194,7 @@ static int supported_enctypes_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int supported_enctypes_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, supported_enctypes_show, NULL);
-}
-
-static const struct file_operations supported_enctypes_ops = {
-	.open		= supported_enctypes_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(supported_enctypes);
 #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
 
 static const struct file_operations pool_stats_operations = {
@@ -1365,7 +1345,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
 		/* Per-export io stats use same ops as exports file */
 		[NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO},
 		[NFSD_Export_features] = {"export_features",
-					&export_features_operations, S_IRUGO},
+					&export_features_fops, S_IRUGO},
 		[NFSD_FO_UnlockIP] = {"unlock_ip",
 					&transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_FO_UnlockFS] = {"unlock_filesystem",
@@ -1381,7 +1361,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
 		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO},
 #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
-		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
+		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",
+					&supported_enctypes_fops, S_IRUGO},
 #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
-- 
cgit v1.2.3


From 1d7f6b302b75ff7acb9eb3cab0c631b10cfa7542 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong2@huawei.com>
Date: Fri, 23 Sep 2022 00:31:54 +0800
Subject: nfsd: use DEFINE_SHOW_ATTRIBUTE to define client_info_fops

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

inode is converted from seq_file->file instead of seq_file->private in
client_info_show().

Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 56b50210da38..3d5eb00501a1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2490,7 +2490,7 @@ static const char *cb_state2str(int state)
 
 static int client_info_show(struct seq_file *m, void *v)
 {
-	struct inode *inode = m->private;
+	struct inode *inode = file_inode(m->file);
 	struct nfs4_client *clp;
 	u64 clid;
 
@@ -2530,17 +2530,7 @@ static int client_info_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static int client_info_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, client_info_show, inode);
-}
-
-static const struct file_operations client_info_fops = {
-	.open		= client_info_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(client_info);
 
 static void *states_start(struct seq_file *s, loff_t *pos)
 	__acquires(&clp->cl_lock)
-- 
cgit v1.2.3


From 64776611a06322b99386f8dfe3b3ba1aa0347a38 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong2@huawei.com>
Date: Fri, 23 Sep 2022 00:31:55 +0800
Subject: nfsd: use DEFINE_SHOW_ATTRIBUTE to define nfsd_reply_cache_stats_fops

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

nfsd_net is converted from seq_file->file instead of seq_file->private in
nfsd_reply_cache_stats_show().

Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
[ cel: reduce line length ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/cache.h    |  2 +-
 fs/nfsd/nfscache.c | 13 +++----------
 fs/nfsd/nfsctl.c   | 10 +++-------
 3 files changed, 7 insertions(+), 18 deletions(-)

diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 65c331f75e9c..f21259ead64b 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,6 +84,6 @@ int	nfsd_reply_cache_init(struct nfsd_net *);
 void	nfsd_reply_cache_shutdown(struct nfsd_net *);
 int	nfsd_cache_lookup(struct svc_rqst *);
 void	nfsd_cache_update(struct svc_rqst *, int, __be32 *);
-int	nfsd_reply_cache_stats_open(struct inode *, struct file *);
+int	nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
 
 #endif /* NFSCACHE_H */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 9b31e1103e7b..3e64a3d50a1c 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -604,9 +604,10 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
  * scraping this file for info should test the labels to ensure they're
  * getting the correct field.
  */
-static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 {
-	struct nfsd_net *nn = m->private;
+	struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info,
+					  nfsd_net_id);
 
 	seq_printf(m, "max entries:           %u\n", nn->max_drc_entries);
 	seq_printf(m, "num entries:           %u\n",
@@ -626,11 +627,3 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "cachesize at longest:  %u\n", nn->longest_chain_cachesize);
 	return 0;
 }
-
-int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
-{
-	struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info,
-								nfsd_net_id);
-
-	return single_open(file, nfsd_reply_cache_stats_show, nn);
-}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3ed0cfdb0c0b..1983f4f2908d 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -204,12 +204,7 @@ static const struct file_operations pool_stats_operations = {
 	.release	= nfsd_pool_stats_release,
 };
 
-static const struct file_operations reply_cache_stats_operations = {
-	.open		= nfsd_reply_cache_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
 
 static const struct file_operations filecache_ops = {
 	.open		= nfsd_file_cache_stats_open,
@@ -1354,7 +1349,8 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
-		[NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
+		[NFSD_Reply_Cache_Stats] = {"reply_cache_stats",
+					&nfsd_reply_cache_stats_fops, S_IRUGO},
 		[NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
-- 
cgit v1.2.3


From 1342f9dd3fc219089deeb2620f6790f19b4129b1 Mon Sep 17 00:00:00 2001
From: ChenXiaoSong <chenxiaosong2@huawei.com>
Date: Fri, 23 Sep 2022 00:31:56 +0800
Subject: nfsd: use DEFINE_SHOW_ATTRIBUTE to define nfsd_file_cache_stats_fops

Use DEFINE_SHOW_ATTRIBUTE helper macro to simplify the code.

Signed-off-by: ChenXiaoSong <chenxiaosong2@huawei.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/filecache.c | 7 +------
 fs/nfsd/filecache.h | 2 +-
 fs/nfsd/nfsctl.c    | 9 ++-------
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index eeed4ae5b4ad..d5c57360b418 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -1212,7 +1212,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
  * scraping this file for info should test the labels to ensure they're
  * getting the correct field.
  */
-static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
 {
 	unsigned long releases = 0, pages_flushed = 0, evictions = 0;
 	unsigned long hits = 0, acquisitions = 0;
@@ -1259,8 +1259,3 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "pages flushed: %lu\n", pages_flushed);
 	return 0;
 }
-
-int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, nfsd_file_cache_stats_show, NULL);
-}
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 8e8c0c47d67d..357832bac736 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -60,5 +60,5 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);
 __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);
-int	nfsd_file_cache_stats_open(struct inode *, struct file *);
+int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
 #endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1983f4f2908d..6a29bcfc9390 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -206,12 +206,7 @@ static const struct file_operations pool_stats_operations = {
 
 DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
 
-static const struct file_operations filecache_ops = {
-	.open		= nfsd_file_cache_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats);
 
 /*----------------------------------------------------------------------------*/
 /*
@@ -1355,7 +1350,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
 		[NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
 		[NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
-		[NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO},
+		[NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
 #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
 		[NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",
 					&supported_enctypes_fops, S_IRUGO},
-- 
cgit v1.2.3


From 781fde1a2ba2391f31142f46f964cf1148ca1791 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 22 Sep 2022 13:10:35 -0400
Subject: NFSD: Rename the fields in copy_stateid_t

Code maintenance: The name of the copy_stateid_t::sc_count field
collides with the sc_count field in struct nfs4_stid, making the
latter difficult to grep for when auditing stateid reference
counting.

No behavior change expected.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c  |  6 +++---
 fs/nfsd/nfs4state.c | 30 +++++++++++++++---------------
 fs/nfsd/state.h     |  6 +++---
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8e1e560effa2..a09901cf175c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1828,7 +1828,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (!nfs4_init_copy_state(nn, copy))
 			goto out_err;
 		refcount_set(&async_copy->refcount, 1);
-		memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid,
+		memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.cs_stid,
 			sizeof(copy->cp_res.cb_stateid));
 		dup_copy_fields(copy, async_copy);
 		async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
@@ -1864,7 +1864,7 @@ find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
 
 	spin_lock(&clp->async_lock);
 	list_for_each_entry(copy, &clp->async_copies, copies) {
-		if (memcmp(&copy->cp_stateid.stid, stateid, NFS4_STATEID_SIZE))
+		if (memcmp(&copy->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE))
 			continue;
 		refcount_inc(&copy->refcount);
 		spin_unlock(&clp->async_lock);
@@ -1918,7 +1918,7 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	cps = nfs4_alloc_init_cpntf_state(nn, stid);
 	if (!cps)
 		goto out;
-	memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t));
+	memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t));
 	memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t));
 	memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t));
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3d5eb00501a1..5cfeeeadcb78 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -974,19 +974,19 @@ out_free:
  * Create a unique stateid_t to represent each COPY.
  */
 static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
-			      unsigned char sc_type)
+			      unsigned char cs_type)
 {
 	int new_id;
 
-	stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
-	stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
-	stid->sc_type = sc_type;
+	stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
+	stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+	stid->cs_type = cs_type;
 
 	idr_preload(GFP_KERNEL);
 	spin_lock(&nn->s2s_cp_lock);
 	new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
-	stid->stid.si_opaque.so_id = new_id;
-	stid->stid.si_generation = 1;
+	stid->cs_stid.si_opaque.so_id = new_id;
+	stid->cs_stid.si_generation = 1;
 	spin_unlock(&nn->s2s_cp_lock);
 	idr_preload_end();
 	if (new_id < 0)
@@ -1008,7 +1008,7 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
 	if (!cps)
 		return NULL;
 	cps->cpntf_time = ktime_get_boottime_seconds();
-	refcount_set(&cps->cp_stateid.sc_count, 1);
+	refcount_set(&cps->cp_stateid.cs_count, 1);
 	if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID))
 		goto out_free;
 	spin_lock(&nn->s2s_cp_lock);
@@ -1024,11 +1024,11 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy)
 {
 	struct nfsd_net *nn;
 
-	WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID);
+	WARN_ON_ONCE(copy->cp_stateid.cs_type != NFS4_COPY_STID);
 	nn = net_generic(copy->cp_clp->net, nfsd_net_id);
 	spin_lock(&nn->s2s_cp_lock);
 	idr_remove(&nn->s2s_cp_stateids,
-		   copy->cp_stateid.stid.si_opaque.so_id);
+		   copy->cp_stateid.cs_stid.si_opaque.so_id);
 	spin_unlock(&nn->s2s_cp_lock);
 }
 
@@ -6027,7 +6027,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 	spin_lock(&nn->s2s_cp_lock);
 	idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
 		cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
-		if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
+		if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID &&
 				state_expired(&lt, cps->cpntf_time))
 			_free_cpntf_state_locked(nn, cps);
 	}
@@ -6359,12 +6359,12 @@ out:
 static void
 _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
 {
-	WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID);
-	if (!refcount_dec_and_test(&cps->cp_stateid.sc_count))
+	WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID);
+	if (!refcount_dec_and_test(&cps->cp_stateid.cs_count))
 		return;
 	list_del(&cps->cp_list);
 	idr_remove(&nn->s2s_cp_stateids,
-		   cps->cp_stateid.stid.si_opaque.so_id);
+		   cps->cp_stateid.cs_stid.si_opaque.so_id);
 	kfree(cps);
 }
 /*
@@ -6386,12 +6386,12 @@ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
 	if (cps_t) {
 		state = container_of(cps_t, struct nfs4_cpntf_state,
 				     cp_stateid);
-		if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) {
+		if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) {
 			state = NULL;
 			goto unlock;
 		}
 		if (!clp)
-			refcount_inc(&state->cp_stateid.sc_count);
+			refcount_inc(&state->cp_stateid.cs_count);
 		else
 			_free_cpntf_state_locked(nn, state);
 	}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4155be65d806..b3477087a9fc 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -57,11 +57,11 @@ typedef struct {
 } stateid_t;
 
 typedef struct {
-	stateid_t		stid;
+	stateid_t		cs_stid;
 #define NFS4_COPY_STID 1
 #define NFS4_COPYNOTIFY_STID 2
-	unsigned char		sc_type;
-	refcount_t		sc_count;
+	unsigned char		cs_type;
+	refcount_t		cs_count;
 } copy_stateid_t;
 
 struct nfsd4_callback {
-- 
cgit v1.2.3


From 76ce4dcec0dc08a032db916841ddc4e3998be317 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Thu, 1 Sep 2022 15:29:55 -0400
Subject: NFSD: Cap rsize_bop result based on send buffer size

Since before the git era, NFSD has conserved the number of pages
held by each nfsd thread by combining the RPC receive and send
buffers into a single array of pages. This works because there are
no cases where an operation needs a large RPC Call message and a
large RPC Reply at the same time.

Once an RPC Call has been received, svc_process() updates
svc_rqst::rq_res to describe the part of rq_pages that can be
used for constructing the Reply. This means that the send buffer
(rq_res) shrinks when the received RPC record containing the RPC
Call is large.

Add an NFSv4 helper that computes the size of the send buffer. It
replaces svc_max_payload() in spots where svc_max_payload() returns
a value that might be larger than the remaining send buffer space.
Callers who need to know the transport's actual maximum payload size
will continue to use svc_max_payload().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a09901cf175c..8beb2bc4c328 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2771,6 +2771,22 @@ out:
 
 #define op_encode_channel_attrs_maxsz	(6 + 1 + 1)
 
+/*
+ * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which
+ * is called before sunrpc sets rq_res.buflen. Thus we have to compute
+ * the maximum payload size here, based on transport limits and the size
+ * of the remaining space in the rq_pages array.
+ */
+static u32 nfsd4_max_payload(const struct svc_rqst *rqstp)
+{
+	u32 buflen;
+
+	buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE;
+	buflen -= rqstp->rq_auth_slack;
+	buflen -= rqstp->rq_res.head[0].iov_len;
+	return min_t(u32, buflen, svc_max_payload(rqstp));
+}
+
 static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp,
 				   const struct nfsd4_op *op)
 {
@@ -2816,9 +2832,9 @@ static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp,
 	u32 ret = 0;
 
 	if (bmap0 & FATTR4_WORD0_ACL)
-		return svc_max_payload(rqstp);
+		return nfsd4_max_payload(rqstp);
 	if (bmap0 & FATTR4_WORD0_FS_LOCATIONS)
-		return svc_max_payload(rqstp);
+		return nfsd4_max_payload(rqstp);
 
 	if (bmap1 & FATTR4_WORD1_OWNER) {
 		ret += IDMAP_NAMESZ + 4;
@@ -2878,10 +2894,7 @@ static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp,
 static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp,
 			    const struct nfsd4_op *op)
 {
-	u32 maxcount = 0, rlen = 0;
-
-	maxcount = svc_max_payload(rqstp);
-	rlen = min(op->u.read.rd_length, maxcount);
+	u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
 
 	return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
@@ -2889,8 +2902,7 @@ static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp,
 static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp,
 				 const struct nfsd4_op *op)
 {
-	u32 maxcount = svc_max_payload(rqstp);
-	u32 rlen = min(op->u.read.rd_length, maxcount);
+	u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
 	/*
 	 * If we detect that the file changed during hole encoding, then we
 	 * recover by encoding the remaining reply as data. This means we need
@@ -2904,10 +2916,7 @@ static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp,
 static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp,
 			       const struct nfsd4_op *op)
 {
-	u32 maxcount = 0, rlen = 0;
-
-	maxcount = svc_max_payload(rqstp);
-	rlen = min(op->u.readdir.rd_maxcount, maxcount);
+	u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp));
 
 	return (op_encode_hdr_size + op_encode_verifier_maxsz +
 		XDR_QUADLEN(rlen)) * sizeof(__be32);
@@ -3046,10 +3055,7 @@ static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
 static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
 				     const struct nfsd4_op *op)
 {
-	u32 maxcount = 0, rlen = 0;
-
-	maxcount = svc_max_payload(rqstp);
-	rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
+	u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp));
 
 	return (op_encode_hdr_size +
 		1 /* gd_layout_type*/ +
@@ -3099,10 +3105,7 @@ static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp,
 static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp,
 				const struct nfsd4_op *op)
 {
-	u32 maxcount, rlen;
-
-	maxcount = svc_max_payload(rqstp);
-	rlen = min_t(u32, XATTR_SIZE_MAX, maxcount);
+	u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp));
 
 	return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
@@ -3116,10 +3119,7 @@ static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp,
 static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp,
 				  const struct nfsd4_op *op)
 {
-	u32 maxcount, rlen;
-
-	maxcount = svc_max_payload(rqstp);
-	rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount);
+	u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp));
 
 	return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32);
 }
-- 
cgit v1.2.3


From 019805fea91599b22dfa62ffb29c022f35abeb06 Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@oracle.com>
Date: Mon, 26 Sep 2022 10:59:16 -0700
Subject: NFSD: fix use-after-free on source server when doing inter-server
 copy

Use-after-free occurred when the laundromat tried to free expired
cpntf_state entry on the s2s_cp_stateids list after inter-server
copy completed. The sc_cp_list that the expired copy state was
inserted on was already freed.

When COPY completes, the Linux client normally sends LOCKU(lock_state x),
FREE_STATEID(lock_state x) and CLOSE(open_state y) to the source server.
The nfs4_put_stid call from nfsd4_free_stateid cleans up the copy state
from the s2s_cp_stateids list before freeing the lock state's stid.

However, sometimes the CLOSE was sent before the FREE_STATEID request.
When this happens, the nfsd4_close_open_stateid call from nfsd4_close
frees all lock states on its st_locks list without cleaning up the copy
state on the sc_cp_list list. When the time the FREE_STATEID arrives the
server returns BAD_STATEID since the lock state was freed. This causes
the use-after-free error to occur when the laundromat tries to free
the expired cpntf_state.

This patch adds a call to nfs4_free_cpntf_statelist in
nfsd4_close_open_stateid to clean up the copy state before calling
free_ol_stateid_reaplist to free the lock state's stid on the reaplist.

Signed-off-by: Dai Ngo <dai.ngo@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 5cfeeeadcb78..bf80ecbc5de7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1060,6 +1060,7 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
 
 static void nfs4_free_deleg(struct nfs4_stid *stid)
 {
+	WARN_ON(!list_empty(&stid->sc_cp_list));
 	kmem_cache_free(deleg_slab, stid);
 	atomic_long_dec(&num_delegations);
 }
@@ -1473,6 +1474,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
 	release_all_access(stp);
 	if (stp->st_stateowner)
 		nfs4_put_stateowner(stp->st_stateowner);
+	WARN_ON(!list_empty(&stid->sc_cp_list));
 	kmem_cache_free(stateid_slab, stid);
 }
 
@@ -6799,6 +6801,7 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 	struct nfs4_client *clp = s->st_stid.sc_client;
 	bool unhashed;
 	LIST_HEAD(reaplist);
+	struct nfs4_ol_stateid *stp;
 
 	spin_lock(&clp->cl_lock);
 	unhashed = unhash_open_stateid(s, &reaplist);
@@ -6807,6 +6810,8 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 		if (unhashed)
 			put_ol_stateid_locked(s, &reaplist);
 		spin_unlock(&clp->cl_lock);
+		list_for_each_entry(stp, &reaplist, st_locks)
+			nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
 		free_ol_stateid_reaplist(&reaplist);
 	} else {
 		spin_unlock(&clp->cl_lock);
-- 
cgit v1.2.3


From 4d01416ab41540bb13ec4a39ac4e6c4aa5934bc9 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 26 Sep 2022 12:38:44 -0400
Subject: nfsd: only fill out return pointer on success in nfsd4_lookup_stateid

In the case of a revoked delegation, we still fill out the pointer even
when returning an error, which is bad form. Only overwrite the pointer
on success.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bf80ecbc5de7..6b9be5a33f01 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -6266,6 +6266,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     struct nfs4_stid **s, struct nfsd_net *nn)
 {
 	__be32 status;
+	struct nfs4_stid *stid;
 	bool return_revoked = false;
 
 	/*
@@ -6288,15 +6289,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 	}
 	if (status)
 		return status;
-	*s = find_stateid_by_type(cstate->clp, stateid, typemask);
-	if (!*s)
+	stid = find_stateid_by_type(cstate->clp, stateid, typemask);
+	if (!stid)
 		return nfserr_bad_stateid;
-	if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
-		nfs4_put_stid(*s);
+	if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+		nfs4_put_stid(stid);
 		if (cstate->minorversion)
 			return nfserr_deleg_revoked;
 		return nfserr_bad_stateid;
 	}
+	*s = stid;
 	return nfs_ok;
 }
 
-- 
cgit v1.2.3


From 25fbe1fca14142beae6c882f7906510363d42bff Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 26 Sep 2022 12:38:45 -0400
Subject: nfsd: fix comments about spinlock handling with delegations

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6b9be5a33f01..e116f50afcf2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4855,14 +4855,14 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * We're assuming the state code never drops its reference
 	 * without first removing the lease.  Since we're in this lease
 	 * callback (and since the lease code is serialized by the
-	 * i_lock) we know the server hasn't removed the lease yet, and
+	 * flc_lock) we know the server hasn't removed the lease yet, and
 	 * we know it's safe to take a reference.
 	 */
 	refcount_inc(&dp->dl_stid.sc_count);
 	nfsd4_run_cb(&dp->dl_recall);
 }
 
-/* Called from break_lease() with i_lock held. */
+/* Called from break_lease() with flc_lock held. */
 static bool
 nfsd_break_deleg_cb(struct file_lock *fl)
 {
-- 
cgit v1.2.3


From f67b90be20097294cd58b7db91435f4f3278d75f Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo@google.com>
Date: Fri, 2 Sep 2022 21:37:49 +0000
Subject: x86/paravirt: clean up typos and grammaros

Drive-by clean up of the comment.

[ Impact: cleanup]

Signed-off-by: Bill Wendling <morbo@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220902213750.1124421-2-morbo@google.com
---
 arch/x86/include/asm/paravirt_types.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 89df6c6617f5..f04157456a49 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -328,7 +328,7 @@ int paravirt_disable_iospace(void);
  * Unfortunately, this is a relatively slow operation for modern CPUs,
  * because it cannot necessarily determine what the destination
  * address is.  In this case, the address is a runtime constant, so at
- * the very least we can patch the call to e a simple direct call, or
+ * the very least we can patch the call to a simple direct call, or,
  * ideally, patch an inline implementation into the callsite.  (Direct
  * calls are essentially free, because the call and return addresses
  * are completely predictable.)
@@ -339,10 +339,10 @@ int paravirt_disable_iospace(void);
  * on the stack.  All caller-save registers (eax,edx,ecx) are expected
  * to be modified (either clobbered or used for return values).
  * X86_64, on the other hand, already specifies a register-based calling
- * conventions, returning at %rax, with parameters going on %rdi, %rsi,
+ * conventions, returning at %rax, with parameters going in %rdi, %rsi,
  * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
  * special handling for dealing with 4 arguments, unlike i386.
- * However, x86_64 also have to clobber all caller saved registers, which
+ * However, x86_64 also has to clobber all caller saved registers, which
  * unfortunately, are quite a bit (r8 - r11)
  *
  * The call instruction itself is marked by placing its start address
@@ -360,22 +360,22 @@ int paravirt_disable_iospace(void);
  * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
  * It could be extended to more arguments, but there would be little
  * to be gained from that.  For each number of arguments, there are
- * the two VCALL and CALL variants for void and non-void functions.
+ * two VCALL and CALL variants for void and non-void functions.
  *
  * When there is a return value, the invoker of the macro must specify
  * the return type.  The macro then uses sizeof() on that type to
- * determine whether its a 32 or 64 bit value, and places the return
+ * determine whether it's a 32 or 64 bit value and places the return
  * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
- * 64-bit). For x86_64 machines, it just returns at %rax regardless of
+ * 64-bit). For x86_64 machines, it just returns in %rax regardless of
  * the return value size.
  *
- * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
+ * 64-bit arguments are passed as a pair of adjacent 32-bit arguments;
  * i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
  * in low,high order
  *
  * Small structures are passed and returned in registers.  The macro
  * calling convention can't directly deal with this, so the wrapper
- * functions must do this.
+ * functions must do it.
  *
  * These PVOP_* macros are only defined within this header.  This
  * means that all uses must be wrapped in inline functions.  This also
-- 
cgit v1.2.3


From 8c86f29bfb18465d15b05cfd26a6454ec787b793 Mon Sep 17 00:00:00 2001
From: Bill Wendling <morbo@google.com>
Date: Fri, 2 Sep 2022 21:37:50 +0000
Subject: x86/paravirt: add extra clobbers with ZERO_CALL_USED_REGS enabled

The ZERO_CALL_USED_REGS feature may zero out caller-saved registers
before returning.

In spurious_kernel_fault(), the "pte_offset_kernel()" call results in
this assembly code:

.Ltmp151:
        #APP
        # ALT: oldnstr
.Ltmp152:
.Ltmp153:
.Ltmp154:
        .section        .discard.retpoline_safe,"",@progbits
        .quad   .Ltmp154
        .text

        callq   *pv_ops+536(%rip)

.Ltmp155:
        .section        .parainstructions,"a",@progbits
        .p2align        3, 0x0
        .quad   .Ltmp153
        .byte   67
        .byte   .Ltmp155-.Ltmp153
        .short  1
        .text
.Ltmp156:
        # ALT: padding
        .zero   (-(((.Ltmp157-.Ltmp158)-(.Ltmp156-.Ltmp152))>0))*((.Ltmp157-.Ltmp158)-(.Ltmp156-.Ltmp152)),144
.Ltmp159:
        .section        .altinstructions,"a",@progbits
.Ltmp160:
        .long   .Ltmp152-.Ltmp160
.Ltmp161:
        .long   .Ltmp158-.Ltmp161
        .short  33040
        .byte   .Ltmp159-.Ltmp152
        .byte   .Ltmp157-.Ltmp158
        .text

        .section        .altinstr_replacement,"ax",@progbits
        # ALT: replacement 1
.Ltmp158:
        movq    %rdi, %rax
.Ltmp157:
        .text
        #NO_APP
.Ltmp162:
        testb   $-128, %dil

The "testb" here is using %dil, but the %rdi register was cleared before
returning from "callq *pv_ops+536(%rip)". Adding the proper constraints
results in the use of a different register:

        movq    %r11, %rdi

        # Similar to above.

        testb   $-128, %r11b

Link: https://github.com/KSPP/linux/issues/192
Signed-off-by: Bill Wendling <morbo@google.com>
Reported-and-tested-by: Nathan Chancellor <nathan@kernel.org>
Fixes: 035f7f87b729 ("randstruct: Enable Clang support")
Reviewed-by: Juergen Gross <jgross@suse.com>
Link: https://lore.kernel.org/lkml/fa6df43b-8a1a-8ad1-0236-94d2a0b588fa@suse.com/
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220902213750.1124421-3-morbo@google.com
---
 arch/x86/include/asm/paravirt_types.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index f04157456a49..b1ab5d94881b 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -414,8 +414,17 @@ int paravirt_disable_iospace(void);
 				"=c" (__ecx)
 #define PVOP_CALL_CLOBBERS	PVOP_VCALL_CLOBBERS, "=a" (__eax)
 
-/* void functions are still allowed [re]ax for scratch */
+/*
+ * void functions are still allowed [re]ax for scratch.
+ *
+ * The ZERO_CALL_USED REGS feature may end up zeroing out callee-saved
+ * registers. Make sure we model this with the appropriate clobbers.
+ */
+#ifdef CONFIG_ZERO_CALL_USED_REGS
+#define PVOP_VCALLEE_CLOBBERS	"=a" (__eax), PVOP_VCALL_CLOBBERS
+#else
 #define PVOP_VCALLEE_CLOBBERS	"=a" (__eax)
+#endif
 #define PVOP_CALLEE_CLOBBERS	PVOP_VCALLEE_CLOBBERS
 
 #define EXTRA_CLOBBERS	 , "r8", "r9", "r10", "r11"
-- 
cgit v1.2.3


From b95239ca4954a0d48b19c09ce7e8f31b453b4216 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 26 Sep 2022 14:41:01 -0400
Subject: nfsd: make nfsd4_run_cb a bool return function

queue_work can return false and not queue anything, if the work is
already queued. If that happens in the case of a CB_RECALL, we'll have
taken an extra reference to the stid that will never be put. Ensure we
throw a warning in that case.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4callback.c | 14 ++++++++++++--
 fs/nfsd/nfs4state.c    |  5 ++---
 fs/nfsd/state.h        |  2 +-
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4ce328209f61..f0e69edf5f0f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1371,11 +1371,21 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_holds_slot = false;
 }
 
-void nfsd4_run_cb(struct nfsd4_callback *cb)
+/**
+ * nfsd4_run_cb - queue up a callback job to run
+ * @cb: callback to queue
+ *
+ * Kick off a callback to do its thing. Returns false if it was already
+ * on a queue, true otherwise.
+ */
+bool nfsd4_run_cb(struct nfsd4_callback *cb)
 {
 	struct nfs4_client *clp = cb->cb_clp;
+	bool queued;
 
 	nfsd41_cb_inflight_begin(clp);
-	if (!nfsd4_queue_cb(cb))
+	queued = nfsd4_queue_cb(cb);
+	if (!queued)
 		nfsd41_cb_inflight_end(clp);
+	return queued;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e116f50afcf2..c78c3223161e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4859,14 +4859,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 	 * we know it's safe to take a reference.
 	 */
 	refcount_inc(&dp->dl_stid.sc_count);
-	nfsd4_run_cb(&dp->dl_recall);
+	WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
 }
 
 /* Called from break_lease() with flc_lock held. */
 static bool
 nfsd_break_deleg_cb(struct file_lock *fl)
 {
-	bool ret = false;
 	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
 	struct nfs4_client *clp = dp->dl_stid.sc_client;
@@ -4892,7 +4891,7 @@ nfsd_break_deleg_cb(struct file_lock *fl)
 	fp->fi_had_conflict = true;
 	nfsd_break_one_deleg(dp);
 	spin_unlock(&fp->fi_lock);
-	return ret;
+	return false;
 }
 
 /**
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index b3477087a9fc..e2daef3cc003 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -692,7 +692,7 @@ extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
-extern void nfsd4_run_cb(struct nfsd4_callback *cb);
+extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
-- 
cgit v1.2.3


From 895ddf5ed4c54ea9e3533606d7a8b4e4f27f95ef Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 26 Sep 2022 14:41:02 -0400
Subject: nfsd: extra checks when freeing delegation stateids

We've had some reports of problems in the refcounting for delegation
stateids that we've yet to track down. Add some extra checks to ensure
that we've removed the object from various lists before freeing it.

Link: https://bugzilla.redhat.com/show_bug.cgi?id=2127067
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4state.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index c78c3223161e..198d7abf34e4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1060,7 +1060,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
 
 static void nfs4_free_deleg(struct nfs4_stid *stid)
 {
-	WARN_ON(!list_empty(&stid->sc_cp_list));
+	struct nfs4_delegation *dp = delegstateid(stid);
+
+	WARN_ON_ONCE(!list_empty(&stid->sc_cp_list));
+	WARN_ON_ONCE(!list_empty(&dp->dl_perfile));
+	WARN_ON_ONCE(!list_empty(&dp->dl_perclnt));
+	WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru));
 	kmem_cache_free(deleg_slab, stid);
 	atomic_long_dec(&num_delegations);
 }
-- 
cgit v1.2.3


From bf7a87f1075f67c286f794519f0fedfa8b0b18cc Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Sep 2022 17:33:35 +0200
Subject: kprobes: Add new KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag

Adding KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag to indicate that
attach address is on function entry. This is used in following
changes in get_func_ip helper to return correct function address.

Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220926153340.1621984-2-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/kprobes.h | 1 +
 kernel/kprobes.c        | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 55041d2f884d..a0b92be98984 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -103,6 +103,7 @@ struct kprobe {
 				   * this flag is only for optimized_kprobe.
 				   */
 #define KPROBE_FLAG_FTRACE	8 /* probe is using ftrace */
+#define KPROBE_FLAG_ON_FUNC_ENTRY	16 /* probe is on the function entry */
 
 /* Has this kprobe gone ? */
 static inline bool kprobe_gone(struct kprobe *p)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 08350e35aba2..51adc3c94503 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1606,9 +1606,10 @@ int register_kprobe(struct kprobe *p)
 	struct kprobe *old_p;
 	struct module *probed_mod;
 	kprobe_opcode_t *addr;
+	bool on_func_entry;
 
 	/* Adjust probe address from symbol */
-	addr = kprobe_addr(p);
+	addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
 	p->addr = addr;
@@ -1628,6 +1629,9 @@ int register_kprobe(struct kprobe *p)
 
 	mutex_lock(&kprobe_mutex);
 
+	if (on_func_entry)
+		p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
+
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
 		/* Since this may unoptimize 'old_p', locking 'text_mutex'. */
-- 
cgit v1.2.3


From 9d68c19c57d690547cde977bb3d9ccd3ceb6afe9 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Sep 2022 17:33:36 +0200
Subject: ftrace: Keep the resolved addr in kallsyms_callback

Keeping the resolved 'addr' in kallsyms_callback, instead of taking
ftrace_location value, because we depend on symbol address in the
cookie related code.

With CONFIG_X86_KERNEL_IBT option the ftrace_location value differs
from symbol address, which screwes the symbol address cookies matching.

There are 2 users of this function:
- bpf_kprobe_multi_link_attach
    for which this fix is for

- get_ftrace_locations
    which is used by register_fprobe_syms

    this function needs to get symbols resolved to addresses,
    but does not need 'ftrace location addresses' at this point
    there's another ftrace location translation in the path done
    by ftrace_set_filter_ips call:

     register_fprobe_syms
       addrs = get_ftrace_locations

       register_fprobe_ips(addrs)
         ...
         ftrace_set_filter_ips
           ...
             __ftrace_match_addr
               ip = ftrace_location(ip);
               ...

Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220926153340.1621984-3-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/ftrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 439e2ab6905e..447d2e2a8549 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -8265,8 +8265,7 @@ static int kallsyms_callback(void *data, const char *name,
 	if (args->addrs[idx])
 		return 0;
 
-	addr = ftrace_location(addr);
-	if (!addr)
+	if (!ftrace_location(addr))
 		return 0;
 
 	args->addrs[idx] = addr;
-- 
cgit v1.2.3


From 4d854f4f31ec4b317dfe316111ddac0fab81f735 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Sep 2022 17:33:37 +0200
Subject: bpf: Use given function address for trampoline ip arg

Using function address given at the generation time as the trampoline
ip argument. This way we get directly the function address that we
need, so we don't need to:
  - read the ip from the stack
  - subtract X86_PATCH_SIZE
  - subtract ENDBR_INSN_SIZE if CONFIG_X86_KERNEL_IBT is enabled
    which is not even implemented yet ;-)

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220926153340.1621984-4-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ae89f4143eb4..d4a6183197e9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -662,7 +662,7 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
 		 */
 		emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
 	} else {
-		/* movabsq %rax, imm64 */
+		/* movabsq rax, imm64 */
 		EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
 		EMIT(imm32_lo, 4);
 		EMIT(imm32_hi, 4);
@@ -2039,13 +2039,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
-				void *orig_call)
+				void *func_addr)
 {
 	int ret, i, nr_args = m->nr_args, extra_nregs = 0;
 	int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
 	struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
 	struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
 	struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+	void *orig_call = func_addr;
 	u8 **branches = NULL;
 	u8 *prog;
 	bool save_ret;
@@ -2126,12 +2127,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 
 	if (flags & BPF_TRAMP_F_IP_ARG) {
 		/* Store IP address of the traced function:
-		 * mov rax, QWORD PTR [rbp + 8]
-		 * sub rax, X86_PATCH_SIZE
+		 * movabsq rax, func_addr
 		 * mov QWORD PTR [rbp - ip_off], rax
 		 */
-		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
-		EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
+		emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr);
 		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
 	}
 
-- 
cgit v1.2.3


From c09eb2e578eb1668bbc84dc07e8d8bd6f04b9a02 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Sep 2022 17:33:38 +0200
Subject: bpf: Adjust kprobe_multi entry_ip for CONFIG_X86_KERNEL_IBT

Martynas reported bpf_get_func_ip returning +4 address when
CONFIG_X86_KERNEL_IBT option is enabled.

When CONFIG_X86_KERNEL_IBT is enabled we'll have endbr instruction
at the function entry, which screws return value of bpf_get_func_ip()
helper that should return the function address.

There's short term workaround for kprobe_multi bpf program made by
Alexei [1], but we need this fixup also for bpf_get_attach_cookie,
that returns cookie based on the entry_ip value.

Moving the fixup in the fprobe handler, so both bpf_get_func_ip
and bpf_get_attach_cookie get expected function address when
CONFIG_X86_KERNEL_IBT option is enabled.

Also renaming kprobe_multi_link_handler entry_ip argument to fentry_ip
so it's clearer this is an ftrace __fentry__ ip.

[1] commit 7f0059b58f02 ("selftests/bpf: Fix kprobe_multi test.")

Cc: Peter Zijlstra <peterz@infradead.org>
Reported-by: Martynas Pumputis <m@lambda.lt>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220926153340.1621984-5-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/trace/bpf_trace.c                         | 20 ++++++++++++++++++--
 tools/testing/selftests/bpf/progs/kprobe_multi.c |  4 +---
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b05f0310dbd3..ebd1b348beb3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1028,6 +1028,22 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 };
 
+#ifdef CONFIG_X86_KERNEL_IBT
+static unsigned long get_entry_ip(unsigned long fentry_ip)
+{
+	u32 instr;
+
+	/* Being extra safe in here in case entry ip is on the page-edge. */
+	if (get_kernel_nofault(instr, (u32 *) fentry_ip - 1))
+		return fentry_ip;
+	if (is_endbr(instr))
+		fentry_ip -= ENDBR_INSN_SIZE;
+	return fentry_ip;
+}
+#else
+#define get_entry_ip(fentry_ip) fentry_ip
+#endif
+
 BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
 {
 	struct kprobe *kp = kprobe_running();
@@ -2600,13 +2616,13 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
 }
 
 static void
-kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
 			  struct pt_regs *regs)
 {
 	struct bpf_kprobe_multi_link *link;
 
 	link = container_of(fp, struct bpf_kprobe_multi_link, fp);
-	kprobe_multi_link_prog_run(link, entry_ip, regs);
+	kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
 }
 
 static int symbols_cmp_r(const void *a, const void *b, const void *priv)
diff --git a/tools/testing/selftests/bpf/progs/kprobe_multi.c b/tools/testing/selftests/bpf/progs/kprobe_multi.c
index 08f95a8155d1..98c3399e15c0 100644
--- a/tools/testing/selftests/bpf/progs/kprobe_multi.c
+++ b/tools/testing/selftests/bpf/progs/kprobe_multi.c
@@ -36,15 +36,13 @@ __u64 kretprobe_test6_result = 0;
 __u64 kretprobe_test7_result = 0;
 __u64 kretprobe_test8_result = 0;
 
-extern bool CONFIG_X86_KERNEL_IBT __kconfig __weak;
-
 static void kprobe_multi_check(void *ctx, bool is_return)
 {
 	if (bpf_get_current_pid_tgid() >> 32 != pid)
 		return;
 
 	__u64 cookie = test_cookie ? bpf_get_attach_cookie(ctx) : 0;
-	__u64 addr = bpf_get_func_ip(ctx) - (CONFIG_X86_KERNEL_IBT ? 4 : 0);
+	__u64 addr = bpf_get_func_ip(ctx);
 
 #define SET(__var, __addr, __cookie) ({			\
 	if (((const void *) addr == __addr) &&		\
-- 
cgit v1.2.3


From 0e253f7e558a3e250902ba2034091e0185448836 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Sep 2022 17:33:39 +0200
Subject: bpf: Return value in kprobe get_func_ip only for entry address

Changing return value of kprobe's version of bpf_get_func_ip
to return zero if the attach address is not on the function's
entry point.

For kprobes attached in the middle of the function we can't easily
get to the function address especially now with the CONFIG_X86_KERNEL_IBT
support.

If user cares about current IP for kprobes attached within the
function body, they can get it with PT_REGS_IP(ctx).

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Martynas Pumputis <m@lambda.lt>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220926153340.1621984-6-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h                             | 1 +
 kernel/trace/bpf_trace.c                             | 5 ++++-
 tools/include/uapi/linux/bpf.h                       | 1 +
 tools/testing/selftests/bpf/progs/get_func_ip_test.c | 4 ++--
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ead35f39f185..d6bd10759eaf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -4951,6 +4951,7 @@ union bpf_attr {
  * 		Get address of the traced function (for tracing and kprobe programs).
  * 	Return
  * 		Address of the traced function.
+ * 		0 for kprobes placed within the function (not at the entry).
  *
  * u64 bpf_get_attach_cookie(void *ctx)
  * 	Description
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ebd1b348beb3..688552df95ca 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1048,7 +1048,10 @@ BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
 {
 	struct kprobe *kp = kprobe_running();
 
-	return kp ? (uintptr_t)kp->addr : 0;
+	if (!kp || !(kp->flags & KPROBE_FLAG_ON_FUNC_ENTRY))
+		return 0;
+
+	return get_entry_ip((uintptr_t)kp->addr);
 }
 
 static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index ead35f39f185..d6bd10759eaf 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -4951,6 +4951,7 @@ union bpf_attr {
  * 		Get address of the traced function (for tracing and kprobe programs).
  * 	Return
  * 		Address of the traced function.
+ * 		0 for kprobes placed within the function (not at the entry).
  *
  * u64 bpf_get_attach_cookie(void *ctx)
  * 	Description
diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
index a587aeca5ae0..6db70757bc8b 100644
--- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
@@ -69,7 +69,7 @@ int test6(struct pt_regs *ctx)
 {
 	__u64 addr = bpf_get_func_ip(ctx);
 
-	test6_result = (const void *) addr == &bpf_fentry_test6 + 5;
+	test6_result = (const void *) addr == 0;
 	return 0;
 }
 
@@ -79,6 +79,6 @@ int test7(struct pt_regs *ctx)
 {
 	__u64 addr = bpf_get_func_ip(ctx);
 
-	test7_result = (const void *) addr == &bpf_fentry_test7 + 5;
+	test7_result = (const void *) addr == 0;
 	return 0;
 }
-- 
cgit v1.2.3


From 738c345b74b8d11edd01b6cee5628c6b8368d8ea Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Mon, 26 Sep 2022 17:33:40 +0200
Subject: selftests/bpf: Fix get_func_ip offset test for CONFIG_X86_KERNEL_IBT

With CONFIG_X86_KERNEL_IBT enabled the test for kprobe with offset
won't work because of the extra endbr instruction.

As suggested by Andrii adding CONFIG_X86_KERNEL_IBT detection
and using appropriate offset value based on that.

Also removing test7 program, because it does the same as test6.

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20220926153340.1621984-7-jolsa@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../selftests/bpf/prog_tests/get_func_ip_test.c    | 59 ++++++++++++++++++----
 .../testing/selftests/bpf/progs/get_func_ip_test.c | 23 +++++----
 2 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
index 938dbd4d7c2f..fede8ef58b5b 100644
--- a/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/prog_tests/get_func_ip_test.c
@@ -2,7 +2,7 @@
 #include <test_progs.h>
 #include "get_func_ip_test.skel.h"
 
-void test_get_func_ip_test(void)
+static void test_function_entry(void)
 {
 	struct get_func_ip_test *skel = NULL;
 	int err, prog_fd;
@@ -12,14 +12,6 @@ void test_get_func_ip_test(void)
 	if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open"))
 		return;
 
-	/* test6 is x86_64 specifc because of the instruction
-	 * offset, disabling it for all other archs
-	 */
-#ifndef __x86_64__
-	bpf_program__set_autoload(skel->progs.test6, false);
-	bpf_program__set_autoload(skel->progs.test7, false);
-#endif
-
 	err = get_func_ip_test__load(skel);
 	if (!ASSERT_OK(err, "get_func_ip_test__load"))
 		goto cleanup;
@@ -43,11 +35,56 @@ void test_get_func_ip_test(void)
 	ASSERT_EQ(skel->bss->test3_result, 1, "test3_result");
 	ASSERT_EQ(skel->bss->test4_result, 1, "test4_result");
 	ASSERT_EQ(skel->bss->test5_result, 1, "test5_result");
+
+cleanup:
+	get_func_ip_test__destroy(skel);
+}
+
+/* test6 is x86_64 specific because of the instruction
+ * offset, disabling it for all other archs
+ */
 #ifdef __x86_64__
+static void test_function_body(void)
+{
+	struct get_func_ip_test *skel = NULL;
+	LIBBPF_OPTS(bpf_test_run_opts, topts);
+	LIBBPF_OPTS(bpf_kprobe_opts, kopts);
+	struct bpf_link *link6 = NULL;
+	int err, prog_fd;
+
+	skel = get_func_ip_test__open();
+	if (!ASSERT_OK_PTR(skel, "get_func_ip_test__open"))
+		return;
+
+	bpf_program__set_autoload(skel->progs.test6, true);
+
+	err = get_func_ip_test__load(skel);
+	if (!ASSERT_OK(err, "get_func_ip_test__load"))
+		goto cleanup;
+
+	kopts.offset = skel->kconfig->CONFIG_X86_KERNEL_IBT ? 9 : 5;
+
+	link6 = bpf_program__attach_kprobe_opts(skel->progs.test6, "bpf_fentry_test6", &kopts);
+	if (!ASSERT_OK_PTR(link6, "link6"))
+		goto cleanup;
+
+	prog_fd = bpf_program__fd(skel->progs.test1);
+	err = bpf_prog_test_run_opts(prog_fd, &topts);
+	ASSERT_OK(err, "test_run");
+	ASSERT_EQ(topts.retval, 0, "test_run");
+
 	ASSERT_EQ(skel->bss->test6_result, 1, "test6_result");
-	ASSERT_EQ(skel->bss->test7_result, 1, "test7_result");
-#endif
 
 cleanup:
+	bpf_link__destroy(link6);
 	get_func_ip_test__destroy(skel);
 }
+#else
+#define test_function_body()
+#endif
+
+void test_get_func_ip_test(void)
+{
+	test_function_entry();
+	test_function_body();
+}
diff --git a/tools/testing/selftests/bpf/progs/get_func_ip_test.c b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
index 6db70757bc8b..8559e698b40d 100644
--- a/tools/testing/selftests/bpf/progs/get_func_ip_test.c
+++ b/tools/testing/selftests/bpf/progs/get_func_ip_test.c
@@ -2,6 +2,7 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
+#include <stdbool.h>
 
 char _license[] SEC("license") = "GPL";
 
@@ -13,6 +14,16 @@ extern const void bpf_modify_return_test __ksym;
 extern const void bpf_fentry_test6 __ksym;
 extern const void bpf_fentry_test7 __ksym;
 
+extern bool CONFIG_X86_KERNEL_IBT __kconfig __weak;
+
+/* This function is here to have CONFIG_X86_KERNEL_IBT
+ * used and added to object BTF.
+ */
+int unused(void)
+{
+	return CONFIG_X86_KERNEL_IBT ? 0 : 1;
+}
+
 __u64 test1_result = 0;
 SEC("fentry/bpf_fentry_test1")
 int BPF_PROG(test1, int a)
@@ -64,7 +75,7 @@ int BPF_PROG(test5, int a, int *b, int ret)
 }
 
 __u64 test6_result = 0;
-SEC("kprobe/bpf_fentry_test6+0x5")
+SEC("?kprobe")
 int test6(struct pt_regs *ctx)
 {
 	__u64 addr = bpf_get_func_ip(ctx);
@@ -72,13 +83,3 @@ int test6(struct pt_regs *ctx)
 	test6_result = (const void *) addr == 0;
 	return 0;
 }
-
-__u64 test7_result = 0;
-SEC("kprobe/bpf_fentry_test7+5")
-int test7(struct pt_regs *ctx)
-{
-	__u64 addr = bpf_get_func_ip(ctx);
-
-	test7_result = (const void *) addr == 0;
-	return 0;
-}
-- 
cgit v1.2.3


From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 26 Sep 2022 11:47:38 -0700
Subject: bpf: use bpf_prog_pack for bpf_dispatcher

Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher
can share pages with bpf programs.

arch_prepare_bpf_dispatcher() is updated to provide a RW buffer as working
area for arch code to write to.

This also fixes CPA W^X warnning like:

CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ...

Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 16 ++++++++--------
 include/linux/bpf.h         |  3 ++-
 include/linux/filter.h      |  5 +++++
 kernel/bpf/core.c           |  9 +++++++--
 kernel/bpf/dispatcher.c     | 27 +++++++++++++++++++++------
 5 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index d4a6183197e9..35796db58116 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2242,7 +2242,7 @@ cleanup:
 	return ret;
 }
 
-static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
 {
 	u8 *jg_reloc, *prog = *pprog;
 	int pivot, err, jg_bytes = 1;
@@ -2258,12 +2258,12 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
 		EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
 			    progs[a]);
 		err = emit_cond_near_jump(&prog,	/* je func */
-					  (void *)progs[a], prog,
+					  (void *)progs[a], image + (prog - buf),
 					  X86_JE);
 		if (err)
 			return err;
 
-		emit_indirect_jump(&prog, 2 /* rdx */, prog);
+		emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf));
 
 		*pprog = prog;
 		return 0;
@@ -2288,7 +2288,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
 	jg_reloc = prog;
 
 	err = emit_bpf_dispatcher(&prog, a, a + pivot,	/* emit lower_part */
-				  progs);
+				  progs, image, buf);
 	if (err)
 		return err;
 
@@ -2302,7 +2302,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
 	emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
 
 	err = emit_bpf_dispatcher(&prog, a + pivot + 1,	/* emit upper_part */
-				  b, progs);
+				  b, progs, image, buf);
 	if (err)
 		return err;
 
@@ -2322,12 +2322,12 @@ static int cmp_ips(const void *a, const void *b)
 	return 0;
 }
 
-int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
 {
-	u8 *prog = image;
+	u8 *prog = buf;
 
 	sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
-	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
+	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
 }
 
 struct x64_jit_data {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index edd43edb27d6..9ae155c75014 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -946,6 +946,7 @@ struct bpf_dispatcher {
 	struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX];
 	int num_progs;
 	void *image;
+	void *rw_image;
 	u32 image_off;
 	struct bpf_ksym ksym;
 };
@@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin
 struct bpf_trampoline *bpf_trampoline_get(u64 key,
 					  struct bpf_attach_target_info *tgt_info);
 void bpf_trampoline_put(struct bpf_trampoline *tr);
-int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs);
+int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs);
 #define BPF_DISPATCHER_INIT(_name) {				\
 	.mutex = __MUTEX_INITIALIZER(_name.mutex),		\
 	.func = &_name##_func,					\
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 98e28126c24b..efc42a6e3aed 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size);
+
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
@@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp);
 struct bpf_binary_header *
 bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
+void bpf_prog_pack_free(struct bpf_binary_header *hdr);
+
 static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {
 	return list_empty(&fp->aux->ksym.lnode) ||
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d1be78c28619..711fd293b6de 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -825,6 +825,11 @@ struct bpf_prog_pack {
 	unsigned long bitmap[];
 };
 
+void bpf_jit_fill_hole_with_zero(void *area, unsigned int size)
+{
+	memset(area, 0, size);
+}
+
 #define BPF_PROG_SIZE_TO_NBITS(size)	(round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
 
 static DEFINE_MUTEX(pack_mutex);
@@ -864,7 +869,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
 	return pack;
 }
 
-static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
+void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
 	struct bpf_prog_pack *pack;
@@ -905,7 +910,7 @@ out:
 	return ptr;
 }
 
-static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+void bpf_prog_pack_free(struct bpf_binary_header *hdr)
 {
 	struct bpf_prog_pack *pack = NULL, *tmp;
 	unsigned int nbits;
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 2444bd15cc2d..fa64b80b8bca 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -85,12 +85,12 @@ static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
 	return false;
 }
 
-int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
 {
 	return -ENOTSUPP;
 }
 
-static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
 {
 	s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
 	int i;
@@ -99,12 +99,12 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
 		if (d->progs[i].prog)
 			*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
 	}
-	return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+	return arch_prepare_bpf_dispatcher(image, buf, &ips[0], d->num_progs);
 }
 
 static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
 {
-	void *old, *new;
+	void *old, *new, *tmp;
 	u32 noff;
 	int err;
 
@@ -117,8 +117,14 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
 	}
 
 	new = d->num_progs ? d->image + noff : NULL;
+	tmp = d->num_progs ? d->rw_image + noff : NULL;
 	if (new) {
-		if (bpf_dispatcher_prepare(d, new))
+		/* Prepare the dispatcher in d->rw_image. Then use
+		 * bpf_arch_text_copy to update d->image, which is RO+X.
+		 */
+		if (bpf_dispatcher_prepare(d, new, tmp))
+			return;
+		if (IS_ERR(bpf_arch_text_copy(new, tmp, PAGE_SIZE / 2)))
 			return;
 	}
 
@@ -140,9 +146,18 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 
 	mutex_lock(&d->mutex);
 	if (!d->image) {
-		d->image = bpf_jit_alloc_exec_page();
+		d->image = bpf_prog_pack_alloc(PAGE_SIZE, bpf_jit_fill_hole_with_zero);
 		if (!d->image)
 			goto out;
+		d->rw_image = bpf_jit_alloc_exec(PAGE_SIZE);
+		if (!d->rw_image) {
+			u32 size = PAGE_SIZE;
+
+			bpf_arch_text_copy(d->image, &size, sizeof(size));
+			bpf_prog_pack_free((struct bpf_binary_header *)d->image);
+			d->image = NULL;
+			goto out;
+		}
 		bpf_image_ksym_add(d->image, &d->ksym);
 	}
 
-- 
cgit v1.2.3


From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 26 Sep 2022 11:47:39 -0700
Subject: bpf: Enforce W^X for bpf trampoline

Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that
the trampoine follows W^X rule strictly. This will turn off warnings like

CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ...

Also remove bpf_jit_alloc_exec_page(), since it is not used any more.

Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h     |  1 -
 kernel/bpf/trampoline.c | 22 +++++-----------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9ae155c75014..5161fac0513f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func
 void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
 				struct bpf_prog *to);
 /* Called only from JIT-enabled code, so there's no need for stubs. */
-void *bpf_jit_alloc_exec_page(void);
 void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
 void bpf_image_ksym_del(struct bpf_ksym *ksym);
 void bpf_ksym_add(struct bpf_ksym *ksym);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 41b67eb83ab3..6f7b939321d6 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -116,22 +116,6 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
 		(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
 }
 
-void *bpf_jit_alloc_exec_page(void)
-{
-	void *image;
-
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
-	if (!image)
-		return NULL;
-
-	set_vm_flush_reset_perms(image);
-	/* Keep image as writeable. The alternative is to keep flipping ro/rw
-	 * every time new program is attached or detached.
-	 */
-	set_memory_x((long)image, 1);
-	return image;
-}
-
 void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
 {
 	ksym->start = (unsigned long) data;
@@ -404,9 +388,10 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
 		goto out_free_im;
 
 	err = -ENOMEM;
-	im->image = image = bpf_jit_alloc_exec_page();
+	im->image = image = bpf_jit_alloc_exec(PAGE_SIZE);
 	if (!image)
 		goto out_uncharge;
+	set_vm_flush_reset_perms(image);
 
 	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
 	if (err)
@@ -483,6 +468,9 @@ again:
 	if (err < 0)
 		goto out;
 
+	set_memory_ro((long)im->image, 1);
+	set_memory_x((long)im->image, 1);
+
 	WARN_ON(tr->cur_image && tr->selector == 0);
 	WARN_ON(!tr->cur_image && tr->selector);
 	if (tr->cur_image)
-- 
cgit v1.2.3


From 6a4ab8869d0bfcf83d7c5184561df8235553cf28 Mon Sep 17 00:00:00 2001
From: Jon Doron <jond@wiz.io>
Date: Sun, 25 Sep 2022 10:04:31 +0300
Subject: libbpf: Fix the case of running as non-root with capabilities

When running rootless with special capabilities like:
FOWNER / DAC_OVERRIDE / DAC_READ_SEARCH

The "access" API will not make the proper check if there is really
access to a file or not.

>From the access man page:
"
The check is done using the calling process's real UID and GID, rather
than the effective IDs as is done when actually attempting an operation
(e.g., open(2)) on the file.  Similarly, for the root user, the check
uses the set of permitted capabilities  rather than the set of effective
capabilities; ***and for non-root users, the check uses an empty set of
capabilities.***
"

What that means is that for non-root user the access API will not do the
proper validation if the process really has permission to a file or not.

To resolve this this patch replaces all the access API calls with
faccessat with AT_EACCESS flag.

Signed-off-by: Jon Doron <jond@wiz.io>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220925070431.1313680-1-arilou@gmail.com
---
 tools/lib/bpf/btf.c    | 2 +-
 tools/lib/bpf/libbpf.c | 6 +++---
 tools/lib/bpf/usdt.c   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index b4d9a96c3c1b..d88647da2c7f 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -4664,7 +4664,7 @@ struct btf *btf__load_vmlinux_btf(void)
 	for (i = 0; i < ARRAY_SIZE(locations); i++) {
 		snprintf(path, PATH_MAX, locations[i], buf.release);
 
-		if (access(path, R_OK))
+		if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS))
 			continue;
 
 		btf = btf__parse(path, NULL);
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e691f08a297f..184ce1684dcd 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -884,7 +884,7 @@ __u32 get_kernel_version(void)
 	__u32 major, minor, patch;
 	struct utsname info;
 
-	if (access(ubuntu_kver_file, R_OK) == 0) {
+	if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) == 0) {
 		FILE *f;
 
 		f = fopen(ubuntu_kver_file, "r");
@@ -9904,7 +9904,7 @@ static bool use_debugfs(void)
 	static int has_debugfs = -1;
 
 	if (has_debugfs < 0)
-		has_debugfs = access(DEBUGFS, F_OK) == 0;
+		has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0;
 
 	return has_debugfs == 1;
 }
@@ -10721,7 +10721,7 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz)
 				continue;
 			snprintf(result, result_sz, "%.*s/%s", seg_len, s, file);
 			/* ensure it has required permissions */
-			if (access(result, perm) < 0)
+			if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0)
 				continue;
 			pr_debug("resolved '%s' to '%s'\n", file, result);
 			return 0;
diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
index d18e37982344..e83b497c2245 100644
--- a/tools/lib/bpf/usdt.c
+++ b/tools/lib/bpf/usdt.c
@@ -282,7 +282,7 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj)
 	 * If this is not supported, USDTs with semaphores will not be supported.
 	 * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe")
 	 */
-	man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0;
+	man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0;
 
 	return man;
 }
-- 
cgit v1.2.3


From 31da107fdb0a01b889af41cd94a9904e1f6ffaa6 Mon Sep 17 00:00:00 2001
From: Yue Hu <huyue2@coolpad.com>
Date: Tue, 27 Sep 2022 11:25:18 +0800
Subject: erofs: fold in z_erofs_reload_indexes()

The name of this function looks not very accurate compared to it's
implementation and it's only a wrapper to erofs_read_metabuf(). So,
let's fold it directly instead.

Signed-off-by: Yue Hu <huyue2@coolpad.com>
Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20220927032518.25266-1-zbestahu@gmail.com
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/zmap.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index ccdddb755be8..4cecd32b87c6 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -166,18 +166,6 @@ struct z_erofs_maprecorder {
 	bool partialref;
 };
 
-static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
-				  erofs_blk_t eblk)
-{
-	struct super_block *const sb = m->inode->i_sb;
-
-	m->kaddr = erofs_read_metabuf(&m->map->buf, sb, eblk,
-				      EROFS_KMAP_ATOMIC);
-	if (IS_ERR(m->kaddr))
-		return PTR_ERR(m->kaddr);
-	return 0;
-}
-
 static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 					 unsigned long lcn)
 {
@@ -190,11 +178,11 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 		lcn * sizeof(struct z_erofs_vle_decompressed_index);
 	struct z_erofs_vle_decompressed_index *di;
 	unsigned int advise, type;
-	int err;
 
-	err = z_erofs_reload_indexes(m, erofs_blknr(pos));
-	if (err)
-		return err;
+	m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+				      erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+	if (IS_ERR(m->kaddr))
+		return PTR_ERR(m->kaddr);
 
 	m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
 	m->lcn = lcn;
@@ -393,7 +381,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 	unsigned int compacted_4b_initial, compacted_2b;
 	unsigned int amortizedshift;
 	erofs_off_t pos;
-	int err;
 
 	if (lclusterbits != 12)
 		return -EOPNOTSUPP;
@@ -430,9 +417,10 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 	amortizedshift = 2;
 out:
 	pos += lcn * (1 << amortizedshift);
-	err = z_erofs_reload_indexes(m, erofs_blknr(pos));
-	if (err)
-		return err;
+	m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+				      erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+	if (IS_ERR(m->kaddr))
+		return PTR_ERR(m->kaddr);
 	return unpack_compacted_index(m, amortizedshift, pos, lookahead);
 }
 
-- 
cgit v1.2.3


From c292a337d0e45a292c301e3cd51c35aa0ae91e95 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mikelley@microsoft.com>
Date: Thu, 22 Sep 2022 21:49:09 -0700
Subject: nvme: Fix IOC_PR_CLEAR and IOC_PR_RELEASE ioctls for nvme devices

The IOC_PR_CLEAR and IOC_PR_RELEASE ioctls are
non-functional on NVMe devices because the nvme_pr_clear()
and nvme_pr_release() functions set the IEKEY field incorrectly.
The IEKEY field should be set only when the key is zero (i.e,
not specified).  The current code does it backwards.

Furthermore, the NVMe spec describes the persistent
reservation "clear" function as an option on the reservation
release command. The current implementation of nvme_pr_clear()
erroneously uses the reservation register command.

Fix these errors. Note that NVMe version 1.3 and later specify
that setting the IEKEY field will return an error of Invalid
Field in Command.  The fix will set IEKEY when the key is zero,
which is appropriate as these ioctls consider a zero key to
be "unspecified", and the intention of the spec change is
to require a valid key.

Tested on a version 1.4 PCI NVMe device in an Azure VM.

Fixes: 1673f1f08c88 ("nvme: move block_device_operations and ns/ctrl freeing to common code")
Fixes: 1d277a637a71 ("NVMe: Add persistent reservation ops")
Signed-off-by: Michael Kelley <mikelley@microsoft.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 70ebf27ad10e..4754188d9b04 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2162,14 +2162,14 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 
 static int nvme_pr_clear(struct block_device *bdev, u64 key)
 {
-	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
+	u32 cdw10 = 1 | (key ? 0 : 1 << 3);
 
-	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
 }
 
 static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
 {
-	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
+	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 0 : 1 << 3);
 
 	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
 }
-- 
cgit v1.2.3


From d14c273132aec81a1a8107c9ab4865b89e7910a7 Mon Sep 17 00:00:00 2001
From: Tina Hsu <tina_hsu@phison.corp-partner.google.com>
Date: Thu, 22 Sep 2022 14:16:30 +0800
Subject: nvme-pci: disable Write Zeroes on Phison E3C/E4C

E3C/E4C SSDs do support the Write Zeroes command in theory, but have very
bad performance when using it.  As the firmware has been frozen for these
products we can not expect firmware improvements for it, so disable
Write Zeroes.

Signed-off-by: Tina Hsu <tina_hsu@phison.corp-partner.google.com>
[hch: update the commit message]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/pci.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ca1560240123..3bdb97205699 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3475,6 +3475,10 @@ static const struct pci_device_id nvme_id_table[] = {
 	{ PCI_DEVICE(0x1987, 0x5016),	/* Phison E16 */
 		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN |
 				NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x1987, 0x5019),  /* phison E19 */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+	{ PCI_DEVICE(0x1987, 0x5021),   /* Phison E21 */
+		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
 	{ PCI_DEVICE(0x1b4b, 0x1092),	/* Lexar 256 GB SSD */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
 				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
-- 
cgit v1.2.3


From 4e768c8e34e639cff66a0f175bc4aebf472e4305 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Mon, 21 Mar 2022 08:33:56 +0000
Subject: media: v4l2-compat-ioctl32.c: zero buffer passed to
 v4l2_compat_get_array_args()

The v4l2_compat_get_array_args() function can leave uninitialized memory in the
buffer it is passed. So zero it before copying array elements from userspace
into the buffer.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reported-by: syzbot+ff18193ff05f3f87f226@syzkaller.appspotmail.com
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
index 0f3d6b5667b0..55c26e7d370e 100644
--- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
+++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c
@@ -1040,6 +1040,8 @@ int v4l2_compat_get_array_args(struct file *file, void *mbuf,
 {
 	int err = 0;
 
+	memset(mbuf, 0, array_size);
+
 	switch (cmd) {
 	case VIDIOC_G_FMT32:
 	case VIDIOC_S_FMT32:
-- 
cgit v1.2.3


From f0da34f32920c16f34acb692cf1646465bd40c04 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Wed, 18 May 2022 14:06:31 +0100
Subject: media: v4l2-ioctl.c: fix incorrect error path

If allocating array_buf fails, or copying data from userspace into that
buffer fails, then just free memory and return the error. Don't attempt
to call video_put_user() since there is no point, and it would copy back
data on error even if INFO_FL_ALWAYS_COPY wasn't set.

So if writing the array back to userspace fails, then don't go to
out_array_args, instead just continue with the regular code that just
returns the error unless 'always_copy' is set.

Update the VIDIOC_G/S/TRY_EXT_CTRLS ioctls to set the ALWAYS_COPY flag
since they now need it. Before this worked due to this buggy code, but
now that that is fixed these ioctls need to set this flag explicitly.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-ioctl.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index c314025d977e..e6fd355a2e92 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -2872,9 +2872,9 @@ static const struct v4l2_ioctl_info v4l2_ioctls[] = {
 	IOCTL_INFO(VIDIOC_S_PRIORITY, v4l_s_priority, v4l_print_u32, INFO_FL_PRIO),
 	IOCTL_INFO(VIDIOC_G_SLICED_VBI_CAP, v4l_g_sliced_vbi_cap, v4l_print_sliced_vbi_cap, INFO_FL_CLEAR(v4l2_sliced_vbi_cap, type)),
 	IOCTL_INFO(VIDIOC_LOG_STATUS, v4l_log_status, v4l_print_newline, 0),
-	IOCTL_INFO(VIDIOC_G_EXT_CTRLS, v4l_g_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL),
-	IOCTL_INFO(VIDIOC_S_EXT_CTRLS, v4l_s_ext_ctrls, v4l_print_ext_controls, INFO_FL_PRIO | INFO_FL_CTRL),
-	IOCTL_INFO(VIDIOC_TRY_EXT_CTRLS, v4l_try_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL),
+	IOCTL_INFO(VIDIOC_G_EXT_CTRLS, v4l_g_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL | INFO_FL_ALWAYS_COPY),
+	IOCTL_INFO(VIDIOC_S_EXT_CTRLS, v4l_s_ext_ctrls, v4l_print_ext_controls, INFO_FL_PRIO | INFO_FL_CTRL | INFO_FL_ALWAYS_COPY),
+	IOCTL_INFO(VIDIOC_TRY_EXT_CTRLS, v4l_try_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL | INFO_FL_ALWAYS_COPY),
 	IOCTL_INFO(VIDIOC_ENUM_FRAMESIZES, v4l_stub_enum_framesizes, v4l_print_frmsizeenum, INFO_FL_CLEAR(v4l2_frmsizeenum, pixel_format)),
 	IOCTL_INFO(VIDIOC_ENUM_FRAMEINTERVALS, v4l_stub_enum_frameintervals, v4l_print_frmivalenum, INFO_FL_CLEAR(v4l2_frmivalenum, height)),
 	IOCTL_INFO(VIDIOC_G_ENC_INDEX, v4l_stub_g_enc_index, v4l_print_enc_idx, 0),
@@ -3367,8 +3367,7 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg,
 		array_buf = kvmalloc(array_size, GFP_KERNEL);
 		err = -ENOMEM;
 		if (array_buf == NULL)
-			goto out_array_args;
-		err = -EFAULT;
+			goto out;
 		if (in_compat_syscall())
 			err = v4l2_compat_get_array_args(file, array_buf,
 							 user_ptr, array_size,
@@ -3377,7 +3376,7 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg,
 			err = copy_from_user(array_buf, user_ptr, array_size) ?
 								-EFAULT : 0;
 		if (err)
-			goto out_array_args;
+			goto out;
 		*kernel_ptr = array_buf;
 	}
 
@@ -3395,6 +3394,13 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg,
 			trace_v4l2_qbuf(video_devdata(file)->minor, parg);
 	}
 
+	/*
+	 * Some ioctls can return an error, but still have valid
+	 * results that must be returned.
+	 */
+	if (err < 0 && !always_copy)
+		goto out;
+
 	if (has_array_args) {
 		*kernel_ptr = (void __force *)user_ptr;
 		if (in_compat_syscall()) {
@@ -3409,16 +3415,8 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg,
 		} else if (copy_to_user(user_ptr, array_buf, array_size)) {
 			err = -EFAULT;
 		}
-		goto out_array_args;
 	}
-	/*
-	 * Some ioctls can return an error, but still have valid
-	 * results that must be returned.
-	 */
-	if (err < 0 && !always_copy)
-		goto out;
 
-out_array_args:
 	if (video_put_user((void __user *)arg, parg, cmd, orig_cmd))
 		err = -EFAULT;
 out:
-- 
cgit v1.2.3


From 37238699073e7e93f05517e529661151173cd458 Mon Sep 17 00:00:00 2001
From: Hangyu Hua <hbh25y@gmail.com>
Date: Thu, 19 May 2022 03:17:43 +0100
Subject: media: dvb_vb2: fix possible out of bound access

vb2_core_qbuf and vb2_core_querybuf don't check the range of b->index
controlled by the user.

Fix this by adding range checking code before using them.

Fixes: 57868acc369a ("media: videobuf2: Add new uAPI for DVB streaming I/O")
Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
Reviewed-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/dvb-core/dvb_vb2.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c
index a1bd6d9c9223..909df82fed33 100644
--- a/drivers/media/dvb-core/dvb_vb2.c
+++ b/drivers/media/dvb-core/dvb_vb2.c
@@ -354,6 +354,12 @@ int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req)
 
 int dvb_vb2_querybuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
 {
+	struct vb2_queue *q = &ctx->vb_q;
+
+	if (b->index >= q->num_buffers) {
+		dprintk(1, "[%s] buffer index out of range\n", ctx->name);
+		return -EINVAL;
+	}
 	vb2_core_querybuf(&ctx->vb_q, b->index, b);
 	dprintk(3, "[%s] index=%d\n", ctx->name, b->index);
 	return 0;
@@ -378,8 +384,13 @@ int dvb_vb2_expbuf(struct dvb_vb2_ctx *ctx, struct dmx_exportbuffer *exp)
 
 int dvb_vb2_qbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b)
 {
+	struct vb2_queue *q = &ctx->vb_q;
 	int ret;
 
+	if (b->index >= q->num_buffers) {
+		dprintk(1, "[%s] buffer index out of range\n", ctx->name);
+		return -EINVAL;
+	}
 	ret = vb2_core_qbuf(&ctx->vb_q, b->index, b, NULL);
 	if (ret) {
 		dprintk(1, "[%s] index=%d errno=%d\n", ctx->name,
-- 
cgit v1.2.3


From a2d2e593d39bc2f29a1cd5e3779af457fd26490c Mon Sep 17 00:00:00 2001
From: "Nícolas F. R. A. Prado" <nfraprado@collabora.com>
Date: Fri, 17 Jun 2022 21:39:06 +0100
Subject: media: mediatek: vcodec: Drop platform_get_resource(IORESOURCE_IRQ)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit a1a2b7125e10 ("of/platform: Drop static setup of IRQ resource
from DT core") removed support for calling platform_get_resource(...,
IORESOURCE_IRQ, ...) on DT-based drivers, but the probe() function of
mtk-vcodec's encoder was still making use of it. This caused the encoder
driver to fail probe.

Since the platform_get_resource() call was only being used to check for
the presence of the interrupt (its returned resource wasn't even used)
and platform_get_irq() was already being used to get the IRQ, simply
drop the use of platform_get_resource(IORESOURCE_IRQ) and handle the
failure of platform_get_irq(), to get the driver probing again.

[hverkuil: drop unused struct resource *res]

Fixes: a1a2b7125e10 ("of/platform: Drop static setup of IRQ resource from DT core")
Signed-off-by: Nícolas F. R. A. Prado <nfraprado@collabora.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c
index 95e8c29ccc65..d2f5f30582a9 100644
--- a/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c
+++ b/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c
@@ -228,7 +228,6 @@ static int mtk_vcodec_probe(struct platform_device *pdev)
 {
 	struct mtk_vcodec_dev *dev;
 	struct video_device *vfd_enc;
-	struct resource *res;
 	phandle rproc_phandle;
 	enum mtk_vcodec_fw_type fw_type;
 	int ret;
@@ -272,14 +271,12 @@ static int mtk_vcodec_probe(struct platform_device *pdev)
 		goto err_res;
 	}
 
-	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
-	if (res == NULL) {
-		dev_err(&pdev->dev, "failed to get irq resource");
-		ret = -ENOENT;
+	dev->enc_irq = platform_get_irq(pdev, 0);
+	if (dev->enc_irq < 0) {
+		ret = dev->enc_irq;
 		goto err_res;
 	}
 
-	dev->enc_irq = platform_get_irq(pdev, 0);
 	irq_set_status_flags(dev->enc_irq, IRQ_NOAUTOEN);
 	ret = devm_request_irq(&pdev->dev, dev->enc_irq,
 			       mtk_vcodec_enc_irq_handler,
-- 
cgit v1.2.3


From 3a99c4474112f49a5459933d8758614002ca0ddc Mon Sep 17 00:00:00 2001
From: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Date: Fri, 10 Jun 2022 13:52:11 +0100
Subject: media: rkvdec: Disable H.264 error detection

Quite often, the HW get stuck in error condition if a stream error
was detected. As documented, the HW should stop immediately and self
reset. There is likely a problem or a miss-understanding of the self
reset mechanism, as unless we make a long pause, the next command
will then report an error even if there is no error in it.

Disabling error detection fixes the issue, and let the decoder continue
after an error. This patch is safe for backport into older kernels.

Fixes: cd33c830448b ("media: rkvdec: Add the rkvdec driver")
Signed-off-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Reviewed-by: Brian Norris <briannorris@chromium.org>
Tested-by: Brian Norris <briannorris@chromium.org>
Reviewed-by: Ezequiel Garcia <ezequiel@vanguardiasur.com.ar>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/staging/media/rkvdec/rkvdec-h264.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/media/rkvdec/rkvdec-h264.c b/drivers/staging/media/rkvdec/rkvdec-h264.c
index 4af5a831bde0..4fc167b42cf0 100644
--- a/drivers/staging/media/rkvdec/rkvdec-h264.c
+++ b/drivers/staging/media/rkvdec/rkvdec-h264.c
@@ -1162,8 +1162,8 @@ static int rkvdec_h264_run(struct rkvdec_ctx *ctx)
 
 	schedule_delayed_work(&rkvdec->watchdog_work, msecs_to_jiffies(2000));
 
-	writel(0xffffffff, rkvdec->regs + RKVDEC_REG_STRMD_ERR_EN);
-	writel(0xffffffff, rkvdec->regs + RKVDEC_REG_H264_ERR_E);
+	writel(0, rkvdec->regs + RKVDEC_REG_STRMD_ERR_EN);
+	writel(0, rkvdec->regs + RKVDEC_REG_H264_ERR_E);
 	writel(1, rkvdec->regs + RKVDEC_REG_PREF_LUMA_CACHE_COMMAND);
 	writel(1, rkvdec->regs + RKVDEC_REG_PREF_CHR_CACHE_COMMAND);
 
-- 
cgit v1.2.3


From 415ba26cb73f7d22a892043301b91b57ae54db02 Mon Sep 17 00:00:00 2001
From: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Date: Thu, 22 Sep 2022 17:59:24 +0300
Subject: usb: typec: ucsi: Remove incorrect warning

Sink only devices do not have any source capabilities, so
the driver should not warn about that. Also DRP (Dual Role
Power) capable devices, such as USB Type-C docking stations,
do not return any source capabilities unless they are
plugged to a power supply themselves.

Fixes: 1f4642b72be7 ("usb: typec: ucsi: Retrieve all the PDOs instead of just the first 4")
Reported-by: Paul Menzel <pmenzel@molgen.mpg.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Link: https://lore.kernel.org/r/20220922145924.80667-1-heikki.krogerus@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/typec/ucsi/ucsi.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c
index 7f2624f42724..6364f0d467ea 100644
--- a/drivers/usb/typec/ucsi/ucsi.c
+++ b/drivers/usb/typec/ucsi/ucsi.c
@@ -588,8 +588,6 @@ static int ucsi_get_pdos(struct ucsi_connector *con, int is_partner,
 				num_pdos * sizeof(u32));
 	if (ret < 0 && ret != -ETIMEDOUT)
 		dev_err(ucsi->dev, "UCSI_GET_PDOS failed (%d)\n", ret);
-	if (ret == 0 && offset == 0)
-		dev_warn(ucsi->dev, "UCSI_GET_PDOS returned 0 bytes\n");
 
 	return ret;
 }
-- 
cgit v1.2.3


From f5c5936d6b4dfd3f9a145be5bc6ff27eb706bc39 Mon Sep 17 00:00:00 2001
From: Patrice Chotard <patrice.chotard@foss.st.com>
Date: Mon, 26 Sep 2022 14:43:59 +0200
Subject: usb: dwc3: st: Fix node's child name

Update node's child name from "dwc3" to "usb", this fixes
the following issue:

[3.773852] usb-st-dwc3 8f94000.dwc3: failed to find dwc3 core node

Fixes: 3120910a099b ("ARM: dts: stih407-family: Harmonize DWC USB3 DT nodes name")
Reported-by: Jerome Audu <jerome.audu@st.com>
Signed-off-by: Patrice Chotard <patrice.chotard@foss.st.com>
Link: https://lore.kernel.org/r/20220926124359.304770-1-patrice.chotard@foss.st.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/dwc3/dwc3-st.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/dwc3/dwc3-st.c b/drivers/usb/dwc3/dwc3-st.c
index 166b5bde45cb..6c14a79279f9 100644
--- a/drivers/usb/dwc3/dwc3-st.c
+++ b/drivers/usb/dwc3/dwc3-st.c
@@ -251,7 +251,7 @@ static int st_dwc3_probe(struct platform_device *pdev)
 	/* Manage SoftReset */
 	reset_control_deassert(dwc3_data->rstc_rst);
 
-	child = of_get_child_by_name(node, "dwc3");
+	child = of_get_child_by_name(node, "usb");
 	if (!child) {
 		dev_err(&pdev->dev, "failed to find dwc3 core node\n");
 		ret = -ENODEV;
-- 
cgit v1.2.3


From a625a4b8806cc1e928b7dd2cca1fee709c9de56e Mon Sep 17 00:00:00 2001
From: Hongling Zeng <zenghongling@kylinos.cn>
Date: Fri, 23 Sep 2022 10:46:13 +0800
Subject: uas: add no-uas quirk for Hiksemi usb_disk

The UAS mode of Hiksemi is reported to fail to work on several platforms
with the following error message, then after re-connecting the device will
be offlined and not working at all.

[  592.518442][ 2] sd 8:0:0:0: [sda] tag#17 uas_eh_abort_handler 0 uas-tag 18
                   inflight: CMD
[  592.527575][ 2] sd 8:0:0:0: [sda] tag#17 CDB: Write(10) 2a 00 03 6f 88 00 00
                   04 00 00
[  592.536330][ 2] sd 8:0:0:0: [sda] tag#0 uas_eh_abort_handler 0 uas-tag 1
                   inflight: CMD
[  592.545266][ 2] sd 8:0:0:0: [sda] tag#0 CDB: Write(10) 2a 00 07 44 1a 88 00
                   00 08 00

These disks have a broken uas implementation, the tag field of the status
iu-s is not set properly,so we need to fall-back to usb-storage.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Cc: stable <stable@kernel.org>
Signed-off-by: Hongling Zeng <zenghongling@kylinos.cn>
Link: https://lore.kernel.org/r/1663901173-21020-1-git-send-email-zenghongling@kylinos.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/storage/unusual_uas.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h
index 23ab3b048d9b..c967b3b62dce 100644
--- a/drivers/usb/storage/unusual_uas.h
+++ b/drivers/usb/storage/unusual_uas.h
@@ -52,6 +52,13 @@ UNUSUAL_DEV(0x059f, 0x1061, 0x0000, 0x9999,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_NO_REPORT_OPCODES | US_FL_NO_SAME),
 
+/* Reported-by: Hongling Zeng <zenghongling@kylinos.cn> */
+UNUSUAL_DEV(0x090c, 0x2000, 0x0000, 0x9999,
+		"Hiksemi",
+		"External HDD",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_IGNORE_UAS),
+
 /*
  * Apricorn USB3 dongle sometimes returns "USBSUSBSUSBS" in response to SCSI
  * commands in UAS mode.  Observed with the 1.28 firmware; are there others?
-- 
cgit v1.2.3


From e00b488e813f0f1ad9f778e771b7cd2fe2877023 Mon Sep 17 00:00:00 2001
From: Hongling Zeng <zenghongling@kylinos.cn>
Date: Fri, 23 Sep 2022 10:46:25 +0800
Subject: usb-storage: Add Hiksemi USB3-FW to IGNORE_UAS

The UAS mode of Hiksemi USB_HDD is reported to fail to work on several
platforms with the following error message, then after re-connecting the
device will be offlined and not working at all.

[  592.518442][ 2] sd 8:0:0:0: [sda] tag#17 uas_eh_abort_handler 0 uas-tag 18
                   inflight: CMD
[  592.527575][ 2] sd 8:0:0:0: [sda] tag#17 CDB: Write(10) 2a 00 03 6f 88 00 00
                   04 00 00
[  592.536330][ 2] sd 8:0:0:0: [sda] tag#0 uas_eh_abort_handler 0 uas-tag 1
                   inflight: CMD
[  592.545266][ 2] sd 8:0:0:0: [sda] tag#0 CDB: Write(10) 2a 00 07 44 1a 88 00
                   00 08 00

These disks have a broken uas implementation, the tag field of the status
iu-s is not set properly,so we need to fall-back to usb-storage.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Cc: stable <stable@kernel.org>
Signed-off-by: Hongling Zeng <zenghongling@kylinos.cn>
Link: https://lore.kernel.org/r/1663901185-21067-1-git-send-email-zenghongling@kylinos.cn
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/storage/unusual_uas.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h
index c967b3b62dce..e5960b29f902 100644
--- a/drivers/usb/storage/unusual_uas.h
+++ b/drivers/usb/storage/unusual_uas.h
@@ -83,6 +83,13 @@ UNUSUAL_DEV(0x0bc2, 0x331a, 0x0000, 0x9999,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_NO_REPORT_LUNS),
 
+/* Reported-by: Hongling Zeng <zenghongling@kylinos.cn> */
+UNUSUAL_DEV(0x0bda, 0x9210, 0x0000, 0x9999,
+		"Hiksemi",
+		"External HDD",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_IGNORE_UAS),
+
 /* Reported-by: Benjamin Tissoires <benjamin.tissoires@redhat.com> */
 UNUSUAL_DEV(0x13fd, 0x3940, 0x0000, 0x9999,
 		"Initio Corporation",
-- 
cgit v1.2.3


From 0fb9703a3eade0bb84c635705d9c795345e55053 Mon Sep 17 00:00:00 2001
From: Hongling Zeng <zenghongling@kylinos.cn>
Date: Fri, 23 Sep 2022 10:46:35 +0800
Subject: uas: ignore UAS for Thinkplus chips

The UAS mode of Thinkplus(0x17ef, 0x3899) is reported to influence
performance and trigger kernel panic on several platforms with the
following error message:

[   39.702439] xhci_hcd 0000:0c:00.3: ERROR Transfer event for disabled
               endpoint or incorrect stream ring
[   39.702442] xhci_hcd 0000:0c:00.3: @000000026c61f810 00000000 00000000
               1b000000 05038000

[  720.545894][13] Workqueue: usb_hub_wq hub_event
[  720.550971][13]  ffff88026c143c38 0000000000016300 ffff8802755bb900 ffff880
                    26cb80000
[  720.559673][13]  ffff88026c144000 ffff88026ca88100 0000000000000000 ffff880
                    26cb80000
[  720.568374][13]  ffff88026cb80000 ffff88026c143c50 ffffffff8186ae25 ffff880
                    26ca880f8
[  720.577076][13] Call Trace:
[  720.580201][13]  [<ffffffff8186ae25>] schedule+0x35/0x80
[  720.586137][13]  [<ffffffff8186b0ce>] schedule_preempt_disabled+0xe/0x10
[  720.593623][13]  [<ffffffff8186cb94>] __mutex_lock_slowpath+0x164/0x1e0
[  720.601012][13]  [<ffffffff8186cc3f>] mutex_lock+0x2f/0x40
[  720.607141][13]  [<ffffffff8162b8e9>] usb_disconnect+0x59/0x290

Falling back to USB mass storage can solve this problem, so ignore UAS
function of this chip.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Cc: stable <stable@kernel.org>
Signed-off-by: Hongling Zeng <zenghongling@kylinos.cn>
Link: https://lore.kernel.org/r/1663902249837086.19.seg@mailgw
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/storage/unusual_uas.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h
index e5960b29f902..251778d14e2d 100644
--- a/drivers/usb/storage/unusual_uas.h
+++ b/drivers/usb/storage/unusual_uas.h
@@ -132,6 +132,13 @@ UNUSUAL_DEV(0x154b, 0xf00d, 0x0000, 0x9999,
 		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
 		US_FL_NO_ATA_1X),
 
+/* Reported-by: Hongling Zeng <zenghongling@kylinos.cn> */
+UNUSUAL_DEV(0x17ef, 0x3899, 0x0000, 0x9999,
+		"Thinkplus",
+		"External HDD",
+		USB_SC_DEVICE, USB_PR_DEVICE, NULL,
+		US_FL_IGNORE_UAS),
+
 /* Reported-by: Hans de Goede <hdegoede@redhat.com> */
 UNUSUAL_DEV(0x2109, 0x0711, 0x0000, 0x9999,
 		"VIA",
-- 
cgit v1.2.3


From 53a7f9961cddf505a9f3a7baa017e5af31838840 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Tue, 27 Sep 2022 14:36:06 +0800
Subject: erofs: clean up unnecessary code and comments

Some conditional macros and comments are useless.

Link: https://lore.kernel.org/r/20220927063607.54832-1-hsiangkao@linux.alibaba.com
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/internal.h |  2 --
 fs/erofs/namei.c    | 11 +----------
 fs/erofs/xattr.h    |  2 --
 fs/erofs/zmap.c     |  3 +--
 4 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index a6333c283e3d..0318530bc78a 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -196,7 +196,6 @@ enum {
 	EROFS_ZIP_CACHE_READAROUND
 };
 
-#ifdef CONFIG_EROFS_FS_ZIP
 #define EROFS_LOCKED_MAGIC     (INT_MIN | 0xE0F510CCL)
 
 /* basic unit of the workstation of a super_block */
@@ -236,7 +235,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
 	return atomic_cond_read_relaxed(&grp->refcount,
 					VAL != EROFS_LOCKED_MAGIC);
 }
-#endif	/* !CONFIG_EROFS_FS_ZIP */
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
 #define LOG_BLOCK_SIZE		PAGE_SHIFT
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index fd75506799c4..afbb80d4e2f1 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -185,7 +185,6 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
 	if (IS_ERR(de))
 		return PTR_ERR(de);
 
-	/* the target page has been mapped */
 	if (ndirents)
 		de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
 
@@ -197,9 +196,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
 	return PTR_ERR_OR_ZERO(de);
 }
 
-/* NOTE: i_mutex is already held by vfs */
-static struct dentry *erofs_lookup(struct inode *dir,
-				   struct dentry *dentry,
+static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
 				   unsigned int flags)
 {
 	int err;
@@ -207,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir,
 	unsigned int d_type;
 	struct inode *inode;
 
-	DBG_BUGON(!d_really_is_negative(dentry));
-	/* dentry must be unhashed in lookup, no need to worry about */
-	DBG_BUGON(!d_unhashed(dentry));
-
 	trace_erofs_lookup(dir, dentry, flags);
 
-	/* file name exceeds fs limit */
 	if (dentry->d_name.len > EROFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	/* false uninitialized warnings on gcc 4.8.x */
 	err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
 
 	if (err == -ENOENT) {
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 332462c59f11..0a43c9ee9f8f 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -39,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
 #ifdef CONFIG_EROFS_FS_XATTR
 extern const struct xattr_handler erofs_xattr_user_handler;
 extern const struct xattr_handler erofs_xattr_trusted_handler;
-#ifdef CONFIG_EROFS_FS_SECURITY
 extern const struct xattr_handler erofs_xattr_security_handler;
-#endif
 
 static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
 {
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 4cecd32b87c6..44c27ef39c43 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -731,8 +731,7 @@ out:
 	return err;
 }
 
-int z_erofs_map_blocks_iter(struct inode *inode,
-			    struct erofs_map_blocks *map,
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
 			    int flags)
 {
 	struct erofs_inode *const vi = EROFS_I(inode);
-- 
cgit v1.2.3


From 312fe643ad1153fe0337c46f4573030d0c2bac73 Mon Sep 17 00:00:00 2001
From: Gao Xiang <hsiangkao@linux.alibaba.com>
Date: Tue, 27 Sep 2022 14:36:07 +0800
Subject: erofs: clean up erofs_iget()

isdir indicated REQ_META|REQ_PRIO which no longer works now.
Get rid of isdir entirely.

Link: https://lore.kernel.org/r/20220927063607.54832-2-hsiangkao@linux.alibaba.com
Reviewed-by: Yue Hu <huyue2@coolpad.com>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
 fs/erofs/inode.c             | 24 ++++++++----------------
 fs/erofs/internal.h          |  2 +-
 fs/erofs/namei.c             |  2 +-
 fs/erofs/super.c             |  8 ++++----
 include/trace/events/erofs.h | 11 ++++-------
 5 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 16cf9a283557..ad2a82f2eb4c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -241,7 +241,7 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
 	return 0;
 }
 
-static int erofs_fill_inode(struct inode *inode, int isdir)
+static int erofs_fill_inode(struct inode *inode)
 {
 	struct erofs_inode *vi = EROFS_I(inode);
 	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
@@ -249,7 +249,7 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
 	unsigned int ofs;
 	int err = 0;
 
-	trace_erofs_fill_inode(inode, isdir);
+	trace_erofs_fill_inode(inode);
 
 	/* read inode base data from disk */
 	kaddr = erofs_read_inode(&buf, inode, &ofs);
@@ -324,21 +324,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque)
 	return 0;
 }
 
-static inline struct inode *erofs_iget_locked(struct super_block *sb,
-					      erofs_nid_t nid)
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
 {
 	const unsigned long hashval = erofs_inode_hash(nid);
+	struct inode *inode;
 
-	return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
+	inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor,
 		erofs_iget_set_actor, &nid);
-}
-
-struct inode *erofs_iget(struct super_block *sb,
-			 erofs_nid_t nid,
-			 bool isdir)
-{
-	struct inode *inode = erofs_iget_locked(sb, nid);
-
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 
@@ -348,10 +340,10 @@ struct inode *erofs_iget(struct super_block *sb,
 
 		vi->nid = nid;
 
-		err = erofs_fill_inode(inode, isdir);
-		if (!err)
+		err = erofs_fill_inode(inode);
+		if (!err) {
 			unlock_new_inode(inode);
-		else {
+		} else {
 			iget_failed(inode);
 			inode = ERR_PTR(err);
 		}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 0318530bc78a..1701df48c446 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -492,7 +492,7 @@ extern const struct inode_operations erofs_generic_iops;
 extern const struct inode_operations erofs_symlink_iops;
 extern const struct inode_operations erofs_fast_symlink_iops;
 
-struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
 int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
 		  struct kstat *stat, u32 request_mask,
 		  unsigned int query_flags);
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index afbb80d4e2f1..0dc34721080c 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -219,7 +219,7 @@ static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
 	} else {
 		erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
 			  dentry, nid, d_type);
-		inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
+		inode = erofs_iget(dir->i_sb, nid);
 	}
 	return d_splice_alias(inode, dentry);
 }
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 8040534ae5c0..2cf96ce1c32e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -385,7 +385,7 @@ static int erofs_read_superblock(struct super_block *sb)
 	sbi->packed_inode = NULL;
 	if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) {
 		sbi->packed_inode =
-			erofs_iget(sb, le64_to_cpu(dsb->packed_nid), false);
+			erofs_iget(sb, le64_to_cpu(dsb->packed_nid));
 		if (IS_ERR(sbi->packed_inode)) {
 			ret = PTR_ERR(sbi->packed_inode);
 			goto out;
@@ -668,7 +668,7 @@ static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
 static struct inode *erofs_nfs_get_inode(struct super_block *sb,
 					 u64 ino, u32 generation)
 {
-	return erofs_iget(sb, ino, false);
+	return erofs_iget(sb, ino);
 }
 
 static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
@@ -694,7 +694,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
 	err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
 	if (err)
 		return ERR_PTR(err);
-	return d_obtain_alias(erofs_iget(child->d_sb, nid, d_type == FT_DIR));
+	return d_obtain_alias(erofs_iget(child->d_sb, nid));
 }
 
 static const struct export_operations erofs_export_ops = {
@@ -782,7 +782,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 #endif
 
 	/* get the root inode */
-	inode = erofs_iget(sb, ROOT_NID(sbi), true);
+	inode = erofs_iget(sb, ROOT_NID(sbi));
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index 57de057bd503..4f4c44ea3a65 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -53,15 +53,14 @@ TRACE_EVENT(erofs_lookup,
 );
 
 TRACE_EVENT(erofs_fill_inode,
-	TP_PROTO(struct inode *inode, int isdir),
-	TP_ARGS(inode, isdir),
+	TP_PROTO(struct inode *inode),
+	TP_ARGS(inode),
 
 	TP_STRUCT__entry(
 		__field(dev_t,		dev	)
 		__field(erofs_nid_t,	nid	)
 		__field(erofs_blk_t,	blkaddr )
 		__field(unsigned int,	ofs	)
-		__field(int,		isdir	)
 	),
 
 	TP_fast_assign(
@@ -69,13 +68,11 @@ TRACE_EVENT(erofs_fill_inode,
 		__entry->nid		= EROFS_I(inode)->nid;
 		__entry->blkaddr	= erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid));
 		__entry->ofs		= erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid));
-		__entry->isdir		= isdir;
 	),
 
-	TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u, isdir %d",
+	TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u",
 		  show_dev_nid(__entry),
-		  __entry->blkaddr, __entry->ofs,
-		  __entry->isdir)
+		  __entry->blkaddr, __entry->ofs)
 );
 
 TRACE_EVENT(erofs_readpage,
-- 
cgit v1.2.3


From 0f816e024f2fb5a52ebd42daf07233a5b45c25dd Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Mon, 26 Sep 2022 10:14:57 -0700
Subject: KVM: selftests: Skip tests that require EPT when it is not available

Skip selftests that require EPT support in the VM when it is not
available. For example, if running on a machine where kvm_intel.ept=N
since KVM does not offer EPT support to guests if EPT is not supported
on the host.

This commit causes vmx_dirty_log_test to be skipped instead of failing
on hosts where kvm_intel.ept=N.

Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220926171457.532542-1-dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/include/x86_64/vmx.h |  1 +
 tools/testing/selftests/kvm/lib/x86_64/vmx.c     | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
index 99fa1410964c..790c6d1ecb34 100644
--- a/tools/testing/selftests/kvm/include/x86_64/vmx.h
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -617,6 +617,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
 			uint32_t memslot);
 void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
 			    uint64_t addr, uint64_t size);
+bool kvm_vm_has_ept(struct kvm_vm *vm);
 void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
 		  uint32_t eptp_memslot);
 void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm);
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index 80a568c439b8..d21049c38fc5 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -5,6 +5,8 @@
  * Copyright (C) 2018, Google LLC.
  */
 
+#include <asm/msr-index.h>
+
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
@@ -542,9 +544,27 @@ void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm,
 	__nested_map(vmx, vm, addr, addr, size, PG_LEVEL_1G);
 }
 
+bool kvm_vm_has_ept(struct kvm_vm *vm)
+{
+	struct kvm_vcpu *vcpu;
+	uint64_t ctrl;
+
+	vcpu = list_first_entry(&vm->vcpus, struct kvm_vcpu, list);
+	TEST_ASSERT(vcpu, "Cannot determine EPT support without vCPUs.\n");
+
+	ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS) >> 32;
+	if (!(ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+		return false;
+
+	ctrl = vcpu_get_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
+	return ctrl & SECONDARY_EXEC_ENABLE_EPT;
+}
+
 void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
 		  uint32_t eptp_memslot)
 {
+	TEST_REQUIRE(kvm_vm_has_ept(vm));
+
 	vmx->eptp = (void *)vm_vaddr_alloc_page(vm);
 	vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
 	vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp);
-- 
cgit v1.2.3


From 2702c789996d9001cb60ef1dac055aca84d3c51a Mon Sep 17 00:00:00 2001
From: Yauheni Kaliuta <ykaliuta@redhat.com>
Date: Mon, 26 Sep 2022 12:23:20 +0300
Subject: selftests/bpf: Fix passing arguments via function in test_kmod.sh

Since the tests are run in a function $@ there actually contains the
function arguments, not the script ones.

Pass "$@" to the function as well.

Fixes: 272d1f4cfa3c ("selftests: bpf: test_kmod.sh: Pass parameters to the module")
Signed-off-by: Yauheni Kaliuta <ykaliuta@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220926092320.564631-1-ykaliuta@redhat.com
---
 tools/testing/selftests/bpf/test_kmod.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh
index d4a4279c0181..50dca53ac536 100755
--- a/tools/testing/selftests/bpf/test_kmod.sh
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -29,6 +29,7 @@ test_run()
 	sysctl -w net.core.bpf_jit_harden=$2 2>&1 > /dev/null
 
 	echo "[ JIT enabled:$1 hardened:$2 ]"
+	shift 2
 	dmesg -C
 	if [ -f ${OUTPUT}/lib/test_bpf.ko ]; then
 		insmod ${OUTPUT}/lib/test_bpf.ko "$@" 2> /dev/null
@@ -64,9 +65,9 @@ test_restore()
 
 rc=0
 test_save
-test_run 0 0
-test_run 1 0
-test_run 1 1
-test_run 1 2
+test_run 0 0 "$@"
+test_run 1 0 "$@"
+test_run 1 1 "$@"
+test_run 1 2 "$@"
 test_restore
 exit $rc
-- 
cgit v1.2.3


From 987f20a9dcce3989e48d87cff3952c095c994445 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Mon, 26 Sep 2022 17:15:32 -0500
Subject: a.out: Remove the a.out implementation

In commit 19e8b701e258 ("a.out: Stop building a.out/osf1 support on
alpha and m68k") the last users of a.out were disabled.

As nothing has turned up to cause this change to be reverted, let's
remove the code implementing a.out support as well.

There may be userspace users of the uapi bits left so the uapi
headers have been left untouched.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Arnd Bergmann <arnd@arndb.de> # arm defconfigs
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/871qrx3hq3.fsf@email.froward.int.ebiederm.org
---
 MAINTAINERS                           |   1 -
 arch/alpha/include/asm/a.out.h        |  16 --
 arch/alpha/kernel/Makefile            |   4 -
 arch/alpha/kernel/binfmt_loader.c     |  46 -----
 arch/alpha/kernel/osf_sys.c           |  30 ---
 arch/arm/configs/badge4_defconfig     |   1 -
 arch/arm/configs/corgi_defconfig      |   1 -
 arch/arm/configs/ezx_defconfig        |   1 -
 arch/arm/configs/footbridge_defconfig |   1 -
 arch/arm/configs/hackkit_defconfig    |   1 -
 arch/arm/configs/iop32x_defconfig     |   1 -
 arch/arm/configs/jornada720_defconfig |   1 -
 arch/arm/configs/lart_defconfig       |   1 -
 arch/arm/configs/neponset_defconfig   |   1 -
 arch/arm/configs/netwinder_defconfig  |   1 -
 arch/arm/configs/rpc_defconfig        |   1 -
 arch/arm/configs/spitz_defconfig      |   1 -
 fs/Kconfig.binfmt                     |  33 ----
 fs/Makefile                           |   1 -
 fs/binfmt_aout.c                      | 342 ----------------------------------
 fs/exec.c                             |   3 +-
 include/linux/a.out.h                 |  18 --
 22 files changed, 1 insertion(+), 505 deletions(-)
 delete mode 100644 arch/alpha/include/asm/a.out.h
 delete mode 100644 arch/alpha/kernel/binfmt_loader.c
 delete mode 100644 fs/binfmt_aout.c
 delete mode 100644 include/linux/a.out.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d7f64dc0efe..d01ef6cbf3af 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7685,7 +7685,6 @@ R:	Kees Cook <keescook@chromium.org>
 L:	linux-mm@kvack.org
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve
-F:	arch/alpha/kernel/binfmt_loader.c
 F:	fs/*binfmt_*.c
 F:	fs/exec.c
 F:	include/linux/binfmts.h
diff --git a/arch/alpha/include/asm/a.out.h b/arch/alpha/include/asm/a.out.h
deleted file mode 100644
index d2346b7caff1..000000000000
--- a/arch/alpha/include/asm/a.out.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ALPHA_A_OUT_H__
-#define __ALPHA_A_OUT_H__
-
-#include <uapi/asm/a.out.h>
-
-
-/* Assume that start addresses below 4G belong to a TASO application.
-   Unfortunately, there is no proper bit in the exec header to check.
-   Worse, we have to notice the start address before swapping to use
-   /sbin/loader, which of course is _not_ a TASO application.  */
-#define SET_AOUT_PERSONALITY(BFPM, EX) \
-	set_personality (((BFPM->taso || EX.ah.entry < 0x100000000L \
-			   ? ADDR_LIMIT_32BIT : 0) | PER_OSF4))
-
-#endif /* __A_OUT_GNU_H__ */
diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile
index 5a74581bf0ee..6a274c0d53a2 100644
--- a/arch/alpha/kernel/Makefile
+++ b/arch/alpha/kernel/Makefile
@@ -47,10 +47,6 @@ else
 # Misc support
 obj-$(CONFIG_ALPHA_SRM)		+= srmcons.o
 
-ifdef CONFIG_BINFMT_AOUT
-obj-y	+= binfmt_loader.o
-endif
-
 # Core logic support
 obj-$(CONFIG_ALPHA_APECS)	+= core_apecs.o
 obj-$(CONFIG_ALPHA_CIA)		+= core_cia.o
diff --git a/arch/alpha/kernel/binfmt_loader.c b/arch/alpha/kernel/binfmt_loader.c
deleted file mode 100644
index e4be7a543ecf..000000000000
--- a/arch/alpha/kernel/binfmt_loader.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/mm_types.h>
-#include <linux/binfmts.h>
-#include <linux/a.out.h>
-
-static int load_binary(struct linux_binprm *bprm)
-{
-	struct exec *eh = (struct exec *)bprm->buf;
-	unsigned long loader;
-	struct file *file;
-	int retval;
-
-	if (eh->fh.f_magic != 0x183 || (eh->fh.f_flags & 0x3000) != 0x3000)
-		return -ENOEXEC;
-
-	if (bprm->loader)
-		return -ENOEXEC;
-
-	loader = bprm->vma->vm_end - sizeof(void *);
-
-	file = open_exec("/sbin/loader");
-	retval = PTR_ERR(file);
-	if (IS_ERR(file))
-		return retval;
-
-	/* Remember if the application is TASO.  */
-	bprm->taso = eh->ah.entry < 0x100000000UL;
-
-	bprm->interpreter = file;
-	bprm->loader = loader;
-	return 0;
-}
-
-static struct linux_binfmt loader_format = {
-	.load_binary	= load_binary,
-};
-
-static int __init init_loader_binfmt(void)
-{
-	insert_binfmt(&loader_format);
-	return 0;
-}
-arch_initcall(init_loader_binfmt);
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index d257293401e2..b3ad8c44c971 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1278,45 +1278,15 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	return addr;
 }
 
-#ifdef CONFIG_OSF4_COMPAT
-/* Clear top 32 bits of iov_len in the user's buffer for
-   compatibility with old versions of OSF/1 where iov_len
-   was defined as int. */
-static int
-osf_fix_iov_len(const struct iovec __user *iov, unsigned long count)
-{
-	unsigned long i;
-
-	for (i = 0 ; i < count ; i++) {
-		int __user *iov_len_high = (int __user *)&iov[i].iov_len + 1;
-
-		if (put_user(0, iov_len_high))
-			return -EFAULT;
-	}
-	return 0;
-}
-#endif
-
 SYSCALL_DEFINE3(osf_readv, unsigned long, fd,
 		const struct iovec __user *, vector, unsigned long, count)
 {
-#ifdef CONFIG_OSF4_COMPAT
-	if (unlikely(personality(current->personality) == PER_OSF4))
-		if (osf_fix_iov_len(vector, count))
-			return -EFAULT;
-#endif
-
 	return sys_readv(fd, vector, count);
 }
 
 SYSCALL_DEFINE3(osf_writev, unsigned long, fd,
 		const struct iovec __user *, vector, unsigned long, count)
 {
-#ifdef CONFIG_OSF4_COMPAT
-	if (unlikely(personality(current->personality) == PER_OSF4))
-		if (osf_fix_iov_len(vector, count))
-			return -EFAULT;
-#endif
 	return sys_writev(fd, vector, count);
 }
 
diff --git a/arch/arm/configs/badge4_defconfig b/arch/arm/configs/badge4_defconfig
index 506f3378da07..6908032fbce8 100644
--- a/arch/arm/configs/badge4_defconfig
+++ b/arch/arm/configs/badge4_defconfig
@@ -6,7 +6,6 @@ CONFIG_UNUSED_BOARD_FILES=y
 CONFIG_CMDLINE="init=/linuxrc root=/dev/mtdblock3"
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=m
 CONFIG_MODULES=y
 CONFIG_MODVERSIONS=y
 CONFIG_PARTITION_ADVANCED=y
diff --git a/arch/arm/configs/corgi_defconfig b/arch/arm/configs/corgi_defconfig
index 1f137f74050f..df84640f4f57 100644
--- a/arch/arm/configs/corgi_defconfig
+++ b/arch/arm/configs/corgi_defconfig
@@ -16,7 +16,6 @@ CONFIG_MACH_HUSKY=y
 CONFIG_UNUSED_BOARD_FILES=y
 CONFIG_CMDLINE="console=ttyS0,115200n8 console=tty1 noinitrd root=/dev/mtdblock2 rootfstype=jffs2   debug"
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=m
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/arch/arm/configs/ezx_defconfig b/arch/arm/configs/ezx_defconfig
index 1a41391d7367..cd9ccc4e4627 100644
--- a/arch/arm/configs/ezx_defconfig
+++ b/arch/arm/configs/ezx_defconfig
@@ -25,7 +25,6 @@ CONFIG_CPU_FREQ_GOV_ONDEMAND=m
 CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
 CONFIG_CPU_IDLE=y
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=m
 CONFIG_PM=y
 CONFIG_APM_EMULATION=y
 CONFIG_MODULES=y
diff --git a/arch/arm/configs/footbridge_defconfig b/arch/arm/configs/footbridge_defconfig
index 504070812ad0..b5b56f8dda5f 100644
--- a/arch/arm/configs/footbridge_defconfig
+++ b/arch/arm/configs/footbridge_defconfig
@@ -9,7 +9,6 @@ CONFIG_ARCH_EBSA285_HOST=y
 CONFIG_ARCH_NETWINDER=y
 CONFIG_FPE_NWFPE=y
 CONFIG_FPE_NWFPE_XP=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_MODULES=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_ACORN_PARTITION=y
diff --git a/arch/arm/configs/hackkit_defconfig b/arch/arm/configs/hackkit_defconfig
index b9327b2eacd3..398558c4ffa8 100644
--- a/arch/arm/configs/hackkit_defconfig
+++ b/arch/arm/configs/hackkit_defconfig
@@ -7,7 +7,6 @@ CONFIG_UNUSED_BOARD_FILES=y
 CONFIG_CMDLINE="console=ttySA0,115200 root=/dev/ram0 initrd=0xc0400000,8M init=/rootshell"
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_MODULES=y
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/arm/configs/iop32x_defconfig b/arch/arm/configs/iop32x_defconfig
index c16e92cdfd00..19e30e790d35 100644
--- a/arch/arm/configs/iop32x_defconfig
+++ b/arch/arm/configs/iop32x_defconfig
@@ -12,7 +12,6 @@ CONFIG_MACH_N2100=y
 CONFIG_UNUSED_BOARD_FILES=y
 CONFIG_CMDLINE="console=ttyS0,115200 root=/dev/nfs ip=bootp cachepolicy=writealloc"
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_PARTITION_ADVANCED=y
diff --git a/arch/arm/configs/jornada720_defconfig b/arch/arm/configs/jornada720_defconfig
index 3dcf89d3e1f1..1a11ee6b3e24 100644
--- a/arch/arm/configs/jornada720_defconfig
+++ b/arch/arm/configs/jornada720_defconfig
@@ -6,7 +6,6 @@ CONFIG_SA1100_JORNADA720=y
 CONFIG_SA1100_JORNADA720_SSP=y
 CONFIG_UNUSED_BOARD_FILES=y
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_PM=y
 CONFIG_MODULES=y
 CONFIG_NET=y
diff --git a/arch/arm/configs/lart_defconfig b/arch/arm/configs/lart_defconfig
index 0c2f19d756c0..00583d64d2ea 100644
--- a/arch/arm/configs/lart_defconfig
+++ b/arch/arm/configs/lart_defconfig
@@ -8,7 +8,6 @@ CONFIG_CMDLINE="console=ttySA0,9600 root=/dev/ram"
 CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_PM=y
 CONFIG_MODULES=y
 CONFIG_NET=y
diff --git a/arch/arm/configs/neponset_defconfig b/arch/arm/configs/neponset_defconfig
index 907403529e30..2d16ddb0e7ff 100644
--- a/arch/arm/configs/neponset_defconfig
+++ b/arch/arm/configs/neponset_defconfig
@@ -9,7 +9,6 @@ CONFIG_ZBOOT_ROM_BSS=0xc1000000
 CONFIG_ZBOOT_ROM=y
 CONFIG_CMDLINE="console=ttySA0,38400n8 cpufreq=221200 rw root=/dev/mtdblock2 mtdparts=sa1100:512K(boot),1M(kernel),2560K(initrd),4M(root) load_ramdisk=1 prompt_ramdisk=0 mem=32M noinitrd initrd=0xc0800000,3M"
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_PM=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
diff --git a/arch/arm/configs/netwinder_defconfig b/arch/arm/configs/netwinder_defconfig
index cf7bbcf9d98a..7a14ea1faa65 100644
--- a/arch/arm/configs/netwinder_defconfig
+++ b/arch/arm/configs/netwinder_defconfig
@@ -5,7 +5,6 @@ CONFIG_ARCH_NETWINDER=y
 CONFIG_DEPRECATED_PARAM_STRUCT=y
 CONFIG_CMDLINE="root=0x801"
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_NET=y
 CONFIG_PACKET=y
diff --git a/arch/arm/configs/rpc_defconfig b/arch/arm/configs/rpc_defconfig
index 16d74a1f027a..b667c9d4527c 100644
--- a/arch/arm/configs/rpc_defconfig
+++ b/arch/arm/configs/rpc_defconfig
@@ -7,7 +7,6 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_ARCH_RPC=y
 CONFIG_CPU_SA110=y
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=y
 CONFIG_PARTITION_ADVANCED=y
 CONFIG_BSD_DISKLABEL=y
 CONFIG_SLAB=y
diff --git a/arch/arm/configs/spitz_defconfig b/arch/arm/configs/spitz_defconfig
index 1284a1d92ca3..66d74653f3fb 100644
--- a/arch/arm/configs/spitz_defconfig
+++ b/arch/arm/configs/spitz_defconfig
@@ -13,7 +13,6 @@ CONFIG_MACH_AKITA=y
 CONFIG_MACH_BORZOI=y
 CONFIG_CMDLINE="console=ttyS0,115200n8 console=tty1 noinitrd root=/dev/mtdblock2 rootfstype=jffs2   debug"
 CONFIG_FPE_NWFPE=y
-CONFIG_BINFMT_AOUT=m
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 21e154516bf2..f14478643b91 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -142,39 +142,6 @@ config BINFMT_ZFLAT
 	help
 	  Support FLAT format compressed binaries
 
-config HAVE_AOUT
-       def_bool n
-
-config BINFMT_AOUT
-	tristate "Kernel support for a.out and ECOFF binaries"
-	depends on HAVE_AOUT
-	help
-	  A.out (Assembler.OUTput) is a set of formats for libraries and
-	  executables used in the earliest versions of UNIX.  Linux used
-	  the a.out formats QMAGIC and ZMAGIC until they were replaced
-	  with the ELF format.
-
-	  The conversion to ELF started in 1995.  This option is primarily
-	  provided for historical interest and for the benefit of those
-	  who need to run binaries from that era.
-
-	  Most people should answer N here.  If you think you may have
-	  occasional use for this format, enable module support above
-	  and answer M here to compile this support as a module called
-	  binfmt_aout.
-
-	  If any crucial components of your system (such as /sbin/init
-	  or /lib/ld.so) are still in a.out format, you will have to
-	  say Y here.
-
-config OSF4_COMPAT
-	bool "OSF/1 v4 readv/writev compatibility"
-	depends on ALPHA && BINFMT_AOUT
-	help
-	  Say Y if you are using OSF/1 binaries (like Netscape and Acrobat)
-	  with v4 shared libraries freely available from Compaq. If you're
-	  going to use shared libraries from Tru64 version 5.0 or later, say N.
-
 config BINFMT_MISC
 	tristate "Kernel support for MISC binaries"
 	help
diff --git a/fs/Makefile b/fs/Makefile
index 93b80529f8e8..4dea17840761 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -38,7 +38,6 @@ obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FS_VERITY)		+= verity/
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
-obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
 obj-$(CONFIG_BINFMT_MISC)	+= binfmt_misc.o
 obj-$(CONFIG_BINFMT_SCRIPT)	+= binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
deleted file mode 100644
index 0dcfc691e7e2..000000000000
--- a/fs/binfmt_aout.c
+++ /dev/null
@@ -1,342 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/fs/binfmt_aout.c
- *
- *  Copyright (C) 1991, 1992, 1996  Linus Torvalds
- */
-
-#include <linux/module.h>
-
-#include <linux/time.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/a.out.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/binfmts.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-#include <linux/coredump.h>
-#include <linux/slab.h>
-#include <linux/sched/task_stack.h>
-
-#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-
-static int load_aout_binary(struct linux_binprm *);
-static int load_aout_library(struct file*);
-
-static struct linux_binfmt aout_format = {
-	.module		= THIS_MODULE,
-	.load_binary	= load_aout_binary,
-	.load_shlib	= load_aout_library,
-};
-
-#define BAD_ADDR(x)	((unsigned long)(x) >= TASK_SIZE)
-
-static int set_brk(unsigned long start, unsigned long end)
-{
-	start = PAGE_ALIGN(start);
-	end = PAGE_ALIGN(end);
-	if (end > start)
-		return vm_brk(start, end - start);
-	return 0;
-}
-
-/*
- * create_aout_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static unsigned long __user *create_aout_tables(char __user *p, struct linux_binprm * bprm)
-{
-	char __user * __user *argv;
-	char __user * __user *envp;
-	unsigned long __user *sp;
-	int argc = bprm->argc;
-	int envc = bprm->envc;
-
-	sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __alpha__
-/* whee.. test-programs are so much fun. */
-	put_user(0, --sp);
-	put_user(0, --sp);
-	if (bprm->loader) {
-		put_user(0, --sp);
-		put_user(1003, --sp);
-		put_user(bprm->loader, --sp);
-		put_user(1002, --sp);
-	}
-	put_user(bprm->exec, --sp);
-	put_user(1001, --sp);
-#endif
-	sp -= envc+1;
-	envp = (char __user * __user *) sp;
-	sp -= argc+1;
-	argv = (char __user * __user *) sp;
-#ifndef __alpha__
-	put_user((unsigned long) envp,--sp);
-	put_user((unsigned long) argv,--sp);
-#endif
-	put_user(argc,--sp);
-	current->mm->arg_start = (unsigned long) p;
-	while (argc-->0) {
-		char c;
-		put_user(p,argv++);
-		do {
-			get_user(c,p++);
-		} while (c);
-	}
-	put_user(NULL,argv);
-	current->mm->arg_end = current->mm->env_start = (unsigned long) p;
-	while (envc-->0) {
-		char c;
-		put_user(p,envp++);
-		do {
-			get_user(c,p++);
-		} while (c);
-	}
-	put_user(NULL,envp);
-	current->mm->env_end = (unsigned long) p;
-	return sp;
-}
-
-/*
- * These are the functions used to load a.out style executables and shared
- * libraries.  There is no binary dependent code anywhere else.
- */
-
-static int load_aout_binary(struct linux_binprm * bprm)
-{
-	struct pt_regs *regs = current_pt_regs();
-	struct exec ex;
-	unsigned long error;
-	unsigned long fd_offset;
-	unsigned long rlim;
-	int retval;
-
-	ex = *((struct exec *) bprm->buf);		/* exec-header */
-	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC &&
-	     N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) ||
-	    N_TRSIZE(ex) || N_DRSIZE(ex) ||
-	    i_size_read(file_inode(bprm->file)) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
-		return -ENOEXEC;
-	}
-
-	/*
-	 * Requires a mmap handler. This prevents people from using a.out
-	 * as part of an exploit attack against /proc-related vulnerabilities.
-	 */
-	if (!bprm->file->f_op->mmap)
-		return -ENOEXEC;
-
-	fd_offset = N_TXTOFF(ex);
-
-	/* Check initial limits. This avoids letting people circumvent
-	 * size limits imposed on them by creating programs with large
-	 * arrays in the data or bss.
-	 */
-	rlim = rlimit(RLIMIT_DATA);
-	if (rlim >= RLIM_INFINITY)
-		rlim = ~0;
-	if (ex.a_data + ex.a_bss > rlim)
-		return -ENOMEM;
-
-	/* Flush all traces of the currently running executable */
-	retval = begin_new_exec(bprm);
-	if (retval)
-		return retval;
-
-	/* OK, This is the point of no return */
-#ifdef __alpha__
-	SET_AOUT_PERSONALITY(bprm, ex);
-#else
-	set_personality(PER_LINUX);
-#endif
-	setup_new_exec(bprm);
-
-	current->mm->end_code = ex.a_text +
-		(current->mm->start_code = N_TXTADDR(ex));
-	current->mm->end_data = ex.a_data +
-		(current->mm->start_data = N_DATADDR(ex));
-	current->mm->brk = ex.a_bss +
-		(current->mm->start_brk = N_BSSADDR(ex));
-
-	retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-	if (retval < 0)
-		return retval;
-
-
-	if (N_MAGIC(ex) == OMAGIC) {
-		unsigned long text_addr, map_size;
-		loff_t pos;
-
-		text_addr = N_TXTADDR(ex);
-
-#ifdef __alpha__
-		pos = fd_offset;
-		map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
-#else
-		pos = 32;
-		map_size = ex.a_text+ex.a_data;
-#endif
-		error = vm_brk(text_addr & PAGE_MASK, map_size);
-		if (error)
-			return error;
-
-		error = read_code(bprm->file, text_addr, pos,
-				  ex.a_text+ex.a_data);
-		if ((signed long)error < 0)
-			return error;
-	} else {
-		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
-		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
-		{
-			printk(KERN_NOTICE "executable not page aligned\n");
-		}
-
-		if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
-		{
-			printk(KERN_WARNING 
-			       "fd_offset is not page aligned. Please convert program: %pD\n",
-			       bprm->file);
-		}
-
-		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
-			error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			if (error)
-				return error;
-
-			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
-				  ex.a_text + ex.a_data);
-			goto beyond_if;
-		}
-
-		error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
-			PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE,
-			fd_offset);
-
-		if (error != N_TXTADDR(ex))
-			return error;
-
-		error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
-				PROT_READ | PROT_WRITE | PROT_EXEC,
-				MAP_FIXED | MAP_PRIVATE,
-				fd_offset + ex.a_text);
-		if (error != N_DATADDR(ex))
-			return error;
-	}
-beyond_if:
-	set_binfmt(&aout_format);
-
-	retval = set_brk(current->mm->start_brk, current->mm->brk);
-	if (retval < 0)
-		return retval;
-
-	current->mm->start_stack =
-		(unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
-#ifdef __alpha__
-	regs->gp = ex.a_gpvalue;
-#endif
-	finalize_exec(bprm);
-	start_thread(regs, ex.a_entry, current->mm->start_stack);
-	return 0;
-}
-
-static int load_aout_library(struct file *file)
-{
-	struct inode * inode;
-	unsigned long bss, start_addr, len;
-	unsigned long error;
-	int retval;
-	struct exec ex;
-	loff_t pos = 0;
-
-	inode = file_inode(file);
-
-	retval = -ENOEXEC;
-	error = kernel_read(file, &ex, sizeof(ex), &pos);
-	if (error != sizeof(ex))
-		goto out;
-
-	/* We come in here for the regular a.out style of shared libraries */
-	if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) ||
-	    N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) ||
-	    i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) {
-		goto out;
-	}
-
-	/*
-	 * Requires a mmap handler. This prevents people from using a.out
-	 * as part of an exploit attack against /proc-related vulnerabilities.
-	 */
-	if (!file->f_op->mmap)
-		goto out;
-
-	if (N_FLAGS(ex))
-		goto out;
-
-	/* For  QMAGIC, the starting address is 0x20 into the page.  We mask
-	   this off to get the starting address for the page */
-
-	start_addr =  ex.a_entry & 0xfffff000;
-
-	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
-		if (printk_ratelimit())
-		{
-			printk(KERN_WARNING 
-			       "N_TXTOFF is not page aligned. Please convert library: %pD\n",
-			       file);
-		}
-		retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-		if (retval)
-			goto out;
-
-		read_code(file, start_addr, N_TXTOFF(ex),
-			  ex.a_text + ex.a_data);
-		retval = 0;
-		goto out;
-	}
-	/* Now use mmap to map the library into memory. */
-	error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
-			PROT_READ | PROT_WRITE | PROT_EXEC,
-			MAP_FIXED | MAP_PRIVATE,
-			N_TXTOFF(ex));
-	retval = error;
-	if (error != start_addr)
-		goto out;
-
-	len = PAGE_ALIGN(ex.a_text + ex.a_data);
-	bss = ex.a_text + ex.a_data + ex.a_bss;
-	if (bss > len) {
-		retval = vm_brk(start_addr + len, bss - len);
-		if (retval)
-			goto out;
-	}
-	retval = 0;
-out:
-	return retval;
-}
-
-static int __init init_aout_binfmt(void)
-{
-	register_binfmt(&aout_format);
-	return 0;
-}
-
-static void __exit exit_aout_binfmt(void)
-{
-	unregister_binfmt(&aout_format);
-}
-
-core_initcall(init_aout_binfmt);
-module_exit(exit_aout_binfmt);
-MODULE_LICENSE("GPL");
diff --git a/fs/exec.c b/fs/exec.c
index d046dbb9cbd0..69a572fc57db 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -957,8 +957,7 @@ struct file *open_exec(const char *name)
 }
 EXPORT_SYMBOL(open_exec);
 
-#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \
-    defined(CONFIG_BINFMT_ELF_FDPIC)
+#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC)
 ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 {
 	ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
diff --git a/include/linux/a.out.h b/include/linux/a.out.h
deleted file mode 100644
index 600cf45645c6..000000000000
--- a/include/linux/a.out.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __A_OUT_GNU_H__
-#define __A_OUT_GNU_H__
-
-#include <uapi/linux/a.out.h>
-
-#ifndef __ASSEMBLY__
-#ifdef linux
-#include <asm/page.h>
-#if defined(__i386__) || defined(__mc68000__)
-#else
-#ifndef SEGMENT_SIZE
-#define SEGMENT_SIZE	PAGE_SIZE
-#endif
-#endif
-#endif
-#endif /*__ASSEMBLY__ */
-#endif /* __A_OUT_GNU_H__ */
-- 
cgit v1.2.3


From 4ca165fc6c49c3b0100f61524ffbca4743d46e8d Mon Sep 17 00:00:00 2001
From: Lontke Michael <michael.lontke@elektrobit.com>
Date: Wed, 31 Aug 2022 14:03:26 +0200
Subject: SMACK: Add sk_clone_security LSM hook

Using smk_of_current() during sk_alloc_security hook leads in
rare cases to a faulty initialization of the security context
of the created socket.

By adding the LSM hook sk_clone_security to SMACK this initialization
fault is corrected by copying the security context of the old socket
pointer to the newly cloned one.

Co-authored-by: Martin Ostertag: <martin.ostertag@elektrobit.com>
Signed-off-by: Lontke Michael <michael.lontke@elektrobit.com>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
---
 security/smack/smack_lsm.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 001831458fa2..077bf6fd569b 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2278,6 +2278,21 @@ static void smack_sk_free_security(struct sock *sk)
 	kfree(sk->sk_security);
 }
 
+/**
+ * smack_sk_clone_security - Copy security context
+ * @sk: the old socket
+ * @newsk: the new socket
+ *
+ * Copy the security context of the old socket pointer to the cloned
+ */
+static void smack_sk_clone_security(const struct sock *sk, struct sock *newsk)
+{
+	struct socket_smack *ssp_old = sk->sk_security;
+	struct socket_smack *ssp_new = newsk->sk_security;
+
+	*ssp_new = *ssp_old;
+}
+
 /**
 * smack_ipv4host_label - check host based restrictions
 * @sip: the object end
@@ -4851,6 +4866,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = {
 	LSM_HOOK_INIT(socket_getpeersec_dgram, smack_socket_getpeersec_dgram),
 	LSM_HOOK_INIT(sk_alloc_security, smack_sk_alloc_security),
 	LSM_HOOK_INIT(sk_free_security, smack_sk_free_security),
+	LSM_HOOK_INIT(sk_clone_security, smack_sk_clone_security),
 	LSM_HOOK_INIT(sock_graft, smack_sock_graft),
 	LSM_HOOK_INIT(inet_conn_request, smack_inet_conn_request),
 	LSM_HOOK_INIT(inet_csk_clone, smack_inet_csk_clone),
-- 
cgit v1.2.3


From d3f84f5c9627576b555976c7584514a2ca3ed02e Mon Sep 17 00:00:00 2001
From: Xu Panda <xu.panda@zte.com.cn>
Date: Mon, 12 Sep 2022 10:05:36 +0000
Subject: smack: lsm: remove the unneeded result variable

Return the value smk_ptrace_rule_check() directly instead of storing it
in another redundant variable.

Reported-by: Zeal Robot <zealci@zte.com.cn>
Signed-off-by: Xu Panda <xu.panda@zte.com.cn>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
---
 security/smack/smack_lsm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 077bf6fd569b..de7db0b18cd5 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -496,13 +496,11 @@ static int smack_ptrace_access_check(struct task_struct *ctp, unsigned int mode)
  */
 static int smack_ptrace_traceme(struct task_struct *ptp)
 {
-	int rc;
 	struct smack_known *skp;
 
 	skp = smk_of_task(smack_cred(current_cred()));
 
-	rc = smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__);
-	return rc;
+	return smk_ptrace_rule_check(ptp, skp, PTRACE_MODE_ATTACH, __func__);
 }
 
 /**
-- 
cgit v1.2.3


From cc71271f5b793d619f8a7d2ef905374102533c75 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Thu, 8 Sep 2022 18:35:43 +0800
Subject: smack: cleanup obsolete mount option flags

These mount option flags are obsolete since commit 12085b14a444 ("smack:
switch to private smack_mnt_opts"), remove them.

Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
---
 security/smack/smack.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/security/smack/smack.h b/security/smack/smack.h
index fc837dcebf96..e2239be7bd60 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -180,15 +180,6 @@ struct smack_known_list_elem {
 	struct smack_known	*smk_label;
 };
 
-/* Super block security struct flags for mount options */
-#define FSDEFAULT_MNT	0x01
-#define FSFLOOR_MNT	0x02
-#define FSHAT_MNT	0x04
-#define FSROOT_MNT	0x08
-#define FSTRANS_MNT	0x10
-
-#define NUM_SMK_MNT_OPTS	5
-
 enum {
 	Opt_error = -1,
 	Opt_fsdefault = 0,
-- 
cgit v1.2.3


From 87dbdc230d162bf9ee1ac77c8ade178b6b1e199e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 26 Sep 2022 21:29:39 -0700
Subject: libbpf: Don't require full struct enum64 in UAPI headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop the requirement for system-wide kernel UAPI headers to provide full
struct btf_enum64 definition. This is an unexpected requirement that
slipped in libbpf 1.0 and put unnecessary pressure ([0]) on users to have
a bleeding-edge kernel UAPI header from unreleased Linux 6.0.

To achieve this, we forward declare struct btf_enum64. But that's not
enough as there is btf_enum64_value() helper that expects to know the
layout of struct btf_enum64. So we get a bit creative with
reinterpreting memory layout as array of __u32 and accesing lo32/hi32
fields as array elements. Alternative way would be to have a local
pointer variable for anonymous struct with exactly the same layout as
struct btf_enum64, but that gets us into C++ compiler errors complaining
about invalid type casts. So play it safe, if ugly.

  [0] Closes: https://github.com/libbpf/libbpf/issues/562

Fixes: d90ec262b35b ("libbpf: Add enum64 support for btf_dump")
Reported-by: Toke Høiland-Jørgensen <toke@toke.dk>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@toke.dk>
Link: https://lore.kernel.org/bpf/20220927042940.147185-1-andrii@kernel.org
---
 tools/lib/bpf/btf.h | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index ae543144ee30..8e6880d91c84 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -486,6 +486,8 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t)
 	return (struct btf_enum *)(t + 1);
 }
 
+struct btf_enum64;
+
 static inline struct btf_enum64 *btf_enum64(const struct btf_type *t)
 {
 	return (struct btf_enum64 *)(t + 1);
@@ -493,7 +495,28 @@ static inline struct btf_enum64 *btf_enum64(const struct btf_type *t)
 
 static inline __u64 btf_enum64_value(const struct btf_enum64 *e)
 {
-	return ((__u64)e->val_hi32 << 32) | e->val_lo32;
+	/* struct btf_enum64 is introduced in Linux 6.0, which is very
+	 * bleeding-edge. Here we are avoiding relying on struct btf_enum64
+	 * definition coming from kernel UAPI headers to support wider range
+	 * of system-wide kernel headers.
+	 *
+	 * Given this header can be also included from C++ applications, that
+	 * further restricts C tricks we can use (like using compatible
+	 * anonymous struct). So just treat struct btf_enum64 as
+	 * a three-element array of u32 and access second (lo32) and third
+	 * (hi32) elements directly.
+	 *
+	 * For reference, here is a struct btf_enum64 definition:
+	 *
+	 * const struct btf_enum64 {
+	 *	__u32	name_off;
+	 *	__u32	val_lo32;
+	 *	__u32	val_hi32;
+	 * };
+	 */
+	const __u32 *e64 = (const __u32 *)e;
+
+	return ((__u64)e64[2] << 32) | e64[1];
 }
 
 static inline struct btf_member *btf_members(const struct btf_type *t)
-- 
cgit v1.2.3


From df8238629112b8f61f96367f84d5b236051d8c93 Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Tue, 2 Aug 2022 11:03:06 +0800
Subject: docs/zh_CN: Update zh_CN/process/email-clients.rst to 5.19

* update to commit cbf4adfd4d19 ("Documentation: process: Update email
  client instructions for Thunderbird")
* clean the whole translation

Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/3a19acf5929357f2702ac1e3538d1a9cc0085cc0.1659406843.git.bobwxc@email.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/process/email-clients.rst   | 263 +++++++++++++--------
 1 file changed, 170 insertions(+), 93 deletions(-)

diff --git a/Documentation/translations/zh_CN/process/email-clients.rst b/Documentation/translations/zh_CN/process/email-clients.rst
index 102023651118..fa113f071855 100644
--- a/Documentation/translations/zh_CN/process/email-clients.rst
+++ b/Documentation/translations/zh_CN/process/email-clients.rst
@@ -1,17 +1,18 @@
-﻿.. _cn_email_clients:
+.. SPDX-License-Identifier: GPL-2.0-or-later
 
 .. include:: ../disclaimer-zh_CN.rst
 
-:Original: :ref:`Documentation/process/email-clients.rst <email_clients>`
+:Original: Documentation/process/email-clients.rst
 
-译者::
+:译者:
+ - 贾威威  Harry Wei <harryxiyou@gmail.com>
+ - 时奎亮  Alex Shi <alexs@kernel.org>
+ - 吴想成  Wu XiangCheng <bobwxc@email.cn>
 
-        中文版维护者： 贾威威  Harry Wei <harryxiyou@gmail.com>
-        中文版翻译者： 贾威威  Harry Wei <harryxiyou@gmail.com>
-                       时奎亮  Alex Shi <alex.shi@linux.alibaba.com>
-        中文版校译者： Yinglin Luan <synmyth@gmail.com>
-        	       Xiaochen Wang <wangxiaochen0@gmail.com>
-                       yaxinsn <yaxinsn@163.com>
+:校译:
+ - Yinglin Luan <synmyth@gmail.com>
+ - Xiaochen Wang <wangxiaochen0@gmail.com>
+ - yaxinsn <yaxinsn@163.com>
 
 Linux邮件客户端配置信息
 =======================
@@ -27,12 +28,17 @@ Git
 改日志。如果工作正常，再将补丁发送到相应的邮件列表。
 
 
-普通配置
+通用配置
 --------
+
 Linux内核补丁是通过邮件被提交的，最好把补丁作为邮件体的内嵌文本。有些维护者
 接收附件，但是附件的内容格式应该是"text/plain"。然而，附件一般是不赞成的，
 因为这会使补丁的引用部分在评论过程中变的很困难。
 
+同时也强烈建议在补丁或其他邮件的正文中使用纯文本格式。https://useplaintext.email
+有助于了解如何配置你喜欢的邮件客户端，并在您还没有首选的情况下列出一些推荐的
+客户端。
+
 用来发送Linux内核补丁的邮件客户端在发送补丁时应该处于文本的原始状态。例如，
 他们不能改变或者删除制表符或者空格，甚至是在每一行的开头或者结尾。
 
@@ -40,17 +46,17 @@ Linux内核补丁是通过邮件被提交的，最好把补丁作为邮件体的
 
 不要让你的邮件客户端进行自动换行。这样也会破坏你的补丁。
 
-邮件客户端不能改变文本的字符集编码方式。要发送的补丁只能是ASCII或者UTF-8编码方式，
-如果你使用UTF-8编码方式发送邮件，那么你将会避免一些可能发生的字符集问题。
+邮件客户端不能改变文本的字符集编码方式。要发送的补丁只能是ASCII或者UTF-8编码
+方式，如果你使用UTF-8编码方式发送邮件，那么你将会避免一些可能发生的字符集问题。
 
-邮件客户端应该形成并且保持 References: 或者 In-Reply-To: 标题，那么
-邮件话题就不会中断。
+邮件客户端应该生成并且保持“References:”或者“In-Reply-To:”邮件头，这样邮件会话
+就不会中断。
 
-复制粘帖(或者剪贴粘帖)通常不能用于补丁，因为制表符会转换为空格。使用xclipboard, xclip
-或者xcutsel也许可以，但是最好测试一下或者避免使用复制粘帖。
+复制粘帖(或者剪贴粘帖)通常不能用于补丁，因为制表符会转换为空格。使用xclipboard,
+xclip或者xcutsel也许可以，但是最好测试一下或者避免使用复制粘帖。
 
-不要在使用PGP/GPG署名的邮件中包含补丁。这样会使得很多脚本不能读取和适用于你的补丁。
-（这个问题应该是可以修复的）
+不要在使用PGP/GPG签名的邮件中包含补丁。这样会使得很多脚本不能读取和适用于你的
+补丁。（这个问题应该是可以修复的）
 
 在给内核邮件列表发送补丁之前，给自己发送一个补丁是个不错的主意，保存接收到的
 邮件，将补丁用'patch'命令打上，如果成功了，再给内核邮件列表发送。
@@ -58,98 +64,133 @@ Linux内核补丁是通过邮件被提交的，最好把补丁作为邮件体的
 
 一些邮件客户端提示
 ------------------
+
 这里给出一些详细的MUA配置提示，可以用于给Linux内核发送补丁。这些并不意味是
 所有的软件包配置总结。
 
 说明：
-TUI = 以文本为基础的用户接口
-GUI = 图形界面用户接口
+
+- TUI = 以文本为基础的用户接口
+- GUI = 图形界面用户接口
 
 Alpine (TUI)
-~~~~~~~~~~~~
+************
 
 配置选项：
-在"Sending Preferences"部分：
 
-- "Do Not Send Flowed Text"必须开启
-- "Strip Whitespace Before Sending"必须关闭
+在 :menuselection:`Sending Preferences` 菜单：
+
+- :menuselection:`Do Not Send Flowed Text` 必须开启
+- :menuselection:`Strip Whitespace Before Sending` 必须关闭
+
+当写邮件时，光标应该放在补丁会出现的地方，然后按下 :kbd:`CTRL-R` 组合键，使指
+定的补丁文件嵌入到邮件中。
+
+Claws Mail (GUI)
+****************
+
+可以用，有人用它成功地发过补丁。
+
+用 :menuselection:`Message-->Insert File` (:kbd:`CTRL-I`) 或外置编辑器插入补丁。
 
-当写邮件时，光标应该放在补丁会出现的地方，然后按下CTRL-R组合键，使指定的
-补丁文件嵌入到邮件中。
+若要在Claws编辑窗口重修改插入的补丁，需关闭
+:menuselection:`Configuration-->Preferences-->Compose-->Wrapping`
+的 `Auto wrapping` 。
 
 Evolution (GUI)
-~~~~~~~~~~~~~~~
+***************
 
-一些开发者成功的使用它发送补丁
+一些开发者成功的使用它发送补丁。
 
-当选择邮件选项：Preformat
-  从Format->Heading->Preformatted (Ctrl-7)或者工具栏
+撰写邮件时：
+从 :menuselection:`格式-->段落样式-->预格式化` (:kbd:`CTRL-7`)
+或工具栏选择 :menuselection:`预格式化` ；
 
 然后使用：
-  Insert->Text File... (Alt-n x)插入补丁文件。
+:menuselection:`插入-->文本文件...` (:kbd:`ALT-N x`) 插入补丁文件。
 
-你还可以"diff -Nru old.c new.c | xclip"，选择Preformat，然后使用中间键进行粘帖。
+你还可以 ``diff -Nru old.c new.c | xclip`` ，选择 :menuselection:`预格式化` ，
+然后使用鼠标中键进行粘帖。
 
 Kmail (GUI)
-~~~~~~~~~~~
+***********
 
 一些开发者成功的使用它发送补丁。
 
-默认设置不为HTML格式是合适的；不要启用它。
+默认撰写设置禁用HTML格式是合适的；不要启用它。
+
+当书写一封邮件的时候，在选项下面不要选择自动换行。唯一的缺点就是你在邮件中输
+入的任何文本都不会被自动换行，因此你必须在发送补丁之前手动换行。最简单的方法
+就是启用自动换行来书写邮件，然后把它保存为草稿。一旦你在草稿中再次打开它，它
+已经全部自动换行了，那么你的邮件虽然没有选择自动换行，但是还不会失去已有的自
+动换行。
 
-当书写一封邮件的时候，在选项下面不要选择自动换行。唯一的缺点就是你在邮件中输入的任何文本
-都不会被自动换行，因此你必须在发送补丁之前手动换行。最简单的方法就是启用自动换行来书写邮件，
-然后把它保存为草稿。一旦你在草稿中再次打开它，它已经全部自动换行了，那么你的邮件虽然没有
-选择自动换行，但是还不会失去已有的自动换行。
+在邮件的底部，插入补丁之前，放上常用的补丁定界符：三个连字符(``---``)。
 
-在邮件的底部，插入补丁之前，放上常用的补丁定界符：三个连字号(---)。
+然后在 :menuselection:`信件` 菜单，选择 :menuselection:`插入文本文件` ，接
+着选取你的补丁文件。还有一个额外的选项，你可以通过它配置你的创建新邮件工具栏，
+加上 :menuselection:`插入文本文件` 图标。
 
-然后在"Message"菜单条目，选择插入文件，接着选取你的补丁文件。还有一个额外的选项，你可以
-通过它配置你的邮件建立工具栏菜单，还可以带上"insert file"图标。
+将编辑器窗口拉到足够宽避免折行。对于KMail 1.13.5 (KDE 4.5.4)，它会在发送邮件
+时对编辑器窗口中显示折行的地方自动换行。在选项菜单中取消自动换行仍不能解决。
+因此，如果你的补丁中有非常长的行，必须在发送之前把编辑器窗口拉得非常宽。
+参见：https://bugs.kde.org/show_bug.cgi?id=174034
 
-你可以安全地通过GPG标记附件，但是内嵌补丁最好不要使用GPG标记它们。作为内嵌文本的签发补丁，
-当从GPG中提取7位编码时会使他们变的更加复杂。
+你可以安全地用GPG签名附件，但是内嵌补丁最好不要使用GPG签名它们。作为内嵌文本
+插入的签名补丁将使其难以从7-bit编码中提取。
 
-如果你非要以附件的形式发送补丁，那么就右键点击附件，然后选中属性，突出"Suggest automatic
-display"，这样内嵌附件更容易让读者看到。
+如果你非要以附件的形式发送补丁，那么就右键点击附件，然后选择
+:menuselection:`属性` ，打开 :menuselection:`建议自动显示` ，使附件内联更容
+易让读者看到。
 
-当你要保存将要发送的内嵌文本补丁，你可以从消息列表窗格选择包含补丁的邮件，然后右击选择
-"save as"。你可以使用一个没有更改的包含补丁的邮件，如果它是以正确的形式组成。当你正真在它
-自己的窗口之下察看，那时没有选项可以保存邮件--已经有一个这样的bug被汇报到了kmail的bugzilla
-并且希望这将会被处理。邮件是以只针对某个用户可读写的权限被保存的，所以如果你想把邮件复制到其他地方，
-你不得不把他们的权限改为组或者整体可读。
+当你要保存将要发送的内嵌文本补丁，你可以从消息列表窗格选择包含补丁的邮件，然
+后右键选择 :menuselection:`另存为` 。如果整个电子邮件的组成正确，您可直接将
+其作为补丁使用。电子邮件以当前用户可读写权限保存，因此您必须 ``chmod`` ，以
+使其在复制到别处时用户组和其他人可读。
 
 Lotus Notes (GUI)
-~~~~~~~~~~~~~~~~~
+*****************
 
 不要使用它。
 
+IBM Verse (Web GUI)
+*******************
+
+同上条。
+
 Mutt (TUI)
-~~~~~~~~~~
+**********
 
-很多Linux开发人员使用mutt客户端，所以证明它肯定工作的非常漂亮。
+很多Linux开发人员使用mutt客户端，这证明它肯定工作得非常漂亮。
 
-Mutt不自带编辑器，所以不管你使用什么编辑器都不应该带有自动断行。大多数编辑器都带有
-一个"insert file"选项，它可以通过不改变文件内容的方式插入文件。
+Mutt不自带编辑器，所以不管你使用什么编辑器，不自动断行就行。大多数编辑器都有
+:menuselection:`插入文件` 选项，它可以在不改变文件内容的情况下插入文件。
+
+用 ``vim`` 作为mutt的编辑器::
 
-'vim'作为mutt的编辑器：
   set editor="vi"
 
-  如果使用xclip，敲入以下命令
+如果使用xclip，敲入以下命令::
+
   :set paste
-  按中键之前或者shift-insert或者使用
+
+然后再按中键或者shift-insert或者使用::
+
   :r filename
 
-如果想要把补丁作为内嵌文本。
-(a)ttach工作的很好，不带有"set paste"。
+把补丁插入为内嵌文本。
+在未设置  ``set paste`` 时(a)ttach工作的很好。
 
 你可以通过 ``git format-patch`` 生成补丁，然后用 Mutt发送它们::
 
-        $ mutt -H 0001-some-bug-fix.patch
+    $ mutt -H 0001-some-bug-fix.patch
 
 配置选项：
+
 它应该以默认设置的形式工作。
-然而，把"send_charset"设置为"us-ascii::utf-8"也是一个不错的主意。
+然而，把 ``send_charset`` 设置一下也是一个不错的主意::
+
+  set send_charset="us-ascii:utf-8"
 
 Mutt 是高度可配置的。 这里是个使用mutt通过 Gmail 发送的补丁的最小配置::
 
@@ -178,71 +219,107 @@ Mutt 是高度可配置的。 这里是个使用mutt通过 Gmail 发送的补丁
   set from = "username@gmail.com"
   set use_from = yes
 
-Mutt文档含有更多信息:
+Mutt文档含有更多信息：
 
-    http://dev.mutt.org/trac/wiki/UseCases/Gmail
+    https://gitlab.com/muttmua/mutt/-/wikis/UseCases/Gmail
 
-    http://dev.mutt.org/doc/manual.html
+    http://www.mutt.org/doc/manual/
 
 Pine (TUI)
-~~~~~~~~~~
+**********
 
 Pine过去有一些空格删减问题，但是这些现在应该都被修复了。
 
-如果可以，请使用alpine(pine的继承者)
+如果可以，请使用alpine（pine的继承者）。
 
 配置选项：
-- 最近的版本需要消除流程文本
-- "no-strip-whitespace-before-send"选项也是需要的。
+
+- 最近的版本需要 ``quell-flowed-text``
+- ``no-strip-whitespace-before-send`` 选项也是需要的。
 
 
 Sylpheed (GUI)
-~~~~~~~~~~~~~~
+**************
 
 - 内嵌文本可以很好的工作（或者使用附件）。
 - 允许使用外部的编辑器。
-- 对于目录较多时非常慢。
+- 收件箱较多时非常慢。
 - 如果通过non-SSL连接，无法使用TLS SMTP授权。
-- 在组成窗口中有一个很有用的ruler bar。
-- 给地址本中添加地址就不会正确的了解显示名。
+- 撰写窗口的标尺很有用。
+- 将地址添加到通讯簿时无法正确理解显示的名称。
 
 Thunderbird (GUI)
-~~~~~~~~~~~~~~~~~
+*****************
+
+Thunderbird是Outlook的克隆版本，它很容易损坏文本，但也有一些方法强制修正。
+
+在完成修改后（包括安装扩展），您需要重新启动Thunderbird。
+
+- 允许使用外部编辑器：
+
+  使用Thunderbird发补丁最简单的方法是使用扩展来打开您最喜欢的外部编辑器。
+
+  下面是一些能够做到这一点的扩展样例。
+
+  - “External Editor Revived”
+
+    https://github.com/Frederick888/external-editor-revived
+
+    https://addons.thunderbird.net/en-GB/thunderbird/addon/external-editor-revived/
+
+    它需要安装“本地消息主机（native messaging host）”。
+    参见以下文档:
+    https://github.com/Frederick888/external-editor-revived/wiki
+
+  - “External Editor”
+
+    https://github.com/exteditor/exteditor
+
+    下载并安装此扩展，然后打开 :menuselection:`新建消息` 窗口, 用
+    :menuselection:`查看-->工具栏-->自定义...` 给它增加一个按钮，直接点击此
+    按钮即可使用外置编辑器。
+
+    请注意，“External Editor”要求你的编辑器不能fork，换句话说，编辑器必须在
+    关闭前不返回。你可能需要传递额外的参数或修改编辑器设置。最值得注意的是，
+    如果您使用的是gvim，那么您必须将 :menuselection:`external editor` 设置的
+    编辑器字段设置为 ``/usr/bin/gvim --nofork"`` （假设可执行文件在
+    ``/usr/bin`` ），以传递 ``-f`` 参数。如果您正在使用其他编辑器，请阅读其
+    手册了解如何处理。
 
-默认情况下，thunderbird很容易损坏文本，但是还有一些方法可以强制它变得更好。
+若要修正内部编辑器，请执行以下操作：
 
-- 在用户帐号设置里，组成和寻址，不要选择"Compose messages in HTML format"。
+- 修改你的Thunderbird设置，不要使用 ``format=flowed`` ！
+  回到主窗口，按照
+  :menuselection:`主菜单-->首选项-->常规-->配置编辑器...`
+  打开Thunderbird的配置编辑器。
 
-- 编辑你的Thunderbird配置设置来使它不要拆行使用：user_pref("mailnews.wraplength", 0);
+  - 将 ``mailnews.send_plaintext_flowed`` 设为 ``false``
 
-- 编辑你的Thunderbird配置设置，使它不要使用"format=flowed"格式：user_pref("mailnews.
-  send_plaintext_flowed", false);
+  - 将 ``mailnews.wraplength`` 从 ``72`` 改为 ``0``
 
-- 你需要使Thunderbird变为预先格式方式：
-  如果默认情况下你书写的是HTML格式，那不是很难。仅仅从标题栏的下拉框中选择"Preformat"格式。
-  如果默认情况下你书写的是文本格式，你不得把它改为HTML格式（仅仅作为一次性的）来书写新的消息，
-  然后强制使它回到文本格式，否则它就会拆行。要实现它，在写信的图标上使用shift键来使它变为HTML
-  格式，然后标题栏的下拉框中选择"Preformat"格式。
+- 不要写HTML邮件！
+  回到主窗口，打开
+  :menuselection:`主菜单-->账户设置-->你的@邮件.地址-->通讯录/编写&地址簿` ，
+  关掉 ``以HTML格式编写消息`` 。
 
-- 允许使用外部的编辑器：
-  针对Thunderbird打补丁最简单的方法就是使用一个"external editor"扩展，然后使用你最喜欢的
-  $EDITOR来读取或者合并补丁到文本中。要实现它，可以下载并且安装这个扩展，然后添加一个使用它的
-  按键View->Toolbars->Customize...最后当你书写信息的时候仅仅点击它就可以了。
+- 只用纯文本格式查看邮件！
+  回到主窗口， :menuselection:`主菜单-->查看-->消息体为-->纯文本` ！
 
 TkRat (GUI)
-~~~~~~~~~~~
+***********
 
 可以使用它。使用"Insert file..."或者外部的编辑器。
 
 Gmail (Web GUI)
-~~~~~~~~~~~~~~~
+***************
 
 不要使用它发送补丁。
 
 Gmail网页客户端自动地把制表符转换为空格。
 
-虽然制表符转换为空格问题可以被外部编辑器解决，同时它还会使用回车换行把每行拆分为78个字符。
+虽然制表符转换为空格问题可以被外部编辑器解决，但它同时还会使用回车换行把每行
+拆分为78个字符。
 
-另一个问题是Gmail还会把任何不是ASCII的字符的信息改为base64编码。它把东西变的像欧洲人的名字。
+另一个问题是Gmail还会把任何含有非ASCII的字符的消息改用base64编码，如欧洲人的
+名字。
 
-                                ###
-- 
cgit v1.2.3


From 7114fe6f906cdec2d263c1c84cd0cf5f0726652f Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Tue, 2 Aug 2022 11:03:40 +0800
Subject: docs/zh_CN: Update zh_CN/process/submitting-patches.rst to 5.19

* update to commit 9db370de2780 ("docs: process: remove outdated
  submitting-drivers.rst")
* clean and reconstruct the whole translation

Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/717baee07920d3cecf09197a10c973dd46089fcb.1659406843.git.bobwxc@email.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../zh_CN/process/submitting-patches.rst           | 712 ++++++++++-----------
 1 file changed, 355 insertions(+), 357 deletions(-)

diff --git a/Documentation/translations/zh_CN/process/submitting-patches.rst b/Documentation/translations/zh_CN/process/submitting-patches.rst
index ebb7f37575c1..26565abcbd8d 100644
--- a/Documentation/translations/zh_CN/process/submitting-patches.rst
+++ b/Documentation/translations/zh_CN/process/submitting-patches.rst
@@ -1,142 +1,90 @@
-.. _cn_submittingpatches:
+.. SPDX-License-Identifier: GPL-2.0-or-later
 
 .. include:: ../disclaimer-zh_CN.rst
 
-:Original: :ref:`Documentation/process/submitting-patches.rst <submittingpatches>`
+:Original: Documentation/process/submitting-patches.rst
 
-译者::
+:译者:
+ - 钟宇 TripleX Chung <xxx.phy@gmail.com>
+ - 时奎亮 Alex Shi <alexs@kernel.org>
+ - 吴想成 Wu XiangCheng <bobwxc@email.cn>
 
-        中文版维护者： 钟宇 TripleX Chung <xxx.phy@gmail.com>
-        中文版翻译者： 钟宇 TripleX Chung <xxx.phy@gmail.com>
-                       时奎亮 Alex Shi <alex.shi@linux.alibaba.com>
-        中文版校译者： 李阳 Li Yang <leoyang.li@nxp.com>
-                       王聪 Wang Cong <xiyou.wangcong@gmail.com>
+:校译:
+ - 李阳 Li Yang <leoyang.li@nxp.com>
+ - 王聪 Wang Cong <xiyou.wangcong@gmail.com>
 
 
-如何让你的改动进入内核
-======================
+提交补丁：如何让你的改动进入内核
+================================
 
 对于想要将改动提交到 Linux 内核的个人或者公司来说，如果不熟悉“规矩”，
-提交的流程会让人畏惧。本文档收集了一系列建议，这些建议可以大大的提高你
+提交的流程会让人畏惧。本文档包含了一系列建议，可以大大提高你
 的改动被接受的机会.
 
-以下文档含有大量简洁的建议， 具体请见：
-:ref:`Documentation/process <development_process_main>`
-同样，:ref:`Documentation/translations/zh_CN/process/submit-checklist.rst <cn_submitchecklist>`
-给出在提交代码前需要检查的项目的列表。
+本文档以较为简洁的行文给出了大量建议。关于内核开发流程如何进行的详细信息，
+参见： Documentation/translations/zh_CN/process/development-process.rst 。
+Documentation/translations/zh_CN/process/submit-checklist.rst 给出了一系列
+提交补丁之前要检查的事项。设备树相关的补丁，请参阅
+Documentation/devicetree/bindings/submitting-patches.rst 。
 
-其中许多步骤描述了Git版本控制系统的默认行为；如果您使用Git来准备补丁，
-您将发现它为您完成的大部分机械工作，尽管您仍然需要准备和记录一组合理的
-补丁。一般来说，使用git将使您作为内核开发人员的生活更轻松。
+本文档假设您正在使用 ``git`` 准备你的补丁。如果您不熟悉 ``git`` ，最好学习
+如何使用它，这将使您作为内核开发人员的生活变得更加轻松。
 
+部分子系统和维护人员的树有一些关于其工作流程和要求的额外信息，请参阅
+Documentation/process/maintainer-handbooks.rst 。
 
-0) 获取当前源码树
------------------
+获取当前源码树
+--------------
 
-如果您没有一个可以使用当前内核源代码的存储库，请使用git获取一个。您将要
-从主线存储库开始，它可以通过以下方式获取::
+如果您手头没有当前内核源代码的存储库，请使用 ``git`` 获取一份。您需要先获取
+主线存储库，它可以通过以下命令拉取::
 
-        git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
+    git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
 
-但是，请注意，您可能不希望直接针对主线树进行开发。大多数子系统维护人员运
+但是，请注意，您可能不想直接针对主线树进行开发。大多数子系统维护人员运
 行自己的树，并希望看到针对这些树准备的补丁。请参见MAINTAINERS文件中子系
-统的 **T:** 项以查找该树，或者简单地询问维护者该树是否未在其中列出。
+统的 **T:** 项以查找该树，或者直接询问维护者该树是否未在其中列出。
 
-仍然可以通过tarballs下载内核版本（如下一节所述），但这是进行内核开发的
-一种困难的方式。
+.. _zh_describe_changes:
 
-1) "diff -up"
--------------
-
-使用 "diff -up" 或者 "diff -uprN" 来创建补丁。
-
-所有内核的改动，都是以补丁的形式呈现的，补丁由 diff(1) 生成。创建补丁的
-时候，要确认它是以 "unified diff" 格式创建的，这种格式由 diff(1) 的 '-u'
-参数生成。而且，请使用 '-p' 参数，那样会显示每个改动所在的C函数，使得
-产生的补丁容易读得多。补丁应该基于内核源代码树的根目录，而不是里边的任
-何子目录。
-
-为一个单独的文件创建补丁，一般来说这样做就够了::
-
-        SRCTREE=linux
-        MYFILE=drivers/net/mydriver.c
-
-        cd $SRCTREE
-        cp $MYFILE $MYFILE.orig
-        vi $MYFILE      # make your change
-        cd ..
-        diff -up $SRCTREE/$MYFILE{.orig,} > /tmp/patch
-
-为多个文件创建补丁，你可以解开一个没有修改过的内核源代码树，然后和你自
-己的代码树之间做 diff 。例如::
-
-        MYSRC=/devel/linux
-
-        tar xvfz linux-3.19.tar.gz
-        mv linux-3.19 linux-3.19-vanilla
-        diff -uprN -X linux-3.19-vanilla/Documentation/dontdiff \
-                linux-3.19-vanilla $MYSRC > /tmp/patch
-
-"dontdiff" 是内核在编译的时候产生的文件的列表，列表中的文件在 diff(1)
-产生的补丁里会被跳过。
-
-确定你的补丁里没有包含任何不属于这次补丁提交的额外文件。记得在用diff(1)
-生成补丁之后，审阅一次补丁，以确保准确。
-
-如果你的改动很散乱，你应该研究一下如何将补丁分割成独立的部分，将改动分
-割成一系列合乎逻辑的步骤。这样更容易让其他内核开发者审核，如果你想你的
-补丁被接受，这是很重要的。请参阅：
-:ref:`cn_split_changes`
-
-如果你用 ``git`` , ``git rebase -i`` 可以帮助你这一点。如果你不用 ``git``,
-``quilt`` <https://savannah.nongnu.org/projects/quilt> 另外一个流行的选择。
-
-.. _cn_describe_changes:
-
-2) 描述你的改动
----------------
+描述你的改动
+------------
 
 描述你的问题。无论您的补丁是一行错误修复还是5000行新功能，都必须有一个潜在
-的问题激励您完成这项工作。让审稿人相信有一个问题值得解决，让他们读完第一段
-是有意义的。
+的问题激励您完成这项工作。说服审阅者相信有一个问题值得解决，让他们读完第一段
+后就能明白这一点。
 
 描述用户可见的影响。直接崩溃和锁定是相当有说服力的，但并不是所有的错误都那么
-明目张胆。即使在代码审查期间发现了这个问题，也要描述一下您认为它可能对用户产
+明目张胆。即使在代码审阅期间发现了这个问题，也要描述一下您认为它可能对用户产
 生的影响。请记住，大多数Linux安装运行的内核来自二级稳定树或特定于供应商/产品
 的树，只从上游精选特定的补丁，因此请包含任何可以帮助您将更改定位到下游的内容：
 触发的场景、DMESG的摘录、崩溃描述、性能回归、延迟尖峰、锁定等。
 
-量化优化和权衡。如果您声称在性能、内存消耗、堆栈占用空间或二进制大小方面有所
-改进，请包括支持它们的数字。但也要描述不明显的成本。优化通常不是免费的，而是
-在CPU、内存和可读性之间进行权衡；或者，探索性的工作，在不同的工作负载之间进
+质量优化和权衡。如果您声称在性能、内存消耗、堆栈占用空间或二进制大小方面有所
+改进，请包括支持它们的数据。但也要描述不明显的成本。优化通常不是零成本的，而是
+在CPU、内存和可读性之间进行权衡；或者，做探索性的工作，在不同的工作负载之间进
 行权衡。请描述优化的预期缺点，以便审阅者可以权衡成本和收益。
 
-一旦问题建立起来，就要详细地描述一下您实际在做什么。对于审阅者来说，用简单的
-英语描述代码的变化是很重要的，以验证代码的行为是否符合您的意愿。
+提出问题之后，就要详细地描述一下您实际在做的技术细节。对于审阅者来说，用简练的
+英语描述代码的变化是很重要的，以验证代码的行为是否符合您的意图。
 
-如果您将补丁描述写在一个表单中，这个表单可以很容易地作为“提交日志”放入Linux
-的源代码管理系统git中，那么维护人员将非常感谢您。见 :ref:`cn_explicit_in_reply_to`.
+如果您将补丁描述写成“标准格式”，可以很容易地作为“提交日志”放入Linux的源代
+码管理系统 ``git`` 中，那么维护人员将非常感谢您。
+参见 :ref:`zh_the_canonical_patch_format` 。
 
 每个补丁只解决一个问题。如果你的描述开始变长，这就表明你可能需要拆分你的补丁。
-请见 :ref:`cn_split_changes`
+请见 :ref:`zh_split_changes` 。
 
-提交或重新提交修补程序或修补程序系列时，请包括完整的修补程序说明和理由。不要
+提交或重新提交补丁或补丁系列时，请包括完整的补丁说明和理由。不要
 只说这是补丁（系列）的第几版。不要期望子系统维护人员引用更早的补丁版本或引用
 URL来查找补丁描述并将其放入补丁中。也就是说，补丁（系列）及其描述应该是独立的。
-这对维护人员和审查人员都有好处。一些评审者可能甚至没有收到补丁的早期版本。
+这对维护人员和审阅者都有好处。一些审阅者可能甚至没有收到补丁的早期版本。
 
-描述你在命令语气中的变化，例如“make xyzzy do frotz”而不是“[This patch]make
+用祈使句描述你的变更，例如“make xyzzy do frotz”而不是“[This patch]make
 xyzzy do frotz”或“[I]changed xyzzy to do frotz”，就好像你在命令代码库改变
 它的行为一样。
 
-如果修补程序修复了一个记录的bug条目，请按编号和URL引用该bug条目。如果补丁来
-自邮件列表讨论，请给出邮件列表存档的URL；使用带有 ``Message-ID`` 的
-https://lore.kernel.org/ 重定向，以确保链接不会过时。
-
-但是，在没有外部资源的情况下，尽量让你的解释可理解。除了提供邮件列表存档或
-bug的URL之外，还要总结需要提交补丁的相关讨论要点。
-
-如果您想要引用一个特定的提交，不要只引用提交的 SHA-1 ID。还请包括提交的一行
+如果您想要引用一个特定的提交，不要只引用提交的SHA-1 ID。还请包括提交的一行
 摘要，以便于审阅者了解它是关于什么的。例如::
 
         Commit e21d2170f36602ae2708 ("video: remove unnecessary
@@ -144,82 +92,104 @@ bug的URL之外，还要总结需要提交补丁的相关讨论要点。
         platform_set_drvdata(), but left the variable "dev" unused,
         delete it.
 
-您还应该确保至少使用前12位 SHA-1 ID. 内核存储库包含*许多*对象，使与较短的ID
+您还应该确保至少使用前12位SHA-1 ID。内核存储库包含 *许多* 对象，使较短的ID
 发生冲突的可能性很大。记住，即使现在不会与您的六个字符ID发生冲突，这种情况
-可能五年后改变。
+也可能在五年后改变。
+
+如果该变更的相关讨论或背景信息可以在网上查阅，请加上“Link:”标签指向它。例如
+你的补丁修复了一个缺陷，需要添加一个带有URL的标签指向邮件列表存档或缺陷跟踪器
+的相关报告；如果该补丁是由一些早先邮件列表讨论或网络上的记录引起的，请指向它。
 
-如果修补程序修复了特定提交中的错误，例如，使用 ``git bisct`` ，请使用带有前
-12个字符SHA-1 ID 的"Fixes:"标记和单行摘要。为了简化不要将标记拆分为多个，
-行、标记不受分析脚本“75列换行”规则的限制。例如::
+当链接到邮件列表存档是，请首选lore.kernel.org邮件存档服务。用邮件中的
+``Message-ID`` 头（去掉尖括号）可以创建链接URL。例如::
 
-        Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed")
+    Link: https://lore.kernel.org/r/30th.anniversary.repost@klaava.Helsinki.FI/
 
-下列 ``git config`` 设置可以添加让 ``git log``, ``git show`` 漂亮的显示格式::
+请检查该链接以确保可用且指向正确的邮件。
+
+不过，在没有外部资源的情况下，也要尽量让你的解释可理解。除了提供邮件列表存档或
+缺陷的URL之外，还要需要总结该补丁的相关讨论要点。
+
+如果补丁修复了特定提交中的错误，例如使用 ``git bisct`` 发现了一个问题，请使用
+带有前12个字符SHA-1 ID的“Fixes:”标签和单行摘要。为了简化解析脚本，不要将该
+标签拆分为多行，标签不受“75列换行”规则的限制。例如::
+
+  Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed")
+
+下列 ``git config`` 设置可以让 ``git log``, ``git show`` 增加上述风格的显示格式::
 
 	[core]
 		abbrev = 12
 	[pretty]
 		fixes = Fixes: %h (\"%s\")
 
-.. _cn_split_changes:
+使用示例::
 
-3) 拆分你的改动
----------------
+	$ git log -1 --pretty=fixes 54a4f0239f2e
+	Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed")
 
-将每个逻辑更改分隔成一个单独的补丁。
+.. _zh_split_changes:
+
+拆分你的改动
+------------
+
+将每个 **逻辑更改** 拆分成一个单独的补丁。
 
 例如，如果你的改动里同时有bug修正和性能优化，那么把这些改动拆分到两个或
-者更多的补丁文件中。如果你的改动包含对API的修改，并且修改了驱动程序来适
-应这些新的API，那么把这些修改分成两个补丁。
+者更多的补丁文件中。如果你的改动包含对API的修改，并且增加了一个使用该新API
+的驱动，那么把这些修改分成两个补丁。
 
 另一方面，如果你将一个单独的改动做成多个补丁文件，那么将它们合并成一个
 单独的补丁文件。这样一个逻辑上单独的改动只被包含在一个补丁文件里。
 
-如果有一个补丁依赖另外一个补丁来完成它的改动，那没问题。简单的在你的补
-丁描述里指出“这个补丁依赖某补丁”就好了。
+需要记住的一点是，每个补丁的更改都应易于理解，以便审阅者验证。每个补丁都应该
+对其价值进行阐述。
 
-在将您的更改划分为一系列补丁时，要特别注意确保内核在系列中的每个补丁之后
-都能正常构建和运行。使用 ``git bisect`` 来追踪问题的开发者可能会在任何时
-候分割你的补丁系列；如果你在中间引入错误，他们不会感谢你。
+如果有一个补丁依赖另外一个补丁来完成它的改动，那没问题。直接在你的补
+丁描述里指出 **“这个补丁依赖某补丁”** 就好了。
 
-如果你不能将补丁浓缩成更少的文件，那么每次大约发送出15个，然后等待审查
+在将您的更改划分为一系列补丁时，要特别注意确保内核在应用系列中的每个补丁之后
+都能正常构建和运行。使用 ``git bisect`` 来追踪问题的开发者可能会在任何地方分
+割你的补丁系列；如果你在中间引入错误，他们不会感谢你。
+
+如果你不能将补丁系列浓缩得更小，那么每次大约发送出15个补丁，然后等待审阅
 和集成。
 
-4) 检查你的更改风格
--------------------
+检查你的更改风格
+----------------
 
-检查您的补丁是否存在基本样式冲突，详细信息可在
-:ref:`Documentation/translations/zh_CN/process/coding-style.rst <cn_codingstyle>`
-中找到。如果不这样做，只会浪费审稿人的时间，并且会导致你的补丁被拒绝，甚至
+检查您的补丁是否违反了基本样式规定，详细信息参见
+Documentation/translations/zh_CN/process/coding-style.rst
+中找到。如果不这样做，只会浪费审阅者的时间，并且会导致你的补丁被拒绝，甚至
 可能没有被阅读。
 
 一个重要的例外是在将代码从一个文件移动到另一个文件时——在这种情况下，您不应
 该在移动代码的同一个补丁中修改移动的代码。这清楚地描述了移动代码和您的更改
-的行为。这大大有助于审查实际差异，并允许工具更好地跟踪代码本身的历史。
+的行为。这大大有助于审阅实际差异，并允许工具更好地跟踪代码本身的历史。
 
 在提交之前，使用补丁样式检查程序检查补丁（scripts/check patch.pl）。不过，
 请注意，样式检查程序应该被视为一个指南，而不是作为人类判断的替代品。如果您
-的代码看起来更好，但有违规行为，那么最好不要使用它。
+的代码看起来更好，但有违规行为，那么最好别管它。
 
 检查者报告三个级别：
 
  - ERROR：很可能出错的事情
- - WARNING：需要仔细审查的事项
+ - WARNING：需要仔细审阅的事项
  - CHECK：需要思考的事情
 
 您应该能够判断您的补丁中存在的所有违规行为。
 
-5) 选择补丁收件人
------------------
+选择补丁收件人
+--------------
 
-您应该总是在任何补丁上复制相应的子系统维护人员，以获得他们维护的代码；查看
+您应该总是知会任何补丁相应代码的子系统维护人员；查看
 维护人员文件和源代码修订历史记录，以了解这些维护人员是谁。脚本
-scripts/get_Maintainer.pl在这个步骤中非常有用。如果您找不到正在工作的子系统
+scripts/get_maintainer.pl在这个步骤中非常有用。如果您找不到正在工作的子系统
 的维护人员，那么Andrew Morton（akpm@linux-foundation.org）将充当最后的维护
 人员。
 
-您通常还应该选择至少一个邮件列表来接收补丁集的。linux-kernel@vger.kernel.org
-作为最后一个解决办法的列表，但是这个列表上的体积已经引起了许多开发人员的拒绝。
+您通常还应该选择至少一个邮件列表来接收补丁集的副本。linux-kernel@vger.kernel.org
+是所有补丁的默认列表，但是这个列表的流量已经导致了许多开发人员不再看它。
 在MAINTAINERS文件中查找子系统特定的列表；您的补丁可能会在那里得到更多的关注。
 不过，请不要发送垃圾邮件到无关的列表。
 
@@ -229,189 +199,170 @@ http://vger.kernel.org/vger-lists.html 上找到它们的列表。不过，也
 
 不要一次发送超过15个补丁到vger邮件列表！！！！
 
-Linus Torvalds 是决定改动能否进入 Linux 内核的最终裁决者。他的 e-mail
-地址是 <torvalds@linux-foundation.org> 。他收到的 e-mail 很多，所以一般
-的说，最好别给他发 e-mail。
+Linus Torvalds是决定改动能否进入 Linux 内核的最终裁决者。他的邮件地址是
+torvalds@linux-foundation.org 。他收到的邮件很多，所以一般来说最好 **别**
+给他发邮件。
 
-如果您有修复可利用安全漏洞的补丁，请将该补丁发送到 security@kernel.org。对于
-严重的bug，可以考虑短期暂停以允许分销商向用户发布补丁；在这种情况下，显然不应
-将补丁发送到任何公共列表。
+如果您有修复可利用安全漏洞的补丁，请将该补丁发送到 security@kernel.org 。对于
+严重的bug，可以考虑短期禁令以允许分销商（有时间）向用户发布补丁；在这种情况下，
+显然不应将补丁发送到任何公共列表。
+参见 Documentation/translations/zh_CN/admin-guide/security-bugs.rst 。
 
-修复已发布内核中严重错误的补丁程序应该指向稳定版维护人员，方法是放这样的一行::
+修复已发布内核中严重错误的补丁程序应该抄送给稳定版维护人员，方法是把以下列行
+放进补丁的签准区（注意，不是电子邮件收件人）::
 
-        Cc: stable@vger.kernel.org
+  Cc: stable@vger.kernel.org
 
-进入补丁的签准区（注意，不是电子邮件收件人）。除了这个文件之外，您还应该阅读
-:ref:`Documentation/process/stable-kernel-rules.rst <stable_kernel_rules>`
+除了本文件之外，您还应该阅读
+Documentation/translations/zh_CN/process/stable-kernel-rules.rst 。
 
-但是，请注意，一些子系统维护人员希望得出他们自己的结论，即哪些补丁应该被放到
-稳定的树上。尤其是网络维护人员，不希望看到单个开发人员在补丁中添加像上面这样
-的行。
-
-如果更改影响到用户和内核接口，请向手册页维护人员（如维护人员文件中所列）发送
+如果更改影响到用户侧内核接口，请向手册页维护人员（如维护人员文件中所列）发送
 手册页补丁，或至少发送更改通知，以便一些信息进入手册页。还应将用户空间API
-更改复制到 linux-api@vger.kernel.org。
+更改抄送到 linux-api@vger.kernel.org 。
 
 
-6) 没有 MIME 编码，没有链接，没有压缩，没有附件，只有纯文本
------------------------------------------------------------
+不要MIME编码，不要链接，不要压缩，不要附件，只要纯文本
+------------------------------------------------------
 
 Linus 和其他的内核开发者需要阅读和评论你提交的改动。对于内核开发者来说
-，可以“引用”你的改动很重要，使用一般的 e-mail 工具，他们就可以在你的
+，可以“引用”你的改动很重要，使用一般的邮件工具，他们就可以在你的
 代码的任何位置添加评论。
 
-因为这个原因，所有的提交的补丁都是 e-mail 中“内嵌”的。
+因为这个原因，所有的提交的补丁都是邮件中“内嵌”的。最简单（和推荐）的方法就
+是使用 ``git send-email`` 。https://git-send-email.io 有 ``git send-email``
+的交互式教程。
+
+如果你选择不用 ``git send-email`` ：
 
 .. warning::
-   如果你使用剪切-粘贴你的补丁，小心你的编辑器的自动换行功能破坏你的补丁
 
-不要将补丁作为 MIME 编码的附件，不管是否压缩。很多流行的 e-mail 软件不
-是任何时候都将 MIME 编码的附件当作纯文本发送的，这会使得别人无法在你的
-代码中加评论。另外，MIME 编码的附件会让 Linus 多花一点时间来处理，这就
-降低了你的改动被接受的可能性。
+  如果你使用剪切-粘贴你的补丁，小心你的编辑器的自动换行功能破坏你的补丁
 
-例外：如果你的邮递员弄坏了补丁，那么有人可能会要求你使用mime重新发送补丁
+不要将补丁作为MIME编码的附件，不管是否压缩。很多流行的邮件软件不
+是任何时候都将MIME编码的附件当作纯文本发送的，这会使得别人无法在你的
+代码中加评论。另外，MIME编码的附件会让Linus多花一点时间来处理，这就
+降低了你的改动被接受的可能性。
 
-请参阅 :ref:`Documentation/translations/zh_CN/process/email-clients.rst <cn_email_clients>`
-以获取有关配置电子邮件客户端以使其不受影响地发送修补程序的提示。
+例外：如果你的邮路损坏了补丁，那么有人可能会要求你使用MIME重新发送补丁。
 
-7) e-mail 的大小
-----------------
+请参阅 Documentation/translations/zh_CN/process/email-clients.rst
+以获取有关配置电子邮件客户端以使其不受影响地发送补丁的提示。
 
-大的改动对邮件列表不合适，对某些维护者也不合适。如果你的补丁，在不压缩
-的情况下，超过了300kB，那么你最好将补丁放在一个能通过 internet 访问的服
-务器上，然后用指向你的补丁的 URL 替代。但是请注意，如果您的补丁超过了
-300kb，那么它几乎肯定需要被破坏。
+回复审阅意见
+------------
 
-8）回复评审意见
----------------
+你的补丁几乎肯定会得到审阅者对补丁改进方法的评论（以回复邮件的形式）。您必须
+对这些评论作出回应；让补丁被忽略的一个好办法就是忽略审阅者的意见。直接回复邮
+件来回应意见即可。不会导致代码更改的意见或问题几乎肯定会带来注释或变更日志的
+改变，以便下一个审阅者更好地了解正在发生的事情。
 
-你的补丁几乎肯定会得到评审者对补丁改进方法的评论。您必须对这些评论作出
-回应；让补丁被忽略的一个好办法就是忽略审阅者的意见。不会导致代码更改的
-意见或问题几乎肯定会带来注释或变更日志的改变，以便下一个评审者更好地了解
-正在发生的事情。
+一定要告诉审阅者你在做什么改变，并感谢他们的时间。代码审阅是一个累人且耗时的
+过程，审阅者有时会变得暴躁。即使在这种情况下，也要礼貌地回应并解决他们指出的
+问题。当发送下一版时，在封面邮件或独立补丁里加上 ``patch changelog`` 说明与
+前一版本的不同之处（参见 :ref:`zh_the_canonical_patch_format` ）。
 
-一定要告诉审稿人你在做什么改变，并感谢他们的时间。代码审查是一个累人且
-耗时的过程，审查人员有时会变得暴躁。即使在这种情况下，也要礼貌地回应并
-解决他们指出的问题。
+.. _zh_resend_reminders:
 
-9）不要泄气或不耐烦
--------------------
+不要泄气或不耐烦
+----------------
 
-提交更改后，请耐心等待。审阅者是忙碌的人，可能无法立即访问您的修补程序。
+提交更改后，请耐心等待。审阅者是大忙人，可能无法立即审阅您的补丁。
 
-曾几何时，补丁曾在没有评论的情况下消失在空白中，但开发过程比现在更加顺利。
+曾几何时，补丁曾在没收到评论的情况下消失在虚空中，但现在开发过程应该更加顺利了。
 您应该在一周左右的时间内收到评论；如果没有收到评论，请确保您已将补丁发送
-到正确的位置。在重新提交或联系审阅者之前至少等待一周-在诸如合并窗口之类的
+到正确的位置。在重新提交或联系审阅者之前至少等待一周——在诸如合并窗口之类的
 繁忙时间可能更长。
 
-10）主题中包含 PATCH
---------------------
+在等了几个星期后，用带RESEND的主题重发补丁也是可以的::
+
+   [PATCH Vx RESEND] sub/sys: Condensed patch summary
+
+当你发布补丁（系列）修改版的时候，不要加上“RESEND”——“RESEND”只适用于重
+新提交之前未经修改的补丁（系列）。
 
-由于到linus和linux内核的电子邮件流量很高，通常会在主题行前面加上[PATCH]
-前缀. 这使Linus和其他内核开发人员更容易将补丁与其他电子邮件讨论区分开。
+主题中包含 PATCH
+----------------
+
+由于到Linus和linux-kernel的电子邮件流量很高，通常会在主题行前面加上[PATCH]
+前缀。这使Linus和其他内核开发人员更容易将补丁与其他电子邮件讨论区分开。
+
+``git send-email`` 会自动为你加上。
 
-11）签署你的作品-开发者原始认证
--------------------------------
+签署你的作品——开发者来源认证
+------------------------------
 
-为了加强对谁做了何事的追踪，尤其是对那些透过好几层的维护者的补丁，我们
-建议在发送出去的补丁上加一个 “sign-off” 的过程。
+为了加强对谁做了何事的追踪，尤其是对那些透过好几层维护者才最终到达的补丁，我
+们在通过邮件发送的补丁上引入了“签署（sign-off）”流程。
 
-"sign-off" 是在补丁的注释的最后的简单的一行文字，认证你编写了它或者其他
+“签署”是在补丁注释最后的一行简单文字，认证你编写了它或者其他
 人有权力将它作为开放源代码的补丁传递。规则很简单：如果你能认证如下信息:
 
-开发者来源证书 1.1
+开发者来源认证 1.1
 ^^^^^^^^^^^^^^^^^^
 
 对于本项目的贡献，我认证如下信息：
 
-      （a）这些贡献是完全或者部分的由我创建，我有权利以文件中指出
+       (a) 这些贡献是完全或者部分的由我创建，我有权利以文件中指出
            的开放源代码许可证提交它；或者
-      （b）这些贡献基于以前的工作，据我所知，这些以前的工作受恰当的开放
-           源代码许可证保护，而且，根据许可证，我有权提交修改后的贡献，
+
+       (b) 这些贡献基于以前的工作，据我所知，这些以前的工作受恰当的开放
+           源代码许可证保护，而且，根据文件中指出的许可证，我有权提交修改后的贡献，
            无论是完全还是部分由我创造，这些贡献都使用同一个开放源代码许可证
-           （除非我被允许用其它的许可证），正如文件中指出的；或者
-      （c）这些贡献由认证（a），（b）或者（c）的人直接提供给我，而
+           （除非我被允许用其它的许可证）；或者
+
+       (c) 这些贡献由认证（a），（b）或者（c）的人直接提供给我，而
            且我没有修改它。
-      （d）我理解并同意这个项目和贡献是公开的，贡献的记录（包括我
-           一起提交的个人记录，包括 sign-off ）被永久维护并且可以和这个项目
+
+       (d) 我理解并同意这个项目和贡献是公开的，贡献的记录（包括我
+           一起提交的个人记录，包括sign-off）被永久维护并且可以和这个项目
            或者开放源代码的许可证同步地再发行。
 
 那么加入这样一行::
 
-       Signed-off-by: Random J Developer <random@developer.example.org>
-
-使用你的真名（抱歉，不能使用假名或者匿名。）
-
-有人在最后加上标签。现在这些东西会被忽略，但是你可以这样做，来标记公司
-内部的过程，或者只是指出关于 sign-off 的一些特殊细节。
-
-如果您是子系统或分支维护人员，有时需要稍微修改收到的补丁，以便合并它们，
-因为树和提交者中的代码不完全相同。如果你严格遵守规则（c），你应该要求提交者
-重新发布，但这完全是在浪费时间和精力。规则（b）允许您调整代码，但是更改一个
-提交者的代码并让他认可您的错误是非常不礼貌的。要解决此问题，建议在最后一个
-由签名行和您的行之间添加一行，指示更改的性质。虽然这并不是强制性的，但似乎
-在描述前加上您的邮件和/或姓名（全部用方括号括起来），这足以让人注意到您对最
-后一分钟的更改负有责任。例如::
+  Signed-off-by: Random J Developer <random@developer.example.org>
 
-	Signed-off-by: Random J Developer <random@developer.example.org>
-	[lucky@maintainer.example.org: struct foo moved from foo.c to foo.h]
-	Signed-off-by: Lucky K Maintainer <lucky@maintainer.example.org>
+使用你的真名（抱歉，不能使用假名或者匿名。）如果使用 ``git commit -s`` 的话
+将会自动完成。撤销也应当包含“Signed-off-by”， ``git revert -s`` 会帮你搞定。
 
-如果您维护一个稳定的分支机构，同时希望对作者进行致谢、跟踪更改、合并修复并
-保护提交者不受投诉，那么这种做法尤其有用。请注意，在任何情况下都不能更改作者
-的ID（From 头），因为它是出现在更改日志中的标识。
+有些人会在最后加上额外的标签。现在这些东西会被忽略，但是你可以这样做，来标记
+公司内部的过程，或者只是指出关于签署的一些特殊细节。
 
-对回合（back-porters）的特别说明：在提交消息的顶部（主题行之后）插入一个补丁
-的起源指示似乎是一种常见且有用的实践，以便于跟踪。例如，下面是我们在3.x稳定
-版本中看到的内容::
+作者签署之后的任何其他签署（Signed-off-by:'s）均来自处理和传递补丁的人员，但
+未参与其开发。签署链应当反映补丁传播到维护者并最终传播到Linus所经过的 **真实**
+路径，首个签署指明单个作者的主要作者身份。
 
-  Date:   Tue Oct 7 07:26:38 2014 -0400
+何时使用Acked-by:，CC:，和Co-Developed by:
+------------------------------------------
 
-    libata: Un-break ATA blacklist
+Singed-off-by: 标签表示签名者参与了补丁的开发，或者他/她在补丁的传递路径中。
 
-    commit 1c40279960bcd7d52dbdf1d466b20d24b99176c8 upstream.
+如果一个人没有直接参与补丁的准备或处理，但希望表示并记录他们对补丁的批准/赞成，
+那么他们可以要求在补丁的变更日志中添加一个Acked-by:。
 
-还有， 这里是一个旧版内核中的一个回合补丁::
+Acked-by: 通常由受影响代码的维护者使用，当该维护者既没有贡献也没有转发补丁时。
 
-    Date:   Tue May 13 22:12:27 2008 +0200
-
-        wireless, airo: waitbusy() won't delay
-
-        [backport of 2.6 commit b7acbdfbd1f277c1eb23f344f899cfa4cd0bf36a]
-
-12）何时使用Acked-by:，CC:，和Co-Developed by:
-----------------------------------------------
-
-Singed-off-by: 标记表示签名者参与了补丁的开发，或者他/她在补丁的传递路径中。
-
-如果一个人没有直接参与补丁的准备或处理，但希望表示并记录他们对补丁的批准，
-那么他们可以要求在补丁的变更日志中添加一个 Acked-by:
-
-Acked-by：通常由受影响代码的维护者使用，当该维护者既没有贡献也没有转发补丁时。
-
-Acked-by: 不像签字人那样正式。这是一个记录，确认人至少审查了补丁，并表示接受。
-因此，补丁合并有时会手动将Acker的“Yep，looks good to me”转换为 Acked-By：（但
+Acked-by: 不像签署那样正式。这是一个记录，确认人至少审阅了补丁，并表示接受。
+因此，补丁合并有时会手动将Acker的“Yep，looks good to me”转换为 Acked-By:（但
 请注意，通常最好要求一个明确的Ack）。
 
 Acked-by：不一定表示对整个补丁的确认。例如，如果一个补丁影响多个子系统，并且
-有一个：来自一个子系统维护者，那么这通常表示只确认影响维护者代码的部分。这里
-应该仔细判断。如有疑问，应参考邮件列表档案中的原始讨论。
+有一个来自某个子系统维护者的Acked-By:，那么这通常表示只确认影响维护者代码的部
+分。这里应该仔细判断。如有疑问，应参考邮件列表存档中的原始讨论。
 
-如果某人有机会对补丁进行评论，但没有提供此类评论，您可以选择在补丁中添加 ``Cc:``
-这是唯一一个标签，它可以在没有被它命名的人显式操作的情况下添加，但它应该表明
-这个人是在补丁上抄送的。讨论中包含了潜在利益相关方。
+如果某人本应有机会对补丁进行评论，但没有提供此类评论，您可以选择在补丁中添加
+``Cc:`` 这是唯一可以在没有被该人明确同意的情况下添加的标签——但它应该表明
+这个人是在补丁上抄送的。此标签记录了讨论中包含的潜在利益相关方。
 
 Co-developed-by: 声明补丁是由多个开发人员共同创建的；当几个人在一个补丁上工
-作时，它用于将属性赋予共同作者（除了 From: 所赋予的作者之外）。因为
-Co-developed-by: 表示作者身份，所以每个共同开发人：必须紧跟在相关合作作者的
-签名之后。标准的签核程序要求：标记的签核顺序应尽可能反映补丁的时间历史，而不
-管作者是通过 From ：还是由 Co-developed-by: 共同开发的。值得注意的是，最后一
-个签字人：必须始终是提交补丁的开发人员。
+作时，它用于给出共同作者（除了From:所给出的作者之外）。因为Co-developed-by:
+表示作者身份，所以每个Co-developed-by:必须紧跟在相关合作作者的签署之后。标准
+签署程序要求Singed-off-by:标签的顺序应尽可能反映补丁的时间历史，无论作者是通
+过From:还是Co-developed-by:表明。值得注意的是，最后一个Singed-off-by:必须是
+提交补丁的开发人员。
 
-注意，当作者也是电子邮件标题“发件人：”行中列出的人时，“From: ” 标记是可选的。
+注意，如果From:作者也是电子邮件标题的From:行中列出的人，则From:标签是可选的。
 
-作者提交的补丁程序示例::
+被From:作者提交的补丁示例::
 
 	<changelog>
 
@@ -421,7 +372,7 @@ Co-developed-by: 表示作者身份，所以每个共同开发人：必须紧跟
 	Signed-off-by: Second Co-Author <second@coauthor.example.org>
 	Signed-off-by: From Author <from@author.example.org>
 
-合作开发者提交的补丁示例::
+被合作开发者提交的补丁示例::
 
 	From: From Author <from@author.example.org>
 
@@ -434,76 +385,85 @@ Co-developed-by: 表示作者身份，所以每个共同开发人：必须紧跟
 	Signed-off-by: Submitting Co-Author <sub@coauthor.example.org>
 
 
-13）使用报告人：、测试人：、审核人：、建议人：、修复人：
---------------------------------------------------------
+使用Reported-by:、Tested-by:、Reviewed-by:、Suggested-by:和Fixes:
+-----------------------------------------------------------------
 
 Reported-by: 给那些发现错误并报告错误的人致谢，它希望激励他们在将来再次帮助
-我们。请注意，如果bug是以私有方式报告的，那么在使用Reported-by标记之前，请
-先请求权限。
+我们。请注意，如果bug是以私有方式报告的，那么在使用Reported-by标签之前，请
+先请求许可。此标签是为Bug设计的；请不要将其用于感谢功能请求。
 
-Tested-by: 标记表示补丁已由指定的人（在某些环境中）成功测试。这个标签通知
-维护人员已经执行了一些测试，为将来的补丁提供了一种定位测试人员的方法，并确
-保测试人员的信誉。
+Tested-by: 标签表示补丁已由指定的人（在某些环境中）成功测试。这个标签通知
+维护人员已经执行了一些测试，为将来的补丁提供了一种定位测试人员的方法，并彰显测试人员的功劳。
 
-Reviewed-by：相反，根据审查人的声明，表明该补丁已被审查并被认为是可接受的：
+Reviewed-by：根据审阅者的监督声明，表明该补丁已被审阅并被认为是可接受的：
 
 
-审查人的监督声明
+审阅者的监督声明
 ^^^^^^^^^^^^^^^^
 
-通过提供我的 Reviewed-by，我声明：
+通过提供我的Reviewed-by:标签，我声明：
 
-        (a) 我已经对这个补丁进行了一次技术审查，以评估它是否适合被包含到
+        (a) 我已经对这个补丁进行了一次技术审阅，以评估它是否适合被包含到
             主线内核中。
 
         (b) 与补丁相关的任何问题、顾虑或问题都已反馈给提交者。我对提交者对
             我的评论的回应感到满意。
 
-        (c) 虽然这一提交可能会改进一些东西，但我相信，此时，（1）对内核
+        (c) 虽然这一提交可能仍可被改进，但我相信，此时，（1）对内核
             进行了有价值的修改，（2）没有包含争论中涉及的已知问题。
 
-        (d) 虽然我已经审查了补丁并认为它是健全的，但我不会（除非另有明确
-            说明）作出任何保证或保证它将在任何给定情况下实现其规定的目的
+        (d) 虽然我已经审阅了补丁并认为它是健全的，但我不会（除非另有明确
+            说明）作出任何保证或担保它会在任何给定情况下实现其规定的目的
             或正常运行。
 
-Reviewed-by 是一种观点声明，即补丁是对内核的适当修改，没有任何遗留的严重技术
-问题。任何感兴趣的审阅者（完成工作的人）都可以为一个补丁提供一个 Review-by
-标签。此标签用于向审阅者提供致谢，并通知维护者已在修补程序上完成的审阅程度。
-Reviewed-by: 当由已知了解主题区域并执行彻底检查的审阅者提供时，通常会增加
+Reviewed-by是一种观点声明，即补丁是对内核的适当修改，没有任何遗留的严重技术
+问题。任何感兴趣的审阅者（完成工作的人）都可以为一个补丁提供一个Reviewed-by
+标签。此标签用于向审阅者提供致谢，并通知维护者补丁的审阅进度。
+当Reviewed-by:标签由已知了解主题区域并执行彻底检查的审阅者提供时，通常会增加
 补丁进入内核的可能性。
 
+一旦从测试人员或审阅者的“Tested-by”和“Reviewed-by”标签出现在邮件列表中，
+作者应在发送下一个版本时将其添加到适用的补丁中。但是，如果补丁在以下版本中发
+生了实质性更改，这些标签可能不再适用，因此应该删除。通常，在补丁更改日志中
+（在 ``---`` 分隔符之后）应该提到删除某人的测试者或审阅者标签。
+
 Suggested-by: 表示补丁的想法是由指定的人提出的，并确保将此想法归功于指定的
 人。请注意，未经许可，不得添加此标签，特别是如果该想法未在公共论坛上发布。
-这就是说，如果我们勤快地致谢我们的创意者，他们很有希望在未来得到鼓舞，再次
+也就是说，如果我们勤快地致谢创意提供者，他们将受到鼓舞，很有希望在未来再次
 帮助我们。
 
-Fixes: 指示补丁在以前的提交中修复了一个问题。它可以很容易地确定错误的来源，
-这有助于检查错误修复。这个标记还帮助稳定内核团队确定应该接收修复的稳定内核
-版本。这是指示补丁修复的错误的首选方法。请参阅 :ref:`cn_describe_changes`
-描述您的更改以了解更多详细信息。
+Fixes: 指示补丁修复了之前提交的一个问题。它可以便于确定错误的来源，这有助于
+检查错误修复。这个标签还帮助稳定内核团队确定应该接收修复的稳定内核版本。这是
+指示补丁修复的错误的首选方法。请参阅 :ref:`zh_describe_changes` 了解更多信息。
 
-.. _cn_the_canonical_patch_format:
+.. note::
 
-12）标准补丁格式
-----------------
+  附加Fixes:标签不会改变稳定内核规则流程，也不改变所有稳定版补丁抄送
+  stable@vger.kernel.org的要求。有关更多信息，请阅读
+  Documentation/translations/zh_CN/process/stable-kernel-rules.rst 。
+
+.. _zh_the_canonical_patch_format:
+
+标准补丁格式
+------------
 
 本节描述如何格式化补丁本身。请注意，如果您的补丁存储在 ``Git`` 存储库中，则
-可以使用 ``git format-patch`` 进行正确的补丁格式设置。但是，这些工具无法创建
+可以使用 ``git format-patch`` 进行正确的补丁格式化。但是，这些工具无法创建
 必要的文本，因此请务必阅读下面的说明。
 
-标准的补丁，标题行是::
+标准的补丁标题行是::
 
     Subject: [PATCH 001/123] 子系统:一句话概述
 
-标准补丁的信体存在如下部分：
+标准补丁的信体包含如下部分：
 
-  - 一个 "from" 行指出补丁作者。后跟空行（仅当发送修补程序的人不是作者时才需要）。
+  - 一个 ``from`` 行指出补丁作者。后跟空行（仅当发送补丁的人不是作者时才需要）。
 
-  - 解释的正文，行以75列包装，这将被复制到永久变更日志来描述这个补丁。
+  - 说明文字，每行最长75列，这将被复制到永久变更日志来描述这个补丁。
 
   - 一个空行
 
-  - 上面描述的“Signed-off-by” 行，也将出现在更改日志中。
+  - 上述的 ``Signed-off-by:`` 行，也将出现在更改日志中。
 
   - 只包含 ``---`` 的标记线。
 
@@ -511,29 +471,29 @@ Fixes: 指示补丁在以前的提交中修复了一个问题。它可以很容
 
   - 实际补丁（ ``diff`` 输出）。
 
-标题行的格式，使得对标题行按字母序排序非常的容易 - 很多 e-mail 客户端都
-可以支持 - 因为序列号是用零填充的，所以按数字排序和按字母排序是一样的。
+标题行的格式，使得对标题行按字母序排序非常的容易——很多邮件客户端都
+可以支持——因为序列号是用零填充的，所以按数字排序和按字母排序是一样的。
 
-e-mail 标题中的“子系统”标识哪个内核子系统将被打补丁。
+邮件标题中的“子系统”标识哪个内核子系统将被打补丁。
 
-e-mail 标题中的“一句话概述”扼要的描述 e-mail 中的补丁。“一句话概述”
+邮件标题中的“一句话概述”扼要的描述邮件中的补丁。“一句话概述”
 不应该是一个文件名。对于一个补丁系列（“补丁系列”指一系列的多个相关补
 丁），不要对每个补丁都使用同样的“一句话概述”。
 
-记住 e-mail 的“一句话概述”会成为该补丁的全局唯一标识。它会蔓延到 git
+记住邮件的“一句话概述”会成为该补丁的全局唯一标识。它会进入 ``git``
 的改动记录里。然后“一句话概述”会被用在开发者的讨论里，用来指代这个补
-丁。用户将希望通过 google 来搜索"一句话概述"来找到那些讨论这个补丁的文
+丁。用户将希望通过搜索引擎搜索“一句话概述”来找到那些讨论这个补丁的文
 章。当人们在两三个月后使用诸如 ``gitk`` 或 ``git log --oneline`` 之类
 的工具查看数千个补丁时，也会很快看到它。
 
 出于这些原因，概述必须不超过70-75个字符，并且必须描述补丁的更改以及为
-什么需要补丁。既要简洁又要描述性很有挑战性，但写得好的概述应该这样做。
+什么需要补丁。既要简洁又要描述性很有挑战性，但写得好的概述应该这样。
 
 概述的前缀可以用方括号括起来：“Subject: [PATCH <tag>...] <概述>”。标记
 不被视为概述的一部分，而是描述应该如何处理补丁。如果补丁的多个版本已发
-送出来以响应评审（即“v1，v2，v3”）或“rfc”，以指示评审请求，那么通用标记
-可能包括版本描述符。如果一个补丁系列中有四个补丁，那么各个补丁可以这样
-编号：1/4、2/4、3/4、4/4。这可以确保开发人员了解补丁应用的顺序，并且他们
+送出来以响应评审（即“v1，v2，v3”）则必须包含版本号，或包含“RFC”以指示
+评审请求。如果一个补丁系列中有四个补丁，那么各个补丁可以这样编号：1/4、2/4、
+3/4、4/4。这可以确保开发人员了解补丁应用的顺序，且
 已经查看或应用了补丁系列中的所有补丁。
 
 一些标题的例子::
@@ -541,95 +501,134 @@ e-mail 标题中的“一句话概述”扼要的描述 e-mail 中的补丁。
     Subject: [patch 2/5] ext2: improve scalability of bitmap searching
     Subject: [PATCHv2 001/207] x86: fix eflags tracking
 
-"From" 行是信体里的最上面一行，具有如下格式：
+``From`` 行是信体里的最上面一行，具有如下格式::
+
         From: Patch Author <author@example.com>
 
-"From" 行指明在永久改动日志里，谁会被确认为作者。如果没有 "From" 行，那
-么邮件头里的 "From: " 行会被用来决定改动日志中的作者。
+``From`` 行指明在永久改动日志里，谁会被确认为作者。如果没有 ``From`` 行，那
+么邮件头里的 ``From:`` 行会被用来决定改动日志中的作者。
 
-说明的主题将会被提交到永久的源代码改动日志里，因此对那些早已经不记得和
-这个补丁相关的讨论细节的有能力的读者来说，是有意义的。包括补丁程序定位
-错误的（内核日志消息、OOPS消息等）症状，对于搜索提交日志以寻找适用补丁的人
-尤其有用。如果一个补丁修复了一个编译失败，那么可能不需要包含所有编译失败；
+说明文字将会被提交到永久的源代码改动日志里，因此应针对那些早已经不记得和这
+个补丁相关的讨论细节的读者。包括补丁处理的故障症状（内核日志消息、oops消息
+等），这对于可能正在搜索提交日志以查找适用补丁的人特别有用。文本应该写得如
+此详细，以便在数周、数月甚至数年后阅读时，能够为读者提供所需的细节信息，以
+掌握创建补丁的 **原因** 。
+
+如果一个补丁修复了一个编译失败，那么可能不需要包含 *所有* 编译失败；
 只要足够让搜索补丁的人能够找到它就行了。与概述一样，既要简洁又要描述性。
 
-"---" 标记行对于补丁处理工具要找到哪里是改动日志信息的结束，是不可缺少
+``---`` 标记行对于补丁处理工具要找到哪里是改动日志信息的结束，是不可缺少
 的。
 
-对于 "---" 标记之后的额外注解，一个好的用途就是用来写 diffstat，用来显
-示修改了什么文件和每个文件都增加和删除了多少行。diffstat 对于比较大的补
-丁特别有用。其余那些只是和时刻或者开发者相关的注解，不合适放到永久的改
-动日志里的，也应该放这里。
-使用 diffstat的选项 "-p 1 -w 70" 这样文件名就会从内核源代码树的目录开始
+对于 ``---`` 标记之后的额外注解，一个好的用途就是用来写 ``diffstat`` ，用来显
+示修改了什么文件和每个文件都增加和删除了多少行。 ``diffstat`` 对于比较大的补
+丁特别有用。
+使用 ``diffstat`` 的选项 ``-p 1 -w 70`` 这样文件名就会从内核源代码树的目录开始
 ，不会占用太宽的空间（很容易适合80列的宽度，也许会有一些缩进。）
+（ ``git`` 默认会生成合适的diffstat。）
+
+其余那些只适用于当时或者与维护者相关的注解，不合适放到永久的改动日志里的，也
+应该放这里。较好的例子就是 ``补丁更改记录`` ，记录了v1和v2版本补丁之间的差异。
+
+请将此信息放在将变更日志与补丁的其余部分分隔开的 ``---`` 行 **之后** 。版本
+信息不是提交到git树的变更日志的一部分。只是供审阅人员使用的附加信息。如果将
+其放置在提交标记上方，则需要手动交互才能将其删除。如果它位于分隔线以下，则在
+应用补丁时会自动剥离::
+
+  <commit message>
+  ...
+  Signed-off-by: Author <author@mail>
+  ---
+  V2 -> V3: Removed redundant helper function
+  V1 -> V2: Cleaned up coding style and addressed review comments
 
-在后面的参考资料中能看到适当的补丁格式的更多细节。
+  path/to/file | 5+++--
+  ...
 
-.. _cn_explicit_in_reply_to:
+在后面的参考资料中能看到正确补丁格式的更多细节。
 
-15) 明确回复邮件头(In-Reply-To)
--------------------------------
+.. _zh_backtraces:
 
-手动添加回复补丁的的标题头(In-Reply_To:) 是有帮助的（例如，使用 ``git send-email`` ）
-将补丁与以前的相关讨论关联起来，例如，将bug修复程序链接到电子邮件和bug报告。
+提交消息中的回溯（Backtraces）
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+回溯有助于记录导致问题的调用链。然而，并非所有回溯都有帮助。例如，早期引导调
+用链是独特而明显的。而逐字复制完整的dmesg输出则会增加时间戳、模块列表、寄存
+器和堆栈转储等分散注意力的信息。
+
+因此，最有用的回溯应该从转储中提取相关信息，以更容易集中在真实问题上。下面是
+一个剪裁良好的回溯示例::
+
+  unchecked MSR access error: WRMSR to 0xd51 (tried to write 0x0000000000000064)
+  at rIP: 0xffffffffae059994 (native_write_msr+0x4/0x20)
+  Call Trace:
+  mba_wrmsr
+  update_domains
+  rdtgroup_mkdir
+
+.. _zh_explicit_in_reply_to:
+
+明确回复邮件头（In-Reply-To）
+-----------------------------
+
+手动添加回复补丁的的邮件头（In-Reply_To:）是有用的（例如，使用 ``git send-email`` ），
+可以将补丁与以前的相关讨论关联起来，例如，将bug补丁链接到电子邮件和bug报告。
 但是，对于多补丁系列，最好避免在回复时使用链接到该系列的旧版本。这样，
-补丁的多个版本就不会成为电子邮件客户端中无法管理的引用序列。如果链接有用，
+补丁的多个版本就不会成为电子邮件客户端中无法管理的引用树。如果链接有用，
 可以使用 https://lore.kernel.org/ 重定向器（例如，在封面电子邮件文本中）
 链接到补丁系列的早期版本。
 
-16) 发送git pull请求
---------------------
-
-如果您有一系列补丁，那么让维护人员通过git pull操作将它们直接拉入子系统存储
-库可能是最方便的。但是，请注意，从开发人员那里获取补丁比从邮件列表中获取补
-丁需要更高的信任度。因此，许多子系统维护人员不愿意接受请求，特别是来自新的
-未知开发人员的请求。如果有疑问，您可以在封面邮件中使用pull 请求作为补丁系列
-正常发布的一个选项，让维护人员可以选择使用其中之一。
-
-pull 请求的主题行中应该有[Git Pull]。请求本身应该在一行中包含存储库名称和
-感兴趣的分支；它应该看起来像::
+给出基础树信息
+--------------
 
-  Please pull from
+当其他开发人员收到您的补丁并开始审阅时，知道应该将您的工作放到代码树历史记录
+中的什么位置通常很有用。这对于自动化持续集成流水（CI）特别有用，这些流水线试
+图运行一系列测试，以便在维护人员开始审阅之前确定提交的质量。
 
-      git://jdelvare.pck.nerim.net/jdelvare-2.6 i2c-for-linus
+如果您使用 ``git format-patch`` 生成补丁，则可以通过 ``--base`` 标志在提交中
+自动包含基础树信息。使用此选项最简单、最方便的方法是配合主题分支::
 
-  to get these changes:
+    $ git checkout -t -b my-topical-branch master
+    Branch 'my-topical-branch' set up to track local branch 'master'.
+    Switched to a new branch 'my-topical-branch'
 
+    [perform your edits and commits]
 
-pull 请求还应该包含一条整体消息，说明请求中将包含什么，一个补丁本身的 ``Git shortlog``
-以及一个显示补丁系列整体效果的 ``diffstat`` 。当然，将所有这些信息收集在一起
-的最简单方法是让 ``git`` 使用 ``git request-pull`` 命令为您完成这些工作。
+    $ git format-patch --base=auto --cover-letter -o outgoing/ master
+    outgoing/0000-cover-letter.patch
+    outgoing/0001-First-Commit.patch
+    outgoing/...
 
-一些维护人员（包括Linus）希望看到来自已签名提交的请求；这增加了他们对你的
-请求信心。特别是，在没有签名标签的情况下，Linus 不会从像 Github 这样的公共
-托管站点拉请求。
+当你编辑 ``outgoing/0000-cover-letter.patch`` 时，您会注意到在它的最底部有一
+行 ``base-commit:`` 尾注，它为审阅者和CI工具提供了足够的信息以正确执行
+``git am`` 而不必担心冲突::
 
-创建此类签名的第一步是生成一个 GNRPG 密钥，并由一个或多个核心内核开发人员对
-其进行签名。这一步对新开发人员来说可能很困难，但没有办法绕过它。参加会议是
-找到可以签署您的密钥的开发人员的好方法。
+    $ git checkout -b patch-review [base-commit-id]
+    Switched to a new branch 'patch-review'
+    $ git am patches.mbox
+    Applying: First Commit
+    Applying: ...
 
-一旦您在Git 中准备了一个您希望有人拉的补丁系列，就用 ``git tag -s`` 创建一
-个签名标记。这将创建一个新标记，标识该系列中的最后一次提交，并包含用您的私
-钥创建的签名。您还可以将changelog样式的消息添加到标记中；这是一个描述拉请求
-整体效果的理想位置。
+有关此选项的更多信息，请参阅 ``man git-format-patch`` 。
 
-如果维护人员将要从中提取的树不是您正在使用的存储库，请不要忘记将已签名的标记
-显式推送到公共树。
+.. note::
 
-生成拉请求时，请使用已签名的标记作为目标。这样的命令可以实现::
+    ``--base`` 功能是在2.9.0版git中引入的。
 
-  git request-pull master git://my.public.tree/linux.git my-signed-tag
+如果您不使用git格式化补丁，仍然可以包含相同的 ``base-commit`` 尾注，以指示您
+的工作所基于的树的提交哈希。你应该在封面邮件或系列的第一个补丁中添加它，它应
+该放在 ``---`` 行的下面或所有其他内容之后，即只在你的电子邮件签名之前。
 
 参考文献
 --------
 
-Andrew Morton, "The perfect patch" (tpp).
+Andrew Morton，“完美的补丁”（tpp）
   <https://www.ozlabs.org/~akpm/stuff/tpp.txt>
 
-Jeff Garzik, "Linux kernel patch submission format".
+Jeff Garzik，“Linux内核补丁提交格式”
   <https://web.archive.org/web/20180829112450/http://linux.yyz.us/patch-format.html>
 
-Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer".
+Greg Kroah-Hartman，“如何惹恼内核子系统维护人员”
   <http://www.kroah.com/log/linux/maintainer.html>
 
   <http://www.kroah.com/log/linux/maintainer-02.html>
@@ -642,16 +641,15 @@ Greg Kroah-Hartman, "How to piss off a kernel subsystem maintainer".
 
   <http://www.kroah.com/log/linux/maintainer-06.html>
 
-NO!!!! No more huge patch bombs to linux-kernel@vger.kernel.org people!
+不！！！别再发巨型补丁炸弹给linux-kernel@vger.kernel.org的人们了！
   <https://lore.kernel.org/r/20050711.125305.08322243.davem@davemloft.net>
 
-Kernel Documentation/process/coding-style.rst:
-  :ref:`Documentation/translations/zh_CN/process/coding-style.rst <cn_codingstyle>`
+内核 Documentation/translations/zh_CN/process/coding-style.rst
 
-Linus Torvalds's mail on the canonical patch format:
+Linus Torvalds关于标准补丁格式的邮件
   <https://lore.kernel.org/r/Pine.LNX.4.58.0504071023190.28951@ppc970.osdl.org>
 
-Andi Kleen, "On submitting kernel patches"
-  Some strategies to get difficult or controversial changes in.
+Andi Kleen，“提交补丁之路”
+  一些帮助合入困难或有争议的变更的策略。
 
   http://halobates.de/on-submitting-patches.pdf
-- 
cgit v1.2.3


From 78b07714c4ef4f7faeb4e7e57ed0530986ba8c88 Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Fri, 19 Aug 2022 09:09:42 +0800
Subject: docs/zh_CN: Fix two missing labels in zh_CN/process

* Add back still referenced labels in submitting-patches.rst and
  email-clients.rst.
* Fix a typo.

Fixes: fdb34b18b959 ("docs/zh_CN: Update zh_CN/process/submitting-patches.rst to 5.19")
Fixes: d7aeaebb920f ("docs/zh_CN: Update zh_CN/process/email-clients.rst to 5.19")
Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Link: https://lore.kernel.org/r/Yv7i1tYMvK9J/NHj@bobwxc.mipc
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/translations/zh_CN/process/email-clients.rst      | 2 ++
 Documentation/translations/zh_CN/process/submitting-patches.rst | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/translations/zh_CN/process/email-clients.rst b/Documentation/translations/zh_CN/process/email-clients.rst
index fa113f071855..34d51cdadc7b 100644
--- a/Documentation/translations/zh_CN/process/email-clients.rst
+++ b/Documentation/translations/zh_CN/process/email-clients.rst
@@ -2,6 +2,8 @@
 
 .. include:: ../disclaimer-zh_CN.rst
 
+.. _cn_email_clients:
+
 :Original: Documentation/process/email-clients.rst
 
 :译者:
diff --git a/Documentation/translations/zh_CN/process/submitting-patches.rst b/Documentation/translations/zh_CN/process/submitting-patches.rst
index 26565abcbd8d..f8978f02057c 100644
--- a/Documentation/translations/zh_CN/process/submitting-patches.rst
+++ b/Documentation/translations/zh_CN/process/submitting-patches.rst
@@ -2,6 +2,8 @@
 
 .. include:: ../disclaimer-zh_CN.rst
 
+.. _cn_submittingpatches:
+
 :Original: Documentation/process/submitting-patches.rst
 
 :译者:
@@ -100,7 +102,7 @@ xyzzy do frotz”或“[I]changed xyzzy to do frotz”，就好像你在命令
 你的补丁修复了一个缺陷，需要添加一个带有URL的标签指向邮件列表存档或缺陷跟踪器
 的相关报告；如果该补丁是由一些早先邮件列表讨论或网络上的记录引起的，请指向它。
 
-当链接到邮件列表存档是，请首选lore.kernel.org邮件存档服务。用邮件中的
+当链接到邮件列表存档时，请首选lore.kernel.org邮件存档服务。用邮件中的
 ``Message-ID`` 头（去掉尖括号）可以创建链接URL。例如::
 
     Link: https://lore.kernel.org/r/30th.anniversary.repost@klaava.Helsinki.FI/
-- 
cgit v1.2.3


From 021904f38baed568bc12b017386740b7a4cff1df Mon Sep 17 00:00:00 2001
From: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Date: Mon, 22 Aug 2022 21:36:53 -0400
Subject: Documentation: filesystems: xfs: update pseudocode and typo fixes

According to the implementation of xfs_trans_roll(), it calls
xfs_trans_reserve(), which reserves not only log space, but also
free disk blocks. In short, the "transaction stuff". So change
xfs_log_reserve() to xfs_trans_reserve().

Besides, fix several typo issues.

Signed-off-by: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20220823013653.203469-1-zhaomzhao@126.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/filesystems/xfs-delayed-logging-design.rst | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs-delayed-logging-design.rst
index 4ef419f54663..02b32030bab3 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.rst
+++ b/Documentation/filesystems/xfs-delayed-logging-design.rst
@@ -100,7 +100,7 @@ transactions together::
 
 	ntp = xfs_trans_dup(tp);
 	xfs_trans_commit(tp);
-	xfs_log_reserve(ntp);
+	xfs_trans_reserve(ntp);
 
 This results in a series of "rolling transactions" where the inode is locked
 across the entire chain of transactions.  Hence while this series of rolling
@@ -191,7 +191,7 @@ transaction rolling mechanism to re-reserve space on every transaction roll. We
 know from the implementation of the permanent transactions how many transaction
 rolls are likely for the common modifications that need to be made.
 
-For example, and inode allocation is typically two transactions - one to
+For example, an inode allocation is typically two transactions - one to
 physically allocate a free inode chunk on disk, and another to allocate an inode
 from an inode chunk that has free inodes in it.  Hence for an inode allocation
 transaction, we might set the reservation log count to a value of 2 to indicate
@@ -200,7 +200,7 @@ chain. Each time a permanent transaction rolls, it consumes an entire unit
 reservation.
 
 Hence when the permanent transaction is first allocated, the log space
-reservation is increases from a single unit reservation to multiple unit
+reservation is increased from a single unit reservation to multiple unit
 reservations. That multiple is defined by the reservation log count, and this
 means we can roll the transaction multiple times before we have to re-reserve
 log space when we roll the transaction. This ensures that the common
@@ -259,7 +259,7 @@ the next transaction in the sequeunce, but we have none remaining. We cannot
 sleep during the transaction commit process waiting for new log space to become
 available, as we may end up on the end of the FIFO queue and the items we have
 locked while we sleep could end up pinning the tail of the log before there is
-enough free space in the log to fulfil all of the pending reservations and
+enough free space in the log to fulfill all of the pending reservations and
 then wake up transaction commit in progress.
 
 To take a new reservation without sleeping requires us to be able to take a
@@ -615,7 +615,7 @@ those changes into the current checkpoint context. We then initialise a new
 context and attach that to the CIL for aggregation of new transactions.
 
 This allows us to unlock the CIL immediately after transfer of all the
-committed items and effectively allow new transactions to be issued while we
+committed items and effectively allows new transactions to be issued while we
 are formatting the checkpoint into the log. It also allows concurrent
 checkpoints to be written into the log buffers in the case of log force heavy
 workloads, just like the existing transaction commit code does. This, however,
@@ -886,7 +886,7 @@ can be multiple outstanding checkpoint contexts, we can still see elevated pin
 counts, but as each checkpoint completes the pin count will retain the correct
 value according to it's context.
 
-Just to make matters more slightly more complex, this checkpoint level context
+Just to make matters slightly more complex, this checkpoint level context
 for the pin count means that the pinning of an item must take place under the
 CIL commit/flush lock. If we pin the object outside this lock, we cannot
 guarantee which context the pin count is associated with. This is because of
-- 
cgit v1.2.3


From 6cf3116d5eeda0284bbedb8610192df31f86cdf0 Mon Sep 17 00:00:00 2001
From: JunChao Sun <sunjunchao2870@gmail.com>
Date: Mon, 15 Aug 2022 05:52:33 -0700
Subject: Documentation: ext4: correct the document about superblock

The description of s_lastcheck_hi, s_first_error_time_hi, and
s_last_error_time_hi fields refer to themselves, while these means
referring to upper 8 bits (byte) of corresponding fields (s_lastcheck,
s_first_error_time, and s_last_error_time). Correct the mistake.

Signed-off-by: JunChao Sun <sunjunchao2870@gmail.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Link: https://lore.kernel.org/r/20220815125233.2040-1-sunjunchao2870@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/filesystems/ext4/super.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/filesystems/ext4/super.rst b/Documentation/filesystems/ext4/super.rst
index 268888522e35..0152888cac29 100644
--- a/Documentation/filesystems/ext4/super.rst
+++ b/Documentation/filesystems/ext4/super.rst
@@ -456,15 +456,15 @@ The ext4 superblock is laid out as follows in
    * - 0x277
      - __u8
      - s_lastcheck_hi
-     - Upper 8 bits of the s_lastcheck_hi field.
+     - Upper 8 bits of the s_lastcheck field.
    * - 0x278
      - __u8
      - s_first_error_time_hi
-     - Upper 8 bits of the s_first_error_time_hi field.
+     - Upper 8 bits of the s_first_error_time field.
    * - 0x279
      - __u8
      - s_last_error_time_hi
-     - Upper 8 bits of the s_last_error_time_hi field.
+     - Upper 8 bits of the s_last_error_time field.
    * - 0x27A
      - __u8
      - s_pad[2]
-- 
cgit v1.2.3


From 3d8b56d7b10e1ca972330d8892da3f3e9a6346f2 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Thu, 25 Aug 2022 20:53:25 +0800
Subject: docs/zh_CN: Update the translation of io_ordering to 6.0-rc2

Update to commit d1ce350015d8 Documentation: ("Add
io_ordering.rst to driver-api manual").
Move ../zh_CN/io_ordering.txt to ../zh_CN/driver-api/io_ordering.rst.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Link: https://lore.kernel.org/r/c66f6d17c509c2c93f2afd30223c4bcf734f8317.1661431365.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/driver-api/index.rst        |  2 +-
 .../translations/zh_CN/driver-api/io_ordering.rst  | 60 +++++++++++++++++++
 Documentation/translations/zh_CN/io_ordering.txt   | 67 ----------------------
 3 files changed, 61 insertions(+), 68 deletions(-)
 create mode 100644 Documentation/translations/zh_CN/driver-api/io_ordering.rst
 delete mode 100644 Documentation/translations/zh_CN/io_ordering.txt

diff --git a/Documentation/translations/zh_CN/driver-api/index.rst b/Documentation/translations/zh_CN/driver-api/index.rst
index 24eb2198e5f1..ba354e1f4e6d 100644
--- a/Documentation/translations/zh_CN/driver-api/index.rst
+++ b/Documentation/translations/zh_CN/driver-api/index.rst
@@ -25,6 +25,7 @@ Linux驱动实现者的API指南
    :maxdepth: 2
 
    gpio/index
+   io_ordering
 
 Todolist:
 
@@ -97,7 +98,6 @@ Todolist:
 *   isa
 *   isapnp
 *   io-mapping
-*   io_ordering
 *   generic-counter
 *   memory-devices/index
 *   men-chameleon-bus
diff --git a/Documentation/translations/zh_CN/driver-api/io_ordering.rst b/Documentation/translations/zh_CN/driver-api/io_ordering.rst
new file mode 100644
index 000000000000..4dbfa4ce92a0
--- /dev/null
+++ b/Documentation/translations/zh_CN/driver-api/io_ordering.rst
@@ -0,0 +1,60 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/driver-api/io_ordering.rst
+
+:翻译:
+
+ 林永听 Lin Yongting <linyongting@gmail.com>
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+===========================
+对内存映射地址的I/O写入排序
+===========================
+
+在某些平台上，所谓的内存映射I/O是弱顺序。在这些平台上，驱动开发者有责任
+保证I/O内存映射地址的写操作按程序图意的顺序达到设备。通常读取一个“安全”
+设备寄存器或桥寄存器，触发IO芯片清刷未处理的写操作到达设备后才处理读操作，
+而达到保证目的。驱动程序通常在spinlock保护的临界区退出之前使用这种技术。
+这也可以保证后面的写操作只在前面的写操作之后到达设备（这非常类似于内存
+屏障操作，mb()，不过仅适用于I/O）。
+
+假设一个设备驱动程的具体例子::
+
+                ...
+        CPU A:  spin_lock_irqsave(&dev_lock, flags)
+        CPU A:  val = readl(my_status);
+        CPU A:  ...
+        CPU A:  writel(newval, ring_ptr);
+        CPU A:  spin_unlock_irqrestore(&dev_lock, flags)
+                ...
+        CPU B:  spin_lock_irqsave(&dev_lock, flags)
+        CPU B:  val = readl(my_status);
+        CPU B:  ...
+        CPU B:  writel(newval2, ring_ptr);
+        CPU B:  spin_unlock_irqrestore(&dev_lock, flags)
+                ...
+
+上述例子中，设备可能会先接收到newval2的值，然后接收到newval的值，问题就
+发生了。不过很容易通过下面方法来修复::
+
+                ...
+        CPU A:  spin_lock_irqsave(&dev_lock, flags)
+        CPU A:  val = readl(my_status);
+        CPU A:  ...
+        CPU A:  writel(newval, ring_ptr);
+        CPU A:  (void)readl(safe_register); /* 配置寄存器？*/
+        CPU A:  spin_unlock_irqrestore(&dev_lock, flags)
+                ...
+        CPU B:  spin_lock_irqsave(&dev_lock, flags)
+        CPU B:  val = readl(my_status);
+        CPU B:  ...
+        CPU B:  writel(newval2, ring_ptr);
+        CPU B:  (void)readl(safe_register); /* 配置寄存器？*/
+        CPU B:  spin_unlock_irqrestore(&dev_lock, flags)
+
+在解决方案中，读取safe_register寄存器，触发IO芯片清刷未处理的写操作，
+再处理后面的读操作，防止引发数据不一致问题。
diff --git a/Documentation/translations/zh_CN/io_ordering.txt b/Documentation/translations/zh_CN/io_ordering.txt
deleted file mode 100644
index 7bb3086227ae..000000000000
--- a/Documentation/translations/zh_CN/io_ordering.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-Chinese translated version of Documentation/driver-api/io_ordering.rst
-
-If you have any comment or update to the content, please contact the
-original document maintainer directly.  However, if you have a problem
-communicating in English you can also ask the Chinese maintainer for
-help.  Contact the Chinese maintainer if this translation is outdated
-or if there is a problem with the translation.
-
-Chinese maintainer: Lin Yongting <linyongting@gmail.com>
----------------------------------------------------------------------
-Documentation/driver-api/io_ordering.rst 的中文翻译
-
-如果想评论或更新本文的内容，请直接联系原文档的维护者。如果你使用英文
-交流有困难的话，也可以向中文版维护者求助。如果本翻译更新不及时或者翻
-译存在问题，请联系中文版维护者。
-
-中文版维护者： 林永听 Lin Yongting <linyongting@gmail.com>
-中文版翻译者： 林永听 Lin Yongting <linyongting@gmail.com>
-中文版校译者： 林永听 Lin Yongting <linyongting@gmail.com>
-
-
-以下为正文
----------------------------------------------------------------------
-
-在某些平台上，所谓的内存映射I/O是弱顺序。在这些平台上，驱动开发者有责任
-保证I/O内存映射地址的写操作按程序图意的顺序达到设备。通常读取一个“安全”
-设备寄存器或桥寄存器，触发IO芯片清刷未处理的写操作到达设备后才处理读操作，
-而达到保证目的。驱动程序通常在spinlock保护的临界区退出之前使用这种技术。
-这也可以保证后面的写操作只在前面的写操作之后到达设备（这非常类似于内存
-屏障操作，mb()，不过仅适用于I/O）。
-
-假设一个设备驱动程的具体例子：
-
-        ...
-CPU A:  spin_lock_irqsave(&dev_lock, flags)
-CPU A:  val = readl(my_status);
-CPU A:  ...
-CPU A:  writel(newval, ring_ptr);
-CPU A:  spin_unlock_irqrestore(&dev_lock, flags)
-        ...
-CPU B:  spin_lock_irqsave(&dev_lock, flags)
-CPU B:  val = readl(my_status);
-CPU B:  ...
-CPU B:  writel(newval2, ring_ptr);
-CPU B:  spin_unlock_irqrestore(&dev_lock, flags)
-        ...
-
-上述例子中，设备可能会先接收到newval2的值，然后接收到newval的值，问题就
-发生了。不过很容易通过下面方法来修复：
-
-        ...
-CPU A:  spin_lock_irqsave(&dev_lock, flags)
-CPU A:  val = readl(my_status);
-CPU A:  ...
-CPU A:  writel(newval, ring_ptr);
-CPU A:  (void)readl(safe_register); /* 配置寄存器？*/
-CPU A:  spin_unlock_irqrestore(&dev_lock, flags)
-        ...
-CPU B:  spin_lock_irqsave(&dev_lock, flags)
-CPU B:  val = readl(my_status);
-CPU B:  ...
-CPU B:  writel(newval2, ring_ptr);
-CPU B:  (void)readl(safe_register); /* 配置寄存器？*/
-CPU B:  spin_unlock_irqrestore(&dev_lock, flags)
-
-在解决方案中，读取safe_register寄存器，触发IO芯片清刷未处理的写操作，
-再处理后面的读操作，防止引发数据不一致问题。
-- 
cgit v1.2.3


From 876d8552fa7c116d93254c61e11596dbcb33c201 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Thu, 25 Aug 2022 20:53:26 +0800
Subject: docs/zh_CN: Remove IRQ and oops-tracing

The English version of IRQ has been refactored and
the new document (not called that anymore) has been
moved to core-api/irq, which has been translated
into Chinese. oops-tracing is pretty much the same,
let's remove them.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Link: https://lore.kernel.org/r/7dc43c33ea7e2edf668070b203dce83b285f2cdb.1661431365.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/translations/zh_CN/IRQ.txt          |  39 ----
 Documentation/translations/zh_CN/oops-tracing.txt | 212 ----------------------
 2 files changed, 251 deletions(-)
 delete mode 100644 Documentation/translations/zh_CN/IRQ.txt
 delete mode 100644 Documentation/translations/zh_CN/oops-tracing.txt

diff --git a/Documentation/translations/zh_CN/IRQ.txt b/Documentation/translations/zh_CN/IRQ.txt
deleted file mode 100644
index 9aec8dca4fcf..000000000000
--- a/Documentation/translations/zh_CN/IRQ.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Chinese translated version of Documentation/core-api/irq/index.rst
-
-If you have any comment or update to the content, please contact the
-original document maintainer directly.  However, if you have a problem
-communicating in English you can also ask the Chinese maintainer for
-help.  Contact the Chinese maintainer if this translation is outdated
-or if there is a problem with the translation.
-
-Maintainer: Eric W. Biederman <ebiederman@xmission.com>
-Chinese maintainer: Fu Wei <tekkamanninja@gmail.com>
----------------------------------------------------------------------
-Documentation/core-api/irq/index.rst 的中文翻译
-
-如果想评论或更新本文的内容，请直接联系原文档的维护者。如果你使用英文
-交流有困难的话，也可以向中文版维护者求助。如果本翻译更新不及时或者翻
-译存在问题，请联系中文版维护者。
-英文版维护者： Eric W. Biederman <ebiederman@xmission.com>
-中文版维护者： 傅炜 Fu Wei <tekkamanninja@gmail.com>
-中文版翻译者： 傅炜 Fu Wei <tekkamanninja@gmail.com>
-中文版校译者： 傅炜 Fu Wei <tekkamanninja@gmail.com>
-
-
-以下为正文
----------------------------------------------------------------------
-何为 IRQ?
-
-一个 IRQ 是来自某个设备的一个中断请求。目前，它们可以来自一个硬件引脚，
-或来自一个数据包。多个设备可能连接到同个硬件引脚，从而共享一个 IRQ。
-
-一个 IRQ 编号是用于告知硬件中断源的内核标识。通常情况下，这是一个
-全局 irq_desc 数组的索引，但是除了在 linux/interrupt.h 中的实现，
-具体的细节是体系结构特定的。
-
-一个 IRQ 编号是设备上某个可能的中断源的枚举。通常情况下，枚举的编号是
-该引脚在系统内中断控制器的所有输入引脚中的编号。对于 ISA 总线中的情况，
-枚举的是在两个 i8259 中断控制器中 16 个输入引脚。
-
-架构可以对 IRQ 编号指定额外的含义，在硬件涉及任何手工配置的情况下，
-是被提倡的。ISA 的 IRQ 是一个分配这类额外含义的典型例子。
diff --git a/Documentation/translations/zh_CN/oops-tracing.txt b/Documentation/translations/zh_CN/oops-tracing.txt
deleted file mode 100644
index c5f3bda7abcb..000000000000
--- a/Documentation/translations/zh_CN/oops-tracing.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-Chinese translated version of Documentation/admin-guide/bug-hunting.rst
-
-If you have any comment or update to the content, please contact the
-original document maintainer directly.  However, if you have a problem
-communicating in English you can also ask the Chinese maintainer for
-help.  Contact the Chinese maintainer if this translation is outdated
-or if there is a problem with the translation.
-
-Chinese maintainer: Dave Young <hidave.darkstar@gmail.com>
----------------------------------------------------------------------
-Documentation/admin-guide/bug-hunting.rst 的中文翻译
-
-如果想评论或更新本文的内容，请直接联系原文档的维护者。如果你使用英文
-交流有困难的话，也可以向中文版维护者求助。如果本翻译更新不及时或者翻
-译存在问题，请联系中文版维护者。
-
-中文版维护者： 杨瑞 Dave Young <hidave.darkstar@gmail.com>
-中文版翻译者： 杨瑞 Dave Young <hidave.darkstar@gmail.com>
-中文版校译者： 李阳 Li Yang <leoyang.li@nxp.com>
-               王聪 Wang Cong <xiyou.wangcong@gmail.com>
-
-以下为正文
----------------------------------------------------------------------
-
-注意： ksymoops 在2.6中是没有用的。 请以原有格式使用Oops(来自dmesg，等等)。
-忽略任何这样那样关于“解码Oops”或者“通过ksymoops运行”的文档。 如果你贴出运行过
-ksymoops的来自2.6的Oops，人们只会让你重贴一次。
-
-快速总结
--------------
-
-发现Oops并发送给看似相关的内核领域的维护者。别太担心对不上号。如果你不确定就发给
-和你所做的事情相关的代码的负责人。 如果可重现试着描述怎样重构。 那甚至比oops更有
-价值。
-
-如果你对于发送给谁一无所知， 发给linux-kernel@vger.kernel.org。感谢你帮助Linux
-尽可能地稳定。
-
-Oops在哪里?
-----------------------
-
-通常Oops文本由klogd从内核缓冲区里读取并传给syslogd，由syslogd写到syslog文件中，
-典型地是/var/log/messages(依赖于/etc/syslog.conf)。有时klogd崩溃了,这种情况下你
-能够运行dmesg > file来从内核缓冲区中读取数据并保存下来。 否则你可以
-cat /proc/kmsg > file， 然而你必须介入中止传输， kmsg是一个“永不结束的文件”。如
-果机器崩溃坏到你不能输入命令或者磁盘不可用那么你有三种选择:-
-
-（1） 手抄屏幕上的文本待机器重启后再输入计算机。 麻烦但如果没有针对崩溃的准备，
-这是仅有的选择。 另外，你可以用数码相机把屏幕拍下来-不太好，但比没有强。 如果信
-息滚动到了终端的上面，你会发现以高分辩率启动（比如，vga=791）会让你读到更多的文
-本。（注意：这需要vesafb，所以对‘早期’的oops没有帮助）
-
-（2）用串口终端启动（请参看Documentation/admin-guide/serial-console.rst），运行一个null
-modem到另一台机器并用你喜欢的通讯工具获取输出。Minicom工作地很好。
-
-（3）使用Kdump（请参看Documentation/admin-guide/kdump/kdump.rst），
-使用在Documentation/admin-guide/kdump/gdbmacros.txt中定义的dmesg gdb宏，从旧的内存中提取内核
-环形缓冲区。
-
-完整信息
-----------------
-
-注意：以下来自于Linus的邮件适用于2.4内核。 我因为历史原因保留了它，并且因为其中
-一些信息仍然适用。 特别注意的是，请忽略任何ksymoops的引用。
-
-From: Linus Torvalds <torvalds@osdl.org>
-
-怎样跟踪Oops.. [原发到linux-kernel的一封邮件]
-
-主要的窍门是有五年和这些烦人的oops消息打交道的经验;-)
-
-实际上，你有办法使它更简单。我有两个不同的方法：
-
-	gdb /usr/src/linux/vmlinux
-	gdb> disassemble <offending_function>
-
-那是发现问题的简单办法，至少如果bug报告做的好的情况下（象这个一样-运行ksymoops
-得到oops发生的函数及函数内的偏移）。
-
-哦，如果报告发生的内核以相同的编译器和相似的配置编译它会有帮助的。
-
-另一件要做的事是反汇编bug报告的“Code”部分：ksymoops也会用正确的工具来做这件事，
-但如果没有那些工具你可以写一个傻程序：
-
-	char str[] = "\xXX\xXX\xXX...";
-	main(){}
-
-并用gcc -g编译它然后执行“disassemble str”（XX部分是由Oops报告的值-你可以仅剪切
-粘贴并用“\x”替换空格-我就是这么做的，因为我懒得写程序自动做这一切）。
-
-另外，你可以用scripts/decodecode这个shell脚本。它的使用方法是：
-decodecode < oops.txt
-
-“Code”之后的十六进制字节可能（在某些架构上）有一些当前指令之前的指令字节以及
-当前和之后的指令字节
-
-Code: f9 0f 8d f9 00 00 00 8d 42 0c e8 dd 26 11 c7 a1 60 ea 2b f9 8b 50 08 a1
-64 ea 2b f9 8d 34 82 8b 1e 85 db 74 6d 8b 15 60 ea 2b f9 <8b> 43 04 39 42 54
-7e 04 40 89 42 54 8b 43 04 3b 05 00 f6 52 c0
-
-最后，如果你想知道代码来自哪里，你可以：
-
-	cd /usr/src/linux
-	make fs/buffer.s 	# 或任何产生BUG的文件
-
-然后你会比gdb反汇编更清楚的知道发生了什么。
-
-现在，问题是把你所拥有的所有数据结合起来：C源码（关于它应该怎样的一般知识），
-汇编代码及其反汇编得到的代码（另外还有从“oops”消息得到的寄存器状态-对了解毁坏的
-指针有用，而且当你有了汇编代码你也能拿其它的寄存器和任何它们对应的C表达式做匹配
-）。
-
-实际上，你仅需看看哪里不匹配（这个例子是“Code”反汇编和编译器生成的代码不匹配）。
-然后你须要找出为什么不匹配。通常很简单-你看到代码使用了空指针然后你看代码想知道
-空指针是怎么出现的，还有检查它是否合法..
-
-现在，如果明白这是一项耗时的工作而且需要一丁点儿的专心，没错。这就是我为什么大多
-只是忽略那些没有符号表信息的崩溃报告的原因：简单的说太难查找了（我有一些
-程序用于在内核代码段中搜索特定的模式，而且有时我也已经能找出那些崩溃的地方，但是
-仅仅是找出正确的序列也确实需要相当扎实的内核知识）
-
-_有时_会发生这种情况，我仅看到崩溃中的反汇编代码序列， 然后我马上就明白问题出在
-哪里。这时我才意识到自己干这个工作已经太长时间了;-)
-
-		Linus
-
-
----------------------------------------------------------------------------
-关于Oops跟踪的注解：
-
-为了帮助Linus和其它内核开发者，klogd纳入了大量的支持来处理保护错误。为了拥有对
-地址解析的完整支持至少应该使用1.3-pl3的sysklogd包。
-
-当保护错误发生时，klogd守护进程自动把内核日志信息中的重要地址翻译成它们相应的符
-号。
-
-klogd执行两种类型的地址解析。首先是静态翻译其次是动态翻译。静态翻译和ksymoops
-一样使用System.map文件。为了做静态翻译klogd守护进程必须在初始化时能找到system
-map文件。关于klogd怎样搜索map文件请参看klogd手册页。
-
-动态地址翻译在使用内核可装载模块时很重要。 因为内核模块的内存是从内核动态内存池
-里分配的，所以不管是模块开始位置还是模块中函数和符号的位置都不是固定的。
-
-内核支持允许程序决定装载哪些模块和它们在内存中位置的系统调用。使用这些系统调用
-klogd守护进程生成一张符号表用于调试发生在可装载模块中的保护错误。
-
-至少klogd会提供产生保护错误的模块名。还可有额外的符号信息供可装载模块开发者选择
-以从模块中输出符号信息。
-
-因为内核模块环境可能是动态的，所以必须有一种机制当模块环境发生改变时来通知klogd
-守护进程。 有一些可用的命令行选项允许klogd向当前执行中的守护进程发送信号，告知符
-号信息应该被刷新了。 更多信息请参看klogd手册页。
-
-sysklogd发布时包含一个补丁修改了modules-2.0.0包，无论何时一个模块装载或者卸载都
-会自动向klogd发送信号。打上这个补丁提供了必要的对调试发生于内核可装载模块的保护
-错误的无缝支持。
-
-以下是被klogd处理过的发生在可装载模块中的一个保护错误例子：
----------------------------------------------------------------------------
-Aug 29 09:51:01 blizard kernel: Unable to handle kernel paging request at virtual address f15e97cc
-Aug 29 09:51:01 blizard kernel: current->tss.cr3 = 0062d000, %cr3 = 0062d000
-Aug 29 09:51:01 blizard kernel: *pde = 00000000
-Aug 29 09:51:01 blizard kernel: Oops: 0002
-Aug 29 09:51:01 blizard kernel: CPU:    0
-Aug 29 09:51:01 blizard kernel: EIP:    0010:[oops:_oops+16/3868]
-Aug 29 09:51:01 blizard kernel: EFLAGS: 00010212
-Aug 29 09:51:01 blizard kernel: eax: 315e97cc   ebx: 003a6f80   ecx: 001be77b   edx: 00237c0c
-Aug 29 09:51:01 blizard kernel: esi: 00000000   edi: bffffdb3   ebp: 00589f90   esp: 00589f8c
-Aug 29 09:51:01 blizard kernel: ds: 0018   es: 0018   fs: 002b   gs: 002b   ss: 0018
-Aug 29 09:51:01 blizard kernel: Process oops_test (pid: 3374, process nr: 21, stackpage=00589000)
-Aug 29 09:51:01 blizard kernel: Stack: 315e97cc 00589f98 0100b0b4 bffffed4 0012e38e 00240c64 003a6f80 00000001
-Aug 29 09:51:01 blizard kernel:        00000000 00237810 bfffff00 0010a7fa 00000003 00000001 00000000 bfffff00
-Aug 29 09:51:01 blizard kernel:        bffffdb3 bffffed4 ffffffda 0000002b 0007002b 0000002b 0000002b 00000036
-Aug 29 09:51:01 blizard kernel: Call Trace: [oops:_oops_ioctl+48/80] [_sys_ioctl+254/272] [_system_call+82/128]
-Aug 29 09:51:01 blizard kernel: Code: c7 00 05 00 00 00 eb 08 90 90 90 90 90 90 90 90 89 ec 5d c3
----------------------------------------------------------------------------
-
-Dr. G.W. Wettstein           Oncology Research Div. Computing Facility
-Roger Maris Cancer Center    INTERNET: greg@wind.rmcc.com
-820 4th St. N.
-Fargo, ND  58122
-Phone: 701-234-7556
-
-
----------------------------------------------------------------------------
-受污染的内核
-
-一些oops报告在程序记数器之后包含字符串'Tainted: '。这表明内核已经被一些东西给污
-染了。 该字符串之后紧跟着一系列的位置敏感的字符，每个代表一个特定的污染值。
-
-  1：'G'如果所有装载的模块都有GPL或相容的许可证，'P'如果装载了任何的专有模块。
-没有模块MODULE_LICENSE或者带有insmod认为是与GPL不相容的的MODULE_LICENSE的模块被
-认定是专有的。
-
-  2：'F'如果有任何通过“insmod -f”被强制装载的模块，' '如果所有模块都被正常装载。
-
-  3：'S'如果oops发生在SMP内核中，运行于没有证明安全运行多处理器的硬件。 当前这种
-情况仅限于几种不支持SMP的速龙处理器。
-
-  4：'R'如果模块通过“insmod -f”被强制装载，' '如果所有模块都被正常装载。
-
-  5：'M'如果任何处理器报告了机器检查异常，' '如果没有发生机器检查异常。
-
-  6：'B'如果页释放函数发现了一个错误的页引用或者一些非预期的页标志。
-
-  7：'U'如果用户或者用户应用程序特别请求设置污染标志，否则' '。
-
-  8：'D'如果内核刚刚死掉，比如有OOPS或者BUG。
-
-使用'Tainted: '字符串的主要原因是要告诉内核调试者，这是否是一个干净的内核亦或发
-生了任何的不正常的事。污染是永久的：即使出错的模块已经被卸载了，污染值仍然存在，
-以表明内核不再值得信任。
-- 
cgit v1.2.3


From 7e94556e26c67138009169cd61959adc12db24b6 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Thu, 25 Aug 2022 20:53:27 +0800
Subject: docs/zh_TW: Remove oops-tracing

The English version of oops-tracing has been
refactored and has been translated into Chinese.
Let's remove them.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Acked-by: Wu XiangCheng <bobwxc@email.cn>
Link: https://lore.kernel.org/r/0d22733cea474b0a3784f8de6b4bc4841fbaba77.1661431365.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/translations/zh_TW/oops-tracing.txt | 212 ----------------------
 1 file changed, 212 deletions(-)
 delete mode 100644 Documentation/translations/zh_TW/oops-tracing.txt

diff --git a/Documentation/translations/zh_TW/oops-tracing.txt b/Documentation/translations/zh_TW/oops-tracing.txt
deleted file mode 100644
index be8e59f2abaf..000000000000
--- a/Documentation/translations/zh_TW/oops-tracing.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-Chinese translated version of Documentation/admin-guide/bug-hunting.rst
-
-If you have any comment or update to the content, please contact the
-original document maintainer directly.  However, if you have a problem
-communicating in English you can also ask the Chinese maintainer for
-help.  Contact the Chinese maintainer if this translation is outdated
-or if there is a problem with the translation.
-
-Traditional Chinese maintainer:  Hu Haowen <src.res@email.cn>
----------------------------------------------------------------------
-Documentation/admin-guide/bug-hunting.rst 的繁體中文版翻譯
-
-如果想評論或更新本文的內容，請直接聯繫原文檔的維護者。如果你使用英文
-交流有困難的話，也可以向繁體中文版維護者求助。如果本翻譯更新不及時或
-者翻譯存在問題，請聯繫繁體中文版維護者。
-
-繁體中文版維護者： 胡皓文 Hu Haowen <src.res@email.cn>
-繁體中文版翻譯者： 胡皓文 Hu Haowen <src.res@email.cn>
-繁體中文版校譯者： 胡皓文 Hu Haowen <src.res@email.cn>
-
-以下爲正文
----------------------------------------------------------------------
-
-注意： ksymoops 在2.6中是沒有用的。 請以原有格式使用Oops(來自dmesg，等等)。
-忽略任何這樣那樣關於「解碼Oops」或者「通過ksymoops運行」的文檔。 如果你貼出運行過
-ksymoops的來自2.6的Oops，人們只會讓你重貼一次。
-
-快速總結
--------------
-
-發現Oops並發送給看似相關的內核領域的維護者。別太擔心對不上號。如果你不確定就發給
-和你所做的事情相關的代碼的負責人。 如果可重現試著描述怎樣重構。 那甚至比oops更有
-價值。
-
-如果你對於發送給誰一無所知， 發給linux-kernel@vger.kernel.org。感謝你幫助Linux
-儘可能地穩定。
-
-Oops在哪裡?
-----------------------
-
-通常Oops文本由klogd從內核緩衝區里讀取並傳給syslogd，由syslogd寫到syslog文件中，
-典型地是/var/log/messages(依賴於/etc/syslog.conf)。有時klogd崩潰了,這種情況下你
-能夠運行dmesg > file來從內核緩衝區中讀取數據並保存下來。 否則你可以
-cat /proc/kmsg > file， 然而你必須介入中止傳輸， kmsg是一個「永不結束的文件」。如
-果機器崩潰壞到你不能輸入命令或者磁碟不可用那麼你有三種選擇:-
-
-（1） 手抄屏幕上的文本待機器重啓後再輸入計算機。 麻煩但如果沒有針對崩潰的準備，
-這是僅有的選擇。 另外，你可以用數位相機把屏幕拍下來-不太好，但比沒有強。 如果信
-息滾動到了終端的上面，你會發現以高分辯率啓動（比如，vga=791）會讓你讀到更多的文
-本。（注意：這需要vesafb，所以對『早期』的oops沒有幫助）
-
-（2）用串口終端啓動（請參看Documentation/admin-guide/serial-console.rst），運行一個null
-modem到另一台機器並用你喜歡的通訊工具獲取輸出。Minicom工作地很好。
-
-（3）使用Kdump（請參看Documentation/admin-guide/kdump/kdump.rst），
-使用在Documentation/admin-guide/kdump/gdbmacros.txt中定義的dmesg gdb宏，從舊的內存中提取內核
-環形緩衝區。
-
-完整信息
-----------------
-
-注意：以下來自於Linus的郵件適用於2.4內核。 我因爲歷史原因保留了它，並且因爲其中
-一些信息仍然適用。 特別注意的是，請忽略任何ksymoops的引用。
-
-From: Linus Torvalds <torvalds@osdl.org>
-
-怎樣跟蹤Oops.. [原發到linux-kernel的一封郵件]
-
-主要的竅門是有五年和這些煩人的oops消息打交道的經驗;-)
-
-實際上，你有辦法使它更簡單。我有兩個不同的方法：
-
-	gdb /usr/src/linux/vmlinux
-	gdb> disassemble <offending_function>
-
-那是發現問題的簡單辦法，至少如果bug報告做的好的情況下（象這個一樣-運行ksymoops
-得到oops發生的函數及函數內的偏移）。
-
-哦，如果報告發生的內核以相同的編譯器和相似的配置編譯它會有幫助的。
-
-另一件要做的事是反彙編bug報告的「Code」部分：ksymoops也會用正確的工具來做這件事，
-但如果沒有那些工具你可以寫一個傻程序：
-
-	char str[] = "\xXX\xXX\xXX...";
-	main(){}
-
-並用gcc -g編譯它然後執行「disassemble str」（XX部分是由Oops報告的值-你可以僅剪切
-粘貼並用「\x」替換空格-我就是這麼做的，因爲我懶得寫程序自動做這一切）。
-
-另外，你可以用scripts/decodecode這個shell腳本。它的使用方法是：
-decodecode < oops.txt
-
-「Code」之後的十六進位字節可能（在某些架構上）有一些當前指令之前的指令字節以及
-當前和之後的指令字節
-
-Code: f9 0f 8d f9 00 00 00 8d 42 0c e8 dd 26 11 c7 a1 60 ea 2b f9 8b 50 08 a1
-64 ea 2b f9 8d 34 82 8b 1e 85 db 74 6d 8b 15 60 ea 2b f9 <8b> 43 04 39 42 54
-7e 04 40 89 42 54 8b 43 04 3b 05 00 f6 52 c0
-
-最後，如果你想知道代碼來自哪裡，你可以：
-
-	cd /usr/src/linux
-	make fs/buffer.s 	# 或任何產生BUG的文件
-
-然後你會比gdb反彙編更清楚的知道發生了什麼。
-
-現在，問題是把你所擁有的所有數據結合起來：C源碼（關於它應該怎樣的一般知識），
-彙編代碼及其反彙編得到的代碼（另外還有從「oops」消息得到的寄存器狀態-對了解毀壞的
-指針有用，而且當你有了彙編代碼你也能拿其它的寄存器和任何它們對應的C表達式做匹配
-）。
-
-實際上，你僅需看看哪裡不匹配（這個例子是「Code」反彙編和編譯器生成的代碼不匹配）。
-然後你須要找出爲什麼不匹配。通常很簡單-你看到代碼使用了空指針然後你看代碼想知道
-空指針是怎麼出現的，還有檢查它是否合法..
-
-現在，如果明白這是一項耗時的工作而且需要一丁點兒的專心，沒錯。這就是我爲什麼大多
-只是忽略那些沒有符號表信息的崩潰報告的原因：簡單的說太難查找了（我有一些
-程序用於在內核代碼段中搜索特定的模式，而且有時我也已經能找出那些崩潰的地方，但是
-僅僅是找出正確的序列也確實需要相當紮實的內核知識）
-
-_有時_會發生這種情況，我僅看到崩潰中的反彙編代碼序列， 然後我馬上就明白問題出在
-哪裡。這時我才意識到自己幹這個工作已經太長時間了;-)
-
-		Linus
-
-
----------------------------------------------------------------------------
-關於Oops跟蹤的註解：
-
-爲了幫助Linus和其它內核開發者，klogd納入了大量的支持來處理保護錯誤。爲了擁有對
-地址解析的完整支持至少應該使用1.3-pl3的sysklogd包。
-
-當保護錯誤發生時，klogd守護進程自動把內核日誌信息中的重要地址翻譯成它們相應的符
-號。
-
-klogd執行兩種類型的地址解析。首先是靜態翻譯其次是動態翻譯。靜態翻譯和ksymoops
-一樣使用System.map文件。爲了做靜態翻譯klogd守護進程必須在初始化時能找到system
-map文件。關於klogd怎樣搜索map文件請參看klogd手冊頁。
-
-動態地址翻譯在使用內核可裝載模塊時很重要。 因爲內核模塊的內存是從內核動態內存池
-里分配的，所以不管是模塊開始位置還是模塊中函數和符號的位置都不是固定的。
-
-內核支持允許程序決定裝載哪些模塊和它們在內存中位置的系統調用。使用這些系統調用
-klogd守護進程生成一張符號表用於調試發生在可裝載模塊中的保護錯誤。
-
-至少klogd會提供產生保護錯誤的模塊名。還可有額外的符號信息供可裝載模塊開發者選擇
-以從模塊中輸出符號信息。
-
-因爲內核模塊環境可能是動態的，所以必須有一種機制當模塊環境發生改變時來通知klogd
-守護進程。 有一些可用的命令行選項允許klogd向當前執行中的守護進程發送信號，告知符
-號信息應該被刷新了。 更多信息請參看klogd手冊頁。
-
-sysklogd發布時包含一個補丁修改了modules-2.0.0包，無論何時一個模塊裝載或者卸載都
-會自動向klogd發送信號。打上這個補丁提供了必要的對調試發生於內核可裝載模塊的保護
-錯誤的無縫支持。
-
-以下是被klogd處理過的發生在可裝載模塊中的一個保護錯誤例子：
----------------------------------------------------------------------------
-Aug 29 09:51:01 blizard kernel: Unable to handle kernel paging request at virtual address f15e97cc
-Aug 29 09:51:01 blizard kernel: current->tss.cr3 = 0062d000, %cr3 = 0062d000
-Aug 29 09:51:01 blizard kernel: *pde = 00000000
-Aug 29 09:51:01 blizard kernel: Oops: 0002
-Aug 29 09:51:01 blizard kernel: CPU:    0
-Aug 29 09:51:01 blizard kernel: EIP:    0010:[oops:_oops+16/3868]
-Aug 29 09:51:01 blizard kernel: EFLAGS: 00010212
-Aug 29 09:51:01 blizard kernel: eax: 315e97cc   ebx: 003a6f80   ecx: 001be77b   edx: 00237c0c
-Aug 29 09:51:01 blizard kernel: esi: 00000000   edi: bffffdb3   ebp: 00589f90   esp: 00589f8c
-Aug 29 09:51:01 blizard kernel: ds: 0018   es: 0018   fs: 002b   gs: 002b   ss: 0018
-Aug 29 09:51:01 blizard kernel: Process oops_test (pid: 3374, process nr: 21, stackpage=00589000)
-Aug 29 09:51:01 blizard kernel: Stack: 315e97cc 00589f98 0100b0b4 bffffed4 0012e38e 00240c64 003a6f80 00000001
-Aug 29 09:51:01 blizard kernel:        00000000 00237810 bfffff00 0010a7fa 00000003 00000001 00000000 bfffff00
-Aug 29 09:51:01 blizard kernel:        bffffdb3 bffffed4 ffffffda 0000002b 0007002b 0000002b 0000002b 00000036
-Aug 29 09:51:01 blizard kernel: Call Trace: [oops:_oops_ioctl+48/80] [_sys_ioctl+254/272] [_system_call+82/128]
-Aug 29 09:51:01 blizard kernel: Code: c7 00 05 00 00 00 eb 08 90 90 90 90 90 90 90 90 89 ec 5d c3
----------------------------------------------------------------------------
-
-Dr. G.W. Wettstein           Oncology Research Div. Computing Facility
-Roger Maris Cancer Center    INTERNET: greg@wind.rmcc.com
-820 4th St. N.
-Fargo, ND  58122
-Phone: 701-234-7556
-
-
----------------------------------------------------------------------------
-受汙染的內核
-
-一些oops報告在程序記數器之後包含字符串'Tainted: '。這表明內核已經被一些東西給汙
-染了。 該字符串之後緊跟著一系列的位置敏感的字符，每個代表一個特定的汙染值。
-
-  1：'G'如果所有裝載的模塊都有GPL或相容的許可證，'P'如果裝載了任何的專有模塊。
-沒有模塊MODULE_LICENSE或者帶有insmod認爲是與GPL不相容的的MODULE_LICENSE的模塊被
-認定是專有的。
-
-  2：'F'如果有任何通過「insmod -f」被強制裝載的模塊，' '如果所有模塊都被正常裝載。
-
-  3：'S'如果oops發生在SMP內核中，運行於沒有證明安全運行多處理器的硬體。 當前這種
-情況僅限於幾種不支持SMP的速龍處理器。
-
-  4：'R'如果模塊通過「insmod -f」被強制裝載，' '如果所有模塊都被正常裝載。
-
-  5：'M'如果任何處理器報告了機器檢查異常，' '如果沒有發生機器檢查異常。
-
-  6：'B'如果頁釋放函數發現了一個錯誤的頁引用或者一些非預期的頁標誌。
-
-  7：'U'如果用戶或者用戶應用程式特別請求設置汙染標誌，否則' '。
-
-  8：'D'如果內核剛剛死掉，比如有OOPS或者BUG。
-
-使用'Tainted: '字符串的主要原因是要告訴內核調試者，這是否是一個乾淨的內核亦或發
-生了任何的不正常的事。汙染是永久的：即使出錯的模塊已經被卸載了，汙染值仍然存在，
-以表明內核不再值得信任。
-
-- 
cgit v1.2.3


From 363fd2324c3ef9638a5ef42c5e5a2b75a8041d46 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Tue, 16 Aug 2022 13:19:07 +0100
Subject: docs: scheduler: Update new path for the sysctl knobs

Add missing update for the documentation bit of some scheduler knob.
The knobs have been moved to /debug/sched/ location (with adjusted names).

Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Tested-by: Yanteng Si <siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/20220816121907.841-1-lukasz.luba@arm.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/scheduler/sched-design-CFS.rst                    | 2 +-
 Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 59b2d1fb4dc4..03db55504515 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -94,7 +94,7 @@ other HZ detail.  Thus the CFS scheduler has no notion of "timeslices" in the
 way the previous scheduler had, and has no heuristics whatsoever.  There is
 only one central tunable (you have to switch on CONFIG_SCHED_DEBUG):
 
-   /proc/sys/kernel/sched_min_granularity_ns
+   /sys/kernel/debug/sched/min_granularity_ns
 
 which can be used to tune the scheduler from "desktop" (i.e., low latencies) to
 "server" (i.e., good batching) workloads.  It defaults to a setting suitable
diff --git a/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst b/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst
index 26b0f36f793d..3076402406c4 100644
--- a/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst
+++ b/Documentation/translations/zh_CN/scheduler/sched-design-CFS.rst
@@ -80,7 +80,7 @@ p->se.vruntime。一旦p->se.vruntime变得足够大，其它的任务将成为
 CFS使用纳秒粒度的计时，不依赖于任何jiffies或HZ的细节。因此CFS并不像之前的调度器那样
 有“时间片”的概念，也没有任何启发式的设计。唯一可调的参数（你需要打开CONFIG_SCHED_DEBUG）是：
 
-   /proc/sys/kernel/sched_min_granularity_ns
+   /sys/kernel/debug/sched/min_granularity_ns
 
 它可以用来将调度器从“桌面”模式（也就是低时延）调节为“服务器”（也就是高批处理）模式。
 它的默认设置是适合桌面的工作负载。SCHED_BATCH也被CFS调度器模块处理。
-- 
cgit v1.2.3


From 20e21951f3ebceac93168854edc1c222e515d292 Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Sat, 27 Aug 2022 08:18:53 +0800
Subject: docs/zh_CN: Update zh_CN/process/submit-checklist.rst to 6.0-rc2

* update to commit 163ba35ff371 ("doc: use KCFLAGS instead of
  EXTRA_CFLAGS to pass flags from command line")

Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si <siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/Ywli7VfhQVPHKiGw@bobwxc.mipc
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../zh_CN/process/submit-checklist.rst             | 84 ++++++++++++----------
 1 file changed, 45 insertions(+), 39 deletions(-)

diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst
index a64858d321fc..3d6ee21c74ae 100644
--- a/Documentation/translations/zh_CN/process/submit-checklist.rst
+++ b/Documentation/translations/zh_CN/process/submit-checklist.rst
@@ -1,105 +1,111 @@
 .. include:: ../disclaimer-zh_CN.rst
 
-:Original: :ref:`Documentation/process/submit-checklist.rst <submitchecklist>`
-:Translator: Alex Shi <alex.shi@linux.alibaba.com>
+:Original: Documentation/process/submit-checklist.rst
+:Translator:
+ - Alex Shi <alexs@kernel.org>
+ - Wu XiangCheng <bobwxc@email.cn>
 
 .. _cn_submitchecklist:
 
-Linux内核补丁提交清单
-~~~~~~~~~~~~~~~~~~~~~
+Linux内核补丁提交检查单
+~~~~~~~~~~~~~~~~~~~~~~~
 
 如果开发人员希望看到他们的内核补丁提交更快地被接受，那么他们应该做一些基本
 的事情。
 
-这些都是在
-:ref:`Documentation/translations/zh_CN/process/submitting-patches.rst <cn_submittingpatches>`
+这些都是在 Documentation/translations/zh_CN/process/submitting-patches.rst
 和其他有关提交Linux内核补丁的文档中提供的。
 
-1) 如果使用工具，则包括定义/声明该工具的文件。不要依赖于其他头文件拉入您使用
+1) 如果使用工具，则包括定义/声明该工具的文件。不要依赖其他头文件来引入您使用
    的头文件。
 
 2) 干净的编译：
 
-   a) 使用适用或修改的 ``CONFIG`` 选项 ``=y``、``=m`` 和 ``=n`` 。没有GCC
+   a) 使用合适的 ``CONFIG`` 选项 ``=y``、``=m`` 和 ``=n`` 。没有 ``gcc``
       警告/错误，没有链接器警告/错误。
 
-   b) 通过allnoconfig、allmodconfig
+   b) 通过 ``allnoconfig`` 、 ``allmodconfig``
 
    c) 使用 ``O=builddir`` 时可以成功编译
 
-3) 通过使用本地交叉编译工具或其他一些构建场在多个CPU体系结构上构建。
+   d) 任何 Doucmentation/ 下的变更都能成功构建且不引入新警告/错误。
+      用 ``make htmldocs`` 或 ``make pdfdocs`` 检验构建情况并修复问题。
+
+3) 通过使用本地交叉编译工具或其他一些构建设施在多个CPU体系结构上构建。
 
 4) PPC64是一种很好的交叉编译检查体系结构，因为它倾向于对64位的数使用无符号
    长整型。
 
-5) 如下所述 :ref:`Documentation/translations/zh_CN/process/coding-style.rst <cn_codingstyle>`.
-   检查您的补丁是否为常规样式。在提交（ ``scripts/check patch.pl`` ）之前，
-   使用补丁样式检查器检查是否有轻微的冲突。您应该能够处理您的补丁中存在的所有
+5) 按 Documentation/translations/zh_CN/process/coding-style.rst 所述检查您的
+   补丁是否为常规样式。在提交之前使用补丁样式检查器 ``scripts/checkpatch.pl``
+   检查是否有轻微的冲突。您应该能够处理您的补丁中存在的所有
    违规行为。
 
-6) 任何新的或修改过的 ``CONFIG`` 选项都不会弄脏配置菜单，并默认为关闭，除非
-   它们符合 ``Documentation/kbuild/kconfig-language.rst`` 中记录的异常条件,
-   菜单属性：默认值.
+6) 任何新的或修改过的 ``CONFIG`` 选项都不应搞乱配置菜单，并默认为关闭，除非
+   它们符合 ``Documentation/kbuild/kconfig-language.rst`` 菜单属性：默认值中
+   记录的例外条件。
 
 7) 所有新的 ``kconfig`` 选项都有帮助文本。
 
 8) 已仔细审查了相关的 ``Kconfig`` 组合。这很难用测试来纠正——脑力在这里是有
    回报的。
 
-9) 用 sparse 检查干净。
+9) 通过 sparse 清查。
+   （参见 Documentation/translations/zh_CN/dev-tools/sparse.rst ）
 
 10) 使用 ``make checkstack`` 和 ``make namespacecheck`` 并修复他们发现的任何
     问题。
 
     .. note::
 
-        ``checkstack`` 并没有明确指出问题，但是任何一个在堆栈上使用超过512
+        ``checkstack`` 并不会明确指出问题，但是任何一个在堆栈上使用超过512
         字节的函数都可以进行更改。
 
-11) 包括 :ref:`kernel-doc <kernel_doc>` 内核文档以记录全局内核API。（静态函数
-    不需要，但也可以。）使用 ``make htmldocs`` 或 ``make pdfdocs`` 检查
-    :ref:`kernel-doc <kernel_doc>` 并修复任何问题。
+11) 包括 :ref:`kernel-doc <kernel_doc_zh>` 内核文档以记录全局内核API。（静态
+    函数不需要，但也可以。）使用 ``make htmldocs`` 或 ``make pdfdocs`` 检查
+    :ref:`kernel-doc <kernel_doc_zh>` 并修复任何问题。
 
-12) 通过以下选项同时启用的测试 ``CONFIG_PREEMPT``, ``CONFIG_DEBUG_PREEMPT``,
+12) 通过以下选项同时启用的测试： ``CONFIG_PREEMPT``, ``CONFIG_DEBUG_PREEMPT``,
     ``CONFIG_DEBUG_SLAB``, ``CONFIG_DEBUG_PAGEALLOC``, ``CONFIG_DEBUG_MUTEXES``,
     ``CONFIG_DEBUG_SPINLOCK``, ``CONFIG_DEBUG_ATOMIC_SLEEP``,
-    ``CONFIG_PROVE_RCU`` and ``CONFIG_DEBUG_OBJECTS_RCU_HEAD``
-
-13) 已经过构建和运行时测试，包括有或没有 ``CONFIG_SMP``, ``CONFIG_PREEMPT``.
+    ``CONFIG_PROVE_RCU`` 和 ``CONFIG_DEBUG_OBJECTS_RCU_HEAD`` 。
 
-14) 如果补丁程序影响IO/磁盘等：使用或不使用 ``CONFIG_LBDAF`` 进行测试。
+13) 在 ``CONFIG_SMP``, ``CONFIG_PREEMPT`` 开启和关闭的情况下都进行构建和运行
+    时测试。
 
-15) 所有代码路径都已在启用所有lockdep功能的情况下运行。
+14) 所有代码路径都已在启用所有死锁检测（lockdep）功能的情况下运行。
 
-16) 所有新的/proc条目都记录在 ``Documentation/``
+15) 所有新的 ``/proc`` 条目都记录在 ``Documentation/``
 
-17) 所有新的内核引导参数都记录在
+16) 所有新的内核引导参数都记录在
     Documentation/admin-guide/kernel-parameters.rst 中。
 
-18) 所有新的模块参数都记录在 ``MODULE_PARM_DESC()``
+17) 所有新的模块参数都记录在 ``MODULE_PARM_DESC()``
 
-19) 所有新的用户空间接口都记录在 ``Documentation/ABI/`` 中。有关详细信息，
+18) 所有新的用户空间接口都记录在 ``Documentation/ABI/`` 中。有关详细信息，
     请参阅 ``Documentation/ABI/README`` 。更改用户空间接口的补丁应该抄送
     linux-api@vger.kernel.org。
 
-20) 已通过至少注入slab和page分配失败进行检查。请参阅 ``Documentation/fault-injection/``
+19) 已通过至少注入slab和page分配失败进行检查。请参阅 ``Documentation/fault-injection/`` 。
     如果新代码是实质性的，那么添加子系统特定的故障注入可能是合适的。
 
-21) 新添加的代码已经用 ``gcc -W`` 编译（使用 ``make EXTRA-CFLAGS=-W`` ）。这
+20) 新添加的代码已经用 ``gcc -W`` 编译（使用 ``make EXTRA-CFLAGS=-W`` ）。这
     将产生大量噪声，但对于查找诸如“警告：有符号和无符号之间的比较”之类的错误
     很有用。
 
-22) 在它被合并到-mm补丁集中之后进行测试，以确保它仍然与所有其他排队的补丁以
+21) 在它被合并到-mm补丁集中之后进行测试，以确保它仍然与所有其他排队的补丁以
     及VM、VFS和其他子系统中的各种更改一起工作。
 
-23) 所有内存屏障例如 ``barrier()``, ``rmb()``, ``wmb()`` 都需要源代码中的注
+22) 所有内存屏障（例如 ``barrier()``, ``rmb()``, ``wmb()`` ）都需要源代码注
     释来解释它们正在执行的操作及其原因的逻辑。
 
-24) 如果补丁添加了任何ioctl，那么也要更新 ``Documentation/userspace-api/ioctl/ioctl-number.rst``
+23) 如果补丁添加了任何ioctl，那么也要更新
+    ``Documentation/userspace-api/ioctl/ioctl-number.rst`` 。
 
-25) 如果修改后的源代码依赖或使用与以下 ``Kconfig`` 符号相关的任何内核API或
+24) 如果修改后的源代码依赖或使用与以下 ``Kconfig`` 符号相关的任何内核API或
     功能，则在禁用相关 ``Kconfig`` 符号和/或 ``=m`` （如果该选项可用）的情况
     下测试以下多个构建[并非所有这些都同时存在，只是它们的各种/随机组合]：
 
-    ``CONFIG_SMP``, ``CONFIG_SYSFS``, ``CONFIG_PROC_FS``, ``CONFIG_INPUT``, ``CONFIG_PCI``, ``CONFIG_BLOCK``, ``CONFIG_PM``, ``CONFIG_MAGIC_SYSRQ``,
-    ``CONFIG_NET``, ``CONFIG_INET=n`` (但是后者伴随 ``CONFIG_NET=y``).
+    ``CONFIG_SMP``, ``CONFIG_SYSFS``, ``CONFIG_PROC_FS``, ``CONFIG_INPUT``,
+    ``CONFIG_PCI``, ``CONFIG_BLOCK``, ``CONFIG_PM``, ``CONFIG_MAGIC_SYSRQ``,
+    ``CONFIG_NET``, ``CONFIG_INET=n`` （但是最后一个需要 ``CONFIG_NET=y`` ）。
-- 
cgit v1.2.3


From 6b0d3e7c5888d2ec0f5527ee699265dc852a5faa Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Sat, 27 Aug 2022 13:37:18 +0900
Subject: docs/conf.py: Treat mathjax as fallback math renderer

Currently, math expressions using the "math::" directive or
the ":math:" role of Sphinx need the imgmath extension for proper
rendering in html and epub builds.
imgmath requires dvipng (and latex).
Otherwise, "make htmldocs" will complain of missing commands.

As a matter of fact, the mathjax extension is loaded by default since
Sphinx v1.8 and it is good enough for html docs without any dependency
on texlive packages.

Stop loading the imgmath extension for html docs unless requirements
for imgmath are met.

To find out whether required commands are available, add a helper
find_command(), which is a wrapper of shutil.which().

For epub docs, keep the same behavior of always loading imgmath.

Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Acked-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/a6a877fc-dc93-2bda-a6d3-37001d99942a@gmail.com
[jc: Took out the writing of the math_renderer decision]
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/conf.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Documentation/conf.py b/Documentation/conf.py
index 934727e23e0e..f12eb51235da 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -15,6 +15,18 @@
 import sys
 import os
 import sphinx
+import shutil
+
+# helper
+# ------
+
+def have_command(cmd):
+    """Search ``cmd`` in the ``PATH`` environment.
+
+    If found, return True.
+    If not found, return False.
+    """
+    return shutil.which(cmd) is not None
 
 # Get Sphinx version
 major, minor, patch = sphinx.version_info[:3]
@@ -106,7 +118,20 @@ else:
 autosectionlabel_prefix_document = True
 autosectionlabel_maxdepth = 2
 
-extensions.append("sphinx.ext.imgmath")
+# Load math renderer:
+# For html builder, load imgmath only when its dependencies are met.
+# mathjax is the default math renderer since Sphinx 1.8.
+have_latex =  have_command('latex')
+have_dvipng = have_command('dvipng')
+load_imgmath = ((have_latex and have_dvipng)
+                or (major == 1 and minor < 8)
+                or 'epub' in sys.argv)
+
+if load_imgmath:
+    extensions.append("sphinx.ext.imgmath")
+    math_renderer = 'imgmath'
+else:
+    math_renderer = 'mathjax'
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
-- 
cgit v1.2.3


From 3b384e95642c0f01a34525a71b6a5a4826934b75 Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Sat, 27 Aug 2022 13:38:17 +0900
Subject: docs/conf.py: Respect env variable SPHINX_IMGMATH

On some distros with coarse-grained packaging policy, dvipng is
installed along with latex.  In such cases, math rendering will
use imgmath by default.  It is possible to override the choice by
specifying the option string of "-D html_math_renderer='mathjax'"
to sphinx-build (Sphinx >= 1.8).

To provide developers an easier-to-use knob, add code for an env
variable "SPHINX_IMGMATH" which overrides the automatic choice
of math renderer for html docs.

    SPHINX_IMGMATH=yes : Load imgmath even if dvipng is not found
    SPHINX_IMGMATH=no  : Don't load imgmath (fall back to mathjax)

Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Acked-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Link: https://lore.kernel.org/r/5a582b2b-d51c-a062-36b2-19479cf68fab@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/conf.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/Documentation/conf.py b/Documentation/conf.py
index f12eb51235da..cdcc8e0f46a6 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -123,8 +123,20 @@ autosectionlabel_maxdepth = 2
 # mathjax is the default math renderer since Sphinx 1.8.
 have_latex =  have_command('latex')
 have_dvipng = have_command('dvipng')
-load_imgmath = ((have_latex and have_dvipng)
-                or (major == 1 and minor < 8)
+load_imgmath = have_latex and have_dvipng
+
+# Respect SPHINX_IMGMATH (for html docs only)
+if 'SPHINX_IMGMATH' in os.environ:
+    env_sphinx_imgmath = os.environ['SPHINX_IMGMATH']
+    if 'yes' in env_sphinx_imgmath:
+        load_imgmath = True
+    elif 'no' in env_sphinx_imgmath:
+        load_imgmath = False
+    else:
+        sys.stderr.write("Unknown env SPHINX_IMGMATH=%s ignored.\n" % env_sphinx_imgmath)
+
+# Always load imgmath for Sphinx <1.8 or for epub docs
+load_imgmath = (load_imgmath or (major == 1 and minor < 8)
                 or 'epub' in sys.argv)
 
 if load_imgmath:
-- 
cgit v1.2.3


From d2bef8e1037cc69695c6b146bb05ce053450e0de Mon Sep 17 00:00:00 2001
From: Akhil Raj <lf32.dev@gmail.com>
Date: Sat, 27 Aug 2022 20:23:59 +0530
Subject: Remove duplicate words inside documentation

I have removed repeated `the` inside the documentation

Signed-off-by: Akhil Raj <lf32.dev@gmail.com>
Link: https://lore.kernel.org/r/20220827145359.32599-1-lf32.dev@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst    | 2 +-
 Documentation/bpf/map_cgroup_storage.rst          | 4 ++--
 Documentation/core-api/cpu_hotplug.rst            | 2 +-
 Documentation/driver-api/isa.rst                  | 2 +-
 Documentation/filesystems/caching/backend-api.rst | 2 +-
 Documentation/locking/seqlock.rst                 | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 8419019b6a88..6726f439958c 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -200,7 +200,7 @@ prb
 
 A pointer to the printk ringbuffer (struct printk_ringbuffer). This
 may be pointing to the static boot ringbuffer or the dynamically
-allocated ringbuffer, depending on when the the core dump occurred.
+allocated ringbuffer, depending on when the core dump occurred.
 Used by user-space tools to read the active kernel log buffer.
 
 printk_rb_static
diff --git a/Documentation/bpf/map_cgroup_storage.rst b/Documentation/bpf/map_cgroup_storage.rst
index cab9543017bf..8e5fe532c07e 100644
--- a/Documentation/bpf/map_cgroup_storage.rst
+++ b/Documentation/bpf/map_cgroup_storage.rst
@@ -31,7 +31,7 @@ The map uses key of type of either ``__u64 cgroup_inode_id`` or
     };
 
 ``cgroup_inode_id`` is the inode id of the cgroup directory.
-``attach_type`` is the the program's attach type.
+``attach_type`` is the program's attach type.
 
 Linux 5.9 added support for type ``__u64 cgroup_inode_id`` as the key type.
 When this key type is used, then all attach types of the particular cgroup and
@@ -155,7 +155,7 @@ However, the BPF program can still only associate with one map of each type
 ``BPF_MAP_TYPE_CGROUP_STORAGE`` or more than one
 ``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE``.
 
-In all versions, userspace may use the the attach parameters of cgroup and
+In all versions, userspace may use the attach parameters of cgroup and
 attach type pair in ``struct bpf_cgroup_storage_key`` as the key to the BPF map
 APIs to read or update the storage for a given attachment. For Linux 5.9
 attach type shared storages, only the first value in the struct, cgroup inode
diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst
index c6f4ba2fb32d..f75778d37488 100644
--- a/Documentation/core-api/cpu_hotplug.rst
+++ b/Documentation/core-api/cpu_hotplug.rst
@@ -560,7 +560,7 @@ available:
   * cpuhp_state_remove_instance(state, node)
   * cpuhp_state_remove_instance_nocalls(state, node)
 
-The arguments are the same as for the the cpuhp_state_add_instance*()
+The arguments are the same as for the cpuhp_state_add_instance*()
 variants above.
 
 The functions differ in the way how the installed callbacks are treated:
diff --git a/Documentation/driver-api/isa.rst b/Documentation/driver-api/isa.rst
index def4a7b690b5..3df1b1696524 100644
--- a/Documentation/driver-api/isa.rst
+++ b/Documentation/driver-api/isa.rst
@@ -100,7 +100,7 @@ I believe platform_data is available for this, but if rather not, moving
 the isa_driver pointer to the private struct isa_dev is ofcourse fine as
 well.
 
-Then, if the the driver did not provide a .match, it matches. If it did,
+Then, if the driver did not provide a .match, it matches. If it did,
 the driver match() method is called to determine a match.
 
 If it did **not** match, dev->platform_data is reset to indicate this to
diff --git a/Documentation/filesystems/caching/backend-api.rst b/Documentation/filesystems/caching/backend-api.rst
index d7507becf674..3a199fc50828 100644
--- a/Documentation/filesystems/caching/backend-api.rst
+++ b/Documentation/filesystems/caching/backend-api.rst
@@ -122,7 +122,7 @@ volumes, calling::
 to tell fscache that a volume has been withdrawn.  This waits for all
 outstanding accesses on the volume to complete before returning.
 
-When the the cache is completely withdrawn, fscache should be notified by
+When the cache is completely withdrawn, fscache should be notified by
 calling::
 
 	void fscache_relinquish_cache(struct fscache_cache *cache);
diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst
index 64405e5da63e..bfda1a5fecad 100644
--- a/Documentation/locking/seqlock.rst
+++ b/Documentation/locking/seqlock.rst
@@ -39,7 +39,7 @@ as the writer can invalidate a pointer that the reader is following.
 Sequence counters (``seqcount_t``)
 ==================================
 
-This is the the raw counting mechanism, which does not protect against
+This is the raw counting mechanism, which does not protect against
 multiple writers.  Write side critical sections must thus be serialized
 by an external lock.
 
-- 
cgit v1.2.3


From 8a7f0e8ab9e8c1a32eefd632fb11cdc08af28aee Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 31 Jul 2022 19:52:07 -0700
Subject: Documentation/ABI: correct possessive "its" typos

Correct all uses of "it's" that are meant to be possessive "its".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220801025207.29971-1-rdunlap@infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/ABI/testing/sysfs-bus-bcma           | 2 +-
 Documentation/ABI/testing/sysfs-bus-fcoe           | 2 +-
 Documentation/ABI/testing/sysfs-bus-iio-proximity  | 2 +-
 Documentation/ABI/testing/sysfs-devices-system-cpu | 2 +-
 Documentation/ABI/testing/sysfs-power              | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-bus-bcma b/Documentation/ABI/testing/sysfs-bus-bcma
index 721b4aea3020..e93d3ddca844 100644
--- a/Documentation/ABI/testing/sysfs-bus-bcma
+++ b/Documentation/ABI/testing/sysfs-bus-bcma
@@ -3,7 +3,7 @@ Date:		May 2011
 KernelVersion:	3.0
 Contact:	Rafał Miłecki <zajec5@gmail.com>
 Description:
-		Each BCMA core has it's manufacturer id. See
+		Each BCMA core has its manufacturer id. See
 		include/linux/bcma/bcma.h for possible values.
 
 What:		/sys/bus/bcma/devices/.../id
diff --git a/Documentation/ABI/testing/sysfs-bus-fcoe b/Documentation/ABI/testing/sysfs-bus-fcoe
index 8fe787cc4ab7..5a4f2091ac37 100644
--- a/Documentation/ABI/testing/sysfs-bus-fcoe
+++ b/Documentation/ABI/testing/sysfs-bus-fcoe
@@ -31,7 +31,7 @@ Description:	'FCoE Controller' instances on the fcoe bus.
 		1) Write interface name to ctlr_create 2) Configure the FCoE
 		Controller (ctlr_X) 3) Enable the FCoE Controller to begin
 		discovery and login. The FCoE Controller is destroyed by
-		writing it's name, i.e. ctlr_X to the ctlr_delete file.
+		writing its name, i.e. ctlr_X to the ctlr_delete file.
 
 Attributes:
 
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-proximity b/Documentation/ABI/testing/sysfs-bus-iio-proximity
index 3aac6dab8775..9b9d1cc9b703 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-proximity
+++ b/Documentation/ABI/testing/sysfs-bus-iio-proximity
@@ -18,7 +18,7 @@ Description:
 		on the signal from which time of flight measurements are
 		taken.
 		The appropriate values to take is dependent on both the
-		sensor and it's operating environment:
+		sensor and its operating environment:
 		* as3935 (0-31 range)
 		18 = indoors (default)
 		14 = outdoors
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 5bf61881f012..4b422f7dfe99 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -296,7 +296,7 @@ Description:	Processor frequency boosting control
 
 		This switch controls the boost setting for the whole system.
 		Boosting allows the CPU and the firmware to run at a frequency
-		beyond it's nominal limit.
+		beyond its nominal limit.
 
 		More details can be found in
 		Documentation/admin-guide/pm/cpufreq.rst
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 90ec4987074b..f99d433ff311 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -152,7 +152,7 @@ Description:
 		case further investigation is required to determine which
 		device is causing the problem.  Note that genuine RTC clock
 		values (such as when pm_trace has not been used), can still
-		match a device and output it's name here.
+		match a device and output its name here.
 
 What:		/sys/power/pm_async
 Date:		January 2009
-- 
cgit v1.2.3


From 3f10b50829194eb7ce4c8f320a665b7b076a5fb6 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Wed, 20 Jul 2022 06:13:24 +0200
Subject: docs: admin-guide: do not mention the 'run a.out user programs'
 feature

Running a.out user programs with the latest kernel release is a very rare
and uncommon use case nowadays. The support of a.out user programs is only
remaining for the alpha architecture and is not defined and activated in
the architecture's Kconfig (so even the activation of this support requires
to modify the Kconfig file and not just kernel build configuration).

The discussion on a.out support in 2019 (see Link) shows that the support
of a.out user programs is just remaining for a special corner case from
some (alpha architecture) users.

There is no need to point out and mention this special feature to the
general audience of kernel users. Delete the reference to this historic and
special feature.

Link: https://lore.kernel.org/all/CAHk-=wgt7M6yA5BJCJo0nF22WgPJnN8CvViL9CAJmd+S+Civ6w@mail.gmail.com/

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20220720041325.15693-2-lukas.bulwahn@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/README.rst | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index caa3c09a5c3f..b78fe64b39f6 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -262,8 +262,6 @@ Compiling the kernel
  - Make sure you have at least gcc 5.1 available.
    For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
 
-   Please note that you can still run a.out user programs with this kernel.
-
  - Do a ``make`` to create a compressed kernel image. It is also
    possible to do ``make install`` if you have lilo installed to suit the
    kernel makefiles, but you may want to check your particular lilo setup first.
-- 
cgit v1.2.3


From 32a3a9db16eb7551a6763820a8ed6f151f879eaa Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Wed, 20 Jul 2022 06:13:25 +0200
Subject: docs: admin-guide: for kernel bugs refer to other kernel
 documentation

The current section 'If something goes wrong' makes a number of suggestions
for debugging, bug hunting and reporting issues, which are quite briefly
described in that section.

However, the suggestions are also well covered in other kernel
documentation or sometimes simply outdated. Here, each suggestion in that
section is summarized, and then followed with its assessment, and the
derived action for each suggestion:

  - use MAINTAINERS and mailing list: covered in 'Reporting issues',
    summarized in the short guide, detailed in its further section.
    Reporting issues even provides some specific examples that guides
    readers well through the needed steps. Refer to 'Reporting issues'.

  - contact Linus Torvalds: probably outdated as currently described.
    nevertheless covered in 'Reporting issues'. Reporting issues points out
    to contact the relevant kernel maintainers first, and after some
    patience and failed attempts with those maintainers, contacting Linus
    Torvalds might be okay. Refer to 'Reporting issues'.

  - tell what kernel, how to duplicate, the setup, if the problem is new
    or old and when did you notice: covered in 'Reporting issues',
    especially in Step-by-step guide how to report issues to the kernel
    maintainers. Refer to 'Reporting issues'.

  - duplicate kernel bug reports exactly: covered in 'Reporting issues',
    especially in Write and send the report. Refer to 'Reporting issues'.

  - read 'Bug hunting': keep this reference. Refer to 'Bug hunting'.

  - compile the kernel with CONFIG_KALLSYMS: covered in 'Reporting issues',
    especially in Decode failure messages. Refer to 'Reporting issues'.

  - alternatively, use ksymoops: ksymoops at the mentioned URL seems not to
    be maintained anymore. It was released roughly once a year until
    version 2.4.11 in 2005, but has not seen a new release since then. The
    information in ./scripts/ksymoops/README is from 1999, and does not
    give more insight on its actual maintenance state either. Ksymoops is
    mentioned as system utility in changes.rst, but also not recommended
    there. Drop the explanation on using ksymoops.

  - alternatively, lookup dump manually with the EIP and nm to determine
    the function in which the kernel crashes: this method seems already a
    quite advanced and low-level debugging method. Even all the further
    references on bug hunting and debugging do not mention it. Drop this
    alternative method and limit mentioning methods explained in the other
    existing kernel documentation.

  - read 'Reporting issues': keep this reference.
    Refer to 'Reporting issues'.

  - use gdb for debugging: some specific details, e.g., edit
    arch/x86/Makefile, are probably outdated or limited to one (historic
    important) setup. Using gdb is covered in 'Bug hunting', 'Debugging
    kernel and modules via gdb' and 'Using kgdb, kdb and the kernel
    debugger internals'. Refer to those three documents.

Overall, it is sufficient to refer to reporting-issues.rst,
bug-hunting.rst, gdb-kernel-debugging.rst and kgdb.rst and this way cover
the existing suggestions.

'Reporting issues' is quite new and probably up to date. 'Bug hunting',
'Debugging kernel and modules via gdb' and 'Using kgdb, kdb and the kernel
debugger internals' might need some revisit and update, but they are
generally in an acceptable state for referring to them.

Replace the existing suggestions by reference to other existing kernel
documentation covering those suggestions---partly even nicely summarized
and then explained in greater detail.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Link: https://lore.kernel.org/r/20220720041325.15693-3-lukas.bulwahn@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/README.rst | 89 +++---------------------------------
 1 file changed, 7 insertions(+), 82 deletions(-)

diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index b78fe64b39f6..c47ce4830d4b 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -330,85 +330,10 @@ Compiling the kernel
 If something goes wrong
 -----------------------
 
- - If you have problems that seem to be due to kernel bugs, please check
-   the file MAINTAINERS to see if there is a particular person associated
-   with the part of the kernel that you are having trouble with. If there
-   isn't anyone listed there, then the second best thing is to mail
-   them to me (torvalds@linux-foundation.org), and possibly to any other
-   relevant mailing-list or to the newsgroup.
-
- - In all bug-reports, *please* tell what kernel you are talking about,
-   how to duplicate the problem, and what your setup is (use your common
-   sense).  If the problem is new, tell me so, and if the problem is
-   old, please try to tell me when you first noticed it.
-
- - If the bug results in a message like::
-
-     unable to handle kernel paging request at address C0000010
-     Oops: 0002
-     EIP:   0010:XXXXXXXX
-     eax: xxxxxxxx   ebx: xxxxxxxx   ecx: xxxxxxxx   edx: xxxxxxxx
-     esi: xxxxxxxx   edi: xxxxxxxx   ebp: xxxxxxxx
-     ds: xxxx  es: xxxx  fs: xxxx  gs: xxxx
-     Pid: xx, process nr: xx
-     xx xx xx xx xx xx xx xx xx xx
-
-   or similar kernel debugging information on your screen or in your
-   system log, please duplicate it *exactly*.  The dump may look
-   incomprehensible to you, but it does contain information that may
-   help debugging the problem.  The text above the dump is also
-   important: it tells something about why the kernel dumped code (in
-   the above example, it's due to a bad kernel pointer). More information
-   on making sense of the dump is in Documentation/admin-guide/bug-hunting.rst
-
- - If you compiled the kernel with CONFIG_KALLSYMS you can send the dump
-   as is, otherwise you will have to use the ``ksymoops`` program to make
-   sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
-   This utility can be downloaded from
-   https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
-   Alternatively, you can do the dump lookup by hand:
-
- - In debugging dumps like the above, it helps enormously if you can
-   look up what the EIP value means.  The hex value as such doesn't help
-   me or anybody else very much: it will depend on your particular
-   kernel setup.  What you should do is take the hex value from the EIP
-   line (ignore the ``0010:``), and look it up in the kernel namelist to
-   see which kernel function contains the offending address.
-
-   To find out the kernel function name, you'll need to find the system
-   binary associated with the kernel that exhibited the symptom.  This is
-   the file 'linux/vmlinux'.  To extract the namelist and match it against
-   the EIP from the kernel crash, do::
-
-     nm vmlinux | sort | less
-
-   This will give you a list of kernel addresses sorted in ascending
-   order, from which it is simple to find the function that contains the
-   offending address.  Note that the address given by the kernel
-   debugging messages will not necessarily match exactly with the
-   function addresses (in fact, that is very unlikely), so you can't
-   just 'grep' the list: the list will, however, give you the starting
-   point of each kernel function, so by looking for the function that
-   has a starting address lower than the one you are searching for but
-   is followed by a function with a higher address you will find the one
-   you want.  In fact, it may be a good idea to include a bit of
-   "context" in your problem report, giving a few lines around the
-   interesting one.
-
-   If you for some reason cannot do the above (you have a pre-compiled
-   kernel image or similar), telling me as much about your setup as
-   possible will help.  Please read
-   'Documentation/admin-guide/reporting-issues.rst' for details.
-
- - Alternatively, you can use gdb on a running kernel. (read-only; i.e. you
-   cannot change values or set break points.) To do this, first compile the
-   kernel with -g; edit arch/x86/Makefile appropriately, then do a ``make
-   clean``. You'll also need to enable CONFIG_PROC_FS (via ``make config``).
-
-   After you've rebooted with the new kernel, do ``gdb vmlinux /proc/kcore``.
-   You can now use all the usual gdb commands. The command to look up the
-   point where your system crashed is ``l *0xXXXXXXXX``. (Replace the XXXes
-   with the EIP value.)
-
-   gdb'ing a non-running kernel currently fails because ``gdb`` (wrongly)
-   disregards the starting offset for which the kernel is compiled.
+If you have problems that seem to be due to kernel bugs, please follow the
+instructions at 'Documentation/admin-guide/reporting-issues.rst'.
+
+Hints on understanding kernel bug reports are in
+'Documentation/admin-guide/bug-hunting.rst'. More on debugging the kernel
+with gdb is in 'Documentation/dev-tools/gdb-kernel-debugging.rst' and
+'Documentation/dev-tools/kgdb.rst'.
-- 
cgit v1.2.3


From 7f77ebbf758be6df89951713e7b234a9e6c5d32b Mon Sep 17 00:00:00 2001
From: Akhil Raj <lf32.dev@gmail.com>
Date: Mon, 29 Aug 2022 12:22:39 +0530
Subject: Delete duplicate words from kernel docs

I have deleted duplicate words like

to, guest, trace, when, we

Signed-off-by: Akhil Raj <lf32.dev@gmail.com>
Link: https://lore.kernel.org/r/20220829065239.4531-1-lf32.dev@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/RCU/checklist.rst         | 2 +-
 Documentation/RCU/lockdep.rst           | 2 +-
 Documentation/bpf/instruction-set.rst   | 2 +-
 Documentation/trace/histogram.rst       | 2 +-
 Documentation/trace/timerlat-tracer.rst | 2 +-
 Documentation/virt/kvm/x86/mmu.rst      | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst
index 42cc5d891bd2..7b1c85a16dc3 100644
--- a/Documentation/RCU/checklist.rst
+++ b/Documentation/RCU/checklist.rst
@@ -477,6 +477,6 @@ over a rather long period of time, but improvements are always welcome!
 	So if you need to wait for both an RCU grace period and for
 	all pre-existing call_rcu() callbacks, you will need to execute
 	both rcu_barrier() and synchronize_rcu(), if necessary, using
-	something like workqueues to to execute them concurrently.
+	something like workqueues to execute them concurrently.
 
 	See rcubarrier.rst for more information.
diff --git a/Documentation/RCU/lockdep.rst b/Documentation/RCU/lockdep.rst
index cc860a0c296b..a94f55991a71 100644
--- a/Documentation/RCU/lockdep.rst
+++ b/Documentation/RCU/lockdep.rst
@@ -61,7 +61,7 @@ checking of rcu_dereference() primitives:
 	rcu_access_pointer(p):
 		Return the value of the pointer and omit all barriers,
 		but retain the compiler constraints that prevent duplicating
-		or coalescsing.  This is useful when when testing the
+		or coalescsing.  This is useful when testing the
 		value of the pointer itself, for example, against NULL.
 
 The rcu_dereference_check() check expression can be any boolean
diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst
index 1b0e6711dec9..0ac7ae40be37 100644
--- a/Documentation/bpf/instruction-set.rst
+++ b/Documentation/bpf/instruction-set.rst
@@ -133,7 +133,7 @@ code field of ``BPF_END``.
 The byte swap instructions operate on the destination register
 only and do not use a separate source register or immediate value.
 
-The 1-bit source operand field in the opcode is used to to select what byte
+The 1-bit source operand field in the opcode is used to select what byte
 order the operation convert from or to:
 
   =========  =====  =================================================
diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst
index 859fd1b76c63..c1b685a38f6b 100644
--- a/Documentation/trace/histogram.rst
+++ b/Documentation/trace/histogram.rst
@@ -412,7 +412,7 @@ Extended error information
   Because the default sort key above is 'hitcount', the above shows a
   the list of call_sites by increasing hitcount, so that at the bottom
   we see the functions that made the most kmalloc calls during the
-  run.  If instead we we wanted to see the top kmalloc callers in
+  run.  If instead we wanted to see the top kmalloc callers in
   terms of the number of bytes requested rather than the number of
   calls, and we wanted the top caller to appear at the top, we can use
   the 'sort' parameter, along with the 'descending' modifier::
diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst
index d643c95c01eb..db17df312bc8 100644
--- a/Documentation/trace/timerlat-tracer.rst
+++ b/Documentation/trace/timerlat-tracer.rst
@@ -20,7 +20,7 @@ For example::
         [root@f32 ~]# cd /sys/kernel/tracing/
         [root@f32 tracing]# echo timerlat > current_tracer
 
-It is possible to follow the trace by reading the trace trace file::
+It is possible to follow the trace by reading the trace file::
 
   [root@f32 tracing]# cat trace
   # tracer: timerlat
diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst
index 8739120f4300..8364afa228ec 100644
--- a/Documentation/virt/kvm/x86/mmu.rst
+++ b/Documentation/virt/kvm/x86/mmu.rst
@@ -377,7 +377,7 @@ Emulating cr0.wp
 ================
 
 If tdp is not enabled, the host must keep cr0.wp=1 so page write protection
-works for the guest kernel, not guest guest userspace.  When the guest
+works for the guest kernel, not guest userspace.  When the guest
 cr0.wp=1, this does not present a problem.  However when the guest cr0.wp=0,
 we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the
 semantics require allowing any guest kernel access plus user read access).
-- 
cgit v1.2.3


From adb95582a09f24201e071732824aef53593020e2 Mon Sep 17 00:00:00 2001
From: Rong Tao <rtoax@foxmail.com>
Date: Wed, 14 Sep 2022 08:33:00 +0800
Subject: Documentation: process/submitting-patches: misspelling "mesages"

Fix spelling mistakes, "mesages" should be spelled "messages".

Signed-off-by: Rong Tao <rtoax@foxmail.com>
Link: https://lore.kernel.org/r/tencent_924BF0B25425E2D5673409D1CF604F682505@qq.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/submitting-patches.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
index be49d8f2601b..7dc94555417d 100644
--- a/Documentation/process/submitting-patches.rst
+++ b/Documentation/process/submitting-patches.rst
@@ -715,8 +715,8 @@ references.
 
 .. _backtraces:
 
-Backtraces in commit mesages
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Backtraces in commit messages
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Backtraces help document the call chain leading to a problem. However,
 not all backtraces are helpful. For example, early boot call chains are
-- 
cgit v1.2.3


From 980309d94e021ddac12e3bd9932e2a1686161899 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Tue, 6 Sep 2022 16:26:47 +0800
Subject: docs/zh_CN: add PCI acpi-info translation

Translate .../PCI/acpi-info.rst into Chinese.
Add PCI into .../zh_CN/index.rst.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Link: https://lore.kernel.org/r/f07ba17ae9c6d728d6135ecc0577a932e9836fba.1662449105.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/translations/zh_CN/PCI/acpi-info.rst | 139 +++++++++++++++++++++
 Documentation/translations/zh_CN/PCI/index.rst     |  13 +-
 Documentation/translations/zh_CN/index.rst         |   2 +-
 3 files changed, 145 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/translations/zh_CN/PCI/acpi-info.rst

diff --git a/Documentation/translations/zh_CN/PCI/acpi-info.rst b/Documentation/translations/zh_CN/PCI/acpi-info.rst
new file mode 100644
index 000000000000..a35f39dcd858
--- /dev/null
+++ b/Documentation/translations/zh_CN/PCI/acpi-info.rst
@@ -0,0 +1,139 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/PCI/acpi-info.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=====================
+PCI主桥的ACPI注意事项
+=====================
+
+一般的规则是，ACPI命名空间应该描述操作系统可能使用的所有东西，除非有其他方法让操作系
+统找到它[1, 2]。
+
+例如，没有标准的硬件机制来枚举PCI主桥，所以ACPI命名空间必须描述每个主桥、访问它
+下面的PCI配置空间的方法、主桥转发到PCI的地址空间窗口（使用_CRS）以及传统的INTx
+中断的路由（使用_PRT）。
+
+在主桥下面的PCI设备，通常不需要通过ACPI描述。操作系统可以通过标准的PCI枚举机制来
+发现它们，使用配置访问来发现和识别设备，并读取和测量它们的BAR。然而，如果ACPI为它们
+提供电源管理或热插拔功能，或者如果设备有由平台中断控制器连接的INTx中断，需要一个_PRT
+来描述这些连接，这种情况下ACPI可以描述PCI设备。
+
+ACPI资源描述是通过ACPI命名空间中设备的_CRS对象完成的[2]。_CRS就像一个通用的PCI BAR：
+操作系统可以读取_CRS并找出正在消耗的资源，即使它没有该设备的驱动程序[3]。这一点很重要，
+因为它意味着一个旧的操作系统可以正确地工作，即使是在操作系统不知道的新设备的系统上。新设
+备可能什么都不做，但操作系统至少可以确保没有资源与它们冲突。
+
+像MCFG、HPET、ECDT等静态表，不是保留地址空间的机制。静态表是在操作系统在启动初期且在它
+能够解析ACPI命名空间之前需要知道的东西。如果定义了一个新的表，即使旧的操作系统忽略了这
+个表，它也需要正常运行。_CRS允许这样做，因为它是通用的，可以被旧的操作系统解析；而静态表
+则不允许。
+
+如果操作系统要管理一个通过ACPI描述的不可发现的设备，该设备将有一个特定的_HID/_CID，以
+告诉操作系统与之绑定的驱动程序，并且_CRS告诉操作系统和驱动程序该设备的寄存器在哪里。
+
+PCI主桥是PNP0A03或PNP0A08设备。它们的_CRS应该描述它们所消耗的所有地址空间。这包括它
+们转发到PCI总线上的所有窗口，以及不转发到PCI的主桥本身的寄存器。主桥的寄存器包括次要/下
+级总线寄存器，决定了桥下面的总线范围，窗口寄存器描述了桥洞，等等。这些都是设备相关的，非
+架构相关的东西，所以PNP0A03/PNP0A08驱动可以管理它们的唯一方法是通过_PRS/_CRS/_SRS，
+它包含了特定于设备的细节。主桥寄存器也包括ECAM空间，因为它是由主桥消耗的。
+
+ACPI定义了一个Consumer/Producer位来区分桥寄存器（“Consumer”下文译作消费者）和
+桥洞（“Producer”下文译作生产者）[4, 5]，但是早期的BIOS没有正确使用这个位。其结果
+是，目前的ACPI规范只为扩展地址空间描述符定义了消费者/生产者；在旧的QWord/Word/Word地
+址空间描述符中，该位应该被忽略。因此，操作系统必须假定所有的QWord/Word/Word描述符都是
+窗口。
+
+在增加扩展地址空间描述符之前，消费者/生产者的失败意味着没有办法描述PNP0A03/PNP0A08设
+备本身的桥寄存器。解决办法是在PNP0C02捕捉器中描述桥寄存器（包括ECAM空间）[6]。
+除了ECAM之外，桥寄存器空间反正是特定于设备的，所以通用的PNP0A03/PNP0A08驱动程
+序(pci_root.c)没有必要了解它。
+
+新的架构应该能够在PNP0A03设备中使用“消费者”扩展地址空间描述符，用于桥寄存器，包括
+ECAM，尽管对[6]的严格解释可能禁止这样做。旧的x86和ia64内核假定所有的地址空间描述
+符，包括“消费者”扩展地址空间的描述符，都是窗口，所以在这些架构上以这种方式描述桥寄
+存器是不安全的。
+
+PNP0C02“主板”设备基本上是万能的。除了“不要将这些资源用于其他用途”之外，没有其他的编
+程模型。因此，PNP0C02 _CRS应该声明ACPI命名空间中(1)没有被_CRS声明的任何其他设备对
+象的地址空间，(2)不应该被OS分配给其他东西。
+
+除非有一个标准的固件接口用于配置访问，例如ia64 SAL接口[7]，否则PCIe规范要求使用增强
+型配置访问方法（ECAM）。主桥消耗ECAM内存地址空间并将内存访问转换为PCI配置访问。该规范
+定义了ECAM地址空间的布局和功能；只有地址空间的基础是特定于设备的。ACPI操作系统从静态
+MCFG表或PNP0A03设备中的_CBA方法中了解基础地址。
+
+MCFG表必须描述非热插拔主桥的ECAM空间[8]。由于MCFG是一个静态表，不能通过热插拔更新，
+PNP0A03设备中的_CBA方法描述了可热插拔主桥的ECAM空间[9]。请注意，对于MCFG和_CBA，
+基址总是对应于总线0，即使桥器下面的总线范围（通过_CRS报告）不从0开始。
+
+
+[1] ACPI 6.2, sec 6.1:
+    对于任何在非枚举类型的总线上的设备（例如，ISA总线），OSPM会枚举设备的标识符，ACPI
+    系统固件必须为每个设备提供一个_HID对象...以使OSPM能够做到这一点。
+
+[2] ACPI 6.2, sec 3.7:
+    操作系统枚举主板设备时，只需通过读取ACPI命名空间来寻找具有硬件ID的设备。
+
+    ACPI枚举的每个设备都包括ACPI命名空间中ACPI定义的对象，该对象报告设备可能占用的硬
+    件资源[_PRS]，报告设备当前使用的资源[_CRS]的对象，以及配置这些资源的对象[_SRS]。
+    这些信息被即插即用操作系统（OSPM）用来配置设备。
+
+[3] ACPI 6.2, sec 6.2:
+    OSPM使用设备配置对象来配置通过ACPI列举的设备的硬件资源。设备配置对象提供了关于当前
+    和可能的资源需求的信息，共享资源之间的关系，以及配置硬件资源的方法。
+
+    当OSPM枚举一个设备时，它调用_PRS来确定该设备的资源需求。它也可以调用_CRS来找到该设
+    备的当前资源设置。利用这些信息，即插即用系统决定设备应该消耗什么资源，并通过调用设备
+    的_SRS控制方法来设置这些资源。
+
+    在ACPI中，设备可以消耗资源（例如，传统的键盘），提供资源（例如，一个专有的PCI桥），
+    或者两者都做。除非另有规定，设备的资源被假定为来自设备层次结构中设备上方最近的匹配资
+    源。
+
+[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4:
+    QWord/DWord/Word 地址空间描述符 (.1, .2, .3)
+      常规标志: Bit [0] 被忽略。
+
+    扩展地址空间描述符 (.4)
+      常规标志: Bit [0] 消费者/生产者:
+
+        * 1 – 这个设备消费这个资源
+        * 0 – 该设备生产和消费该资源
+
+[5] ACPI 6.2, sec 19.6.43:
+    ResourceUsage指定内存范围是由这个设备（ResourceConsumer）消费还是传递给子设备
+    （ResourceProducer）。如果没有指定，那么就假定是ResourceConsumer。
+
+[6] PCI Firmware 3.2, sec 4.1.2:
+    如果操作系统不能原生的懂得保留MMCFG区域，MMCFG区域必须由固件保留。在MCFG表中或通
+    过_CBA方法（见第4.1.3节）报告的地址范围必须通过声明主板资源来保留。对于大多数系统，
+    主板资源将出现在ACPI命名空间的根部（在_SB下），在一个节点的_HID为EISAID（PNP0C0
+    2），在这种情况下的资源不应该要求在根PCI总线的_CRS。这些资源可以选择在Int15 E820
+    或EFIGetMemoryMap中作为保留内存返回，但必须始终通过ACPI作为主板资源报告。
+
+[7] PCI Express 4.0, sec 7.2.2:
+    对于PC兼容的系统，或者没有实现允许访问配置空间的处理器架构特定固件接口标准的系统，需
+    要使用本节中定义的ECAM。
+
+[8] PCI Firmware 3.2, sec 4.1.2:
+    MCFG表是一个ACPI表，用于沟通的基础地址对应的非热的可移动的PCI段组范围内的PCI段组在
+    启动时提供给操作系统。这对PC兼容系统来说是必需的。
+
+    MCFG表仅用于沟通在启动时系统可用的PCI段组对应的基址。
+
+[9] PCI Firmware 3.2, sec 4.1.3:
+    _CBA (Memory mapped Configuration Base Address) 控制方法是一个可选的ACPI对
+    象，用于返回热插拔主桥的64位内存映射的配置基址。_CBA 返回的基址是与处理器相关的地址。
+    _CBA 控制方法被评估为一个整数。
+
+    这个控制方法出现在主桥对象下。当_CBA方法出现在一个活动的主桥对象下时，操作系统会评
+    估这个结构，以确定内存映射的配置基址，对应于_CRS方法中指定的总线编号范围的PCI段组。
+    一个包含_CBA方法的ACPI命名空间对象也必须包含一个相应的_SEG方法。
diff --git a/Documentation/translations/zh_CN/PCI/index.rst b/Documentation/translations/zh_CN/PCI/index.rst
index 16acb2bd9b58..cbeb33c34a98 100644
--- a/Documentation/translations/zh_CN/PCI/index.rst
+++ b/Documentation/translations/zh_CN/PCI/index.rst
@@ -10,9 +10,6 @@
 :校译:
 
 
-
-.. _cn_PCI_index.rst:
-
 ===================
 Linux PCI总线子系统
 ===================
@@ -26,12 +23,12 @@ Linux PCI总线子系统
    pci-iov-howto
    msi-howto
    sysfs-pci
+   acpi-info
 
 
 Todolist:
 
-   acpi-info
-   pci-error-recovery
-   pcieaer-howto
-   endpoint/index
-   boot-interrupts
+* pci-error-recovery
+* pcieaer-howto
+* endpoint/index
+* boot-interrupts
diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst
index 4f04367a4c5e..2fc60e60feb4 100644
--- a/Documentation/translations/zh_CN/index.rst
+++ b/Documentation/translations/zh_CN/index.rst
@@ -121,6 +121,7 @@ TODOList:
    scheduler/index
    mm/index
    peci/index
+   PCI/index
 
 TODOList:
 
@@ -148,7 +149,6 @@ TODOList:
 * crypto/index
 * bpf/index
 * usb/index
-* PCI/index
 * scsi/index
 * misc-devices/index
 * mhi/index
-- 
cgit v1.2.3


From 330f5a300548babcaada664668bb17165db8d54c Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Tue, 6 Sep 2022 16:26:48 +0800
Subject: docs/zh_CN: add dt changesets translation

Translate .../devicetree/changesets.rst into Chinese.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/07d23cedda1e2cd8cf40d68059024d116f8d004e.1662449105.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/devicetree/changesets.rst   | 37 ++++++++++++++++++++++
 .../translations/zh_CN/devicetree/index.rst        |  3 +-
 2 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/devicetree/changesets.rst

diff --git a/Documentation/translations/zh_CN/devicetree/changesets.rst b/Documentation/translations/zh_CN/devicetree/changesets.rst
new file mode 100644
index 000000000000..2ace05f3c377
--- /dev/null
+++ b/Documentation/translations/zh_CN/devicetree/changesets.rst
@@ -0,0 +1,37 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/Devicetree/changesets.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+============
+设备树变更集
+============
+
+设备树变更集是一种方法，它允许人们以这样一种方式在实时树中使用变化，即要么使用全部的
+变化，要么不使用。如果在使用变更集的过程中发生错误，那么树将被回滚到之前的状态。一个
+变更集也可以在使用后被删除。
+
+当一个变更集被使用时，所有的改变在发出OF_RECONFIG通知器之前被一次性使用到树上。这是
+为了让接收者在收到通知时看到一个完整的、一致的树的状态。
+
+一个变化集的顺序如下。
+
+1. of_changeset_init() - 初始化一个变更集。
+
+2. 一些DT树变化的调用，of_changeset_attach_node(), of_changeset_detach_node(),
+   of_changeset_add_property(), of_changeset_remove_property,
+   of_changeset_update_property()来准备一组变更。此时不会对活动树做任何变更。所有
+   的变更操作都记录在of_changeset的 `entries` 列表中。
+
+3. of_changeset_apply() - 将变更使用到树上。要么整个变更集被使用，要么如果有错误，
+   树会被恢复到之前的状态。核心通过锁确保正确的顺序。如果需要的话，可以使用一个解锁的
+   __of_changeset_apply版本。
+
+如果一个成功使用的变更集需要被删除，可以用of_changeset_revert()来完成。
diff --git a/Documentation/translations/zh_CN/devicetree/index.rst b/Documentation/translations/zh_CN/devicetree/index.rst
index 3fc355fe0037..e9aff2ccc579 100644
--- a/Documentation/translations/zh_CN/devicetree/index.rst
+++ b/Documentation/translations/zh_CN/devicetree/index.rst
@@ -34,9 +34,10 @@ Devicetree Overlays
 .. toctree::
    :maxdepth: 1
 
+   changesets
+
 Todolist:
 
-*   changesets
 *   dynamic-resolution-notes
 *   overlay-notes
 
-- 
cgit v1.2.3


From 5e38432db8f361e002481b006e54ad4de678721f Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Tue, 6 Sep 2022 16:26:49 +0800
Subject: docs/zh_CN: add dt dynamic-resolution-notes translation

Translate .../devicetree/dynamic-resolution-notes.rst into Chinese.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/d8b7c06fe8fdb58cb2ec6989e09f9999aca2d8d1.1662449105.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../zh_CN/devicetree/dynamic-resolution-notes.rst  | 31 ++++++++++++++++++++++
 .../translations/zh_CN/devicetree/index.rst        |  2 +-
 2 files changed, 32 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst

diff --git a/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst b/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst
new file mode 100644
index 000000000000..115190341305
--- /dev/null
+++ b/Documentation/translations/zh_CN/devicetree/dynamic-resolution-notes.rst
@@ -0,0 +1,31 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/Devicetree/dynamic-resolution-notes.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+========================
+Devicetree动态解析器说明
+========================
+
+本文描述了内核内DeviceTree解析器的实现，它位于drivers/of/resolver.c中。
+
+解析器如何工作？
+----------------
+
+解析器被赋予一个任意的树作为输入，该树用适当的dtc选项编译，并有一个/plugin/标签。这就产
+生了适当的__fixups__和__local_fixups__节点。
+
+解析器依次通过以下步骤工作:
+
+1. 从实时树中获取最大的设备树phandle值 + 1.
+2. 调整树的所有本地 phandles，以解决这个量。
+3. 使用 __local__fixups__ 节点信息以相同的量调整所有本地引用。
+4. 对于__fixups__节点中的每个属性，找到它在实时树中引用的节点。这是用来标记该节点的标签。
+5. 检索fixup的目标的phandle。
+6. 对于属性中的每个fixup，找到节点:属性:偏移的位置，并用phandle值替换它。
diff --git a/Documentation/translations/zh_CN/devicetree/index.rst b/Documentation/translations/zh_CN/devicetree/index.rst
index e9aff2ccc579..be5b974c6e68 100644
--- a/Documentation/translations/zh_CN/devicetree/index.rst
+++ b/Documentation/translations/zh_CN/devicetree/index.rst
@@ -35,10 +35,10 @@ Devicetree Overlays
    :maxdepth: 1
 
    changesets
+   dynamic-resolution-notes
 
 Todolist:
 
-*   dynamic-resolution-notes
 *   overlay-notes
 
 Devicetree Bindings
-- 
cgit v1.2.3


From f773455ce59d9425c473e1e559fc26ea84bcdf00 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Tue, 6 Sep 2022 16:26:50 +0800
Subject: docs/zh_CN: add dt overlay-notes translation

Translate .../devicetree/overlay-notes.rst into Chinese.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/b957580e448e2d0ab7917644c8f8f1614060b20a.1662449105.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/devicetree/index.rst        |   5 +-
 .../zh_CN/devicetree/overlay-notes.rst             | 140 +++++++++++++++++++++
 2 files changed, 141 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/translations/zh_CN/devicetree/overlay-notes.rst

diff --git a/Documentation/translations/zh_CN/devicetree/index.rst b/Documentation/translations/zh_CN/devicetree/index.rst
index be5b974c6e68..9d95d2629b38 100644
--- a/Documentation/translations/zh_CN/devicetree/index.rst
+++ b/Documentation/translations/zh_CN/devicetree/index.rst
@@ -36,10 +36,7 @@ Devicetree Overlays
 
    changesets
    dynamic-resolution-notes
-
-Todolist:
-
-*   overlay-notes
+   overlay-notes
 
 Devicetree Bindings
 ===================
diff --git a/Documentation/translations/zh_CN/devicetree/overlay-notes.rst b/Documentation/translations/zh_CN/devicetree/overlay-notes.rst
new file mode 100644
index 000000000000..1bd482cb0a1b
--- /dev/null
+++ b/Documentation/translations/zh_CN/devicetree/overlay-notes.rst
@@ -0,0 +1,140 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/Devicetree/overlay-notes.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+==============
+设备树覆盖说明
+==============
+
+本文档描述了drivers/of/overlay.c中的内核内设备树覆盖功能的实现，是
+Documentation/devicetree/dynamic-resolution-notes.rst[1]的配套文档。
+
+覆盖如何工作
+------------
+
+设备树覆盖的目的是修改内核的实时树，并使修改以反映变化的方式影响内核的状态。
+由于内核主要处理的是设备，任何新的设备节点如果导致一个活动的设备，就应该创建它，
+而如果设备节点被禁用或被全部删除，受影响的设备应该被取消注册。
+
+让我们举个例子，我们有一个foo板，它的基本树形图如下::
+
+    ---- foo.dts ---------------------------------------------------------------
+	/* FOO平台 */
+	/dts-v1/;
+	/ {
+		compatible = "corp,foo";
+
+		/* 共享的资源 */
+		res: res {
+		};
+
+		/* 芯片上的外围设备 */
+		ocp: ocp {
+			/* 总是被实例化的外围设备 */
+			peripheral1 { ... };
+		};
+	};
+    ---- foo.dts ---------------------------------------------------------------
+
+覆盖bar.dts,
+::
+
+    ---- bar.dts - 按标签覆盖目标位置 ----------------------------
+	/dts-v1/;
+	/插件/;
+	&ocp {
+		/* bar外围 */
+		bar {
+			compatible = "corp,bar";
+			... /* 各种属性和子节点 */
+		};
+	};
+    ---- bar.dts ---------------------------------------------------------------
+
+当加载（并按照[1]中描述的方式解决）时，应该产生foo+bar.dts::
+
+    ---- foo+bar.dts -----------------------------------------------------------
+	/* FOO平台 + bar外围 */
+	/ {
+		compatible = "corp,foo";
+
+		/* 共享资源 */
+		res: res {
+		};
+
+		/* 芯片上的外围设备 */
+		ocp: ocp {
+			/* 总是被实例化的外围设备 */
+			peripheral1 { ... };
+
+			/* bar外围 */
+			bar {
+				compatible = "corp,bar";
+				... /* 各种属性和子节点 */
+			};
+		};
+	};
+    ---- foo+bar.dts -----------------------------------------------------------
+
+作为覆盖的结果，已经创建了一个新的设备节点（bar），因此将注册一个bar平台设备，
+如果加载了匹配的设备驱动程序，将按预期创建设备。
+
+如果基础DT不是用-@选项编译的，那么“&ocp”标签将不能用于将覆盖节点解析到基础
+DT中的适当位置。在这种情况下，可以提供目标路径。通过标签的目标位置的语法是比
+较好的，因为不管标签在DT中出现在哪里，覆盖都可以被应用到任何包含标签的基础DT上。
+
+上面的bar.dts例子被修改为使用目标路径语法，即为::
+
+    ---- bar.dts - 通过明确的路径覆盖目标位置 --------------------
+	/dts-v1/;
+	/插件/;
+	&{/ocp} {
+		/* bar外围 */
+		bar {
+			compatible = "corp,bar";
+			... /* 各种外围设备和子节点 */
+		}
+	};
+    ---- bar.dts ---------------------------------------------------------------
+
+
+内核中关于覆盖的API
+-------------------
+
+该API相当容易使用。
+
+1) 调用of_overlay_fdt_apply()来创建和应用一个覆盖的变更集。返回值是一个
+   错误或一个识别这个覆盖的cookie。
+
+2) 调用of_overlay_remove()来删除和清理先前通过调用of_overlay_fdt_apply()
+   而创建的覆盖变更集。不允许删除一个被另一个覆盖的覆盖变化集。
+
+最后，如果你需要一次性删除所有的覆盖，只需调用of_overlay_remove_all()，
+它将以正确的顺序删除每一个覆盖。
+
+你可以选择注册在覆盖操作中被调用的通知器。详见
+of_overlay_notifier_register/unregister和enum of_overlay_notify_action。
+
+OF_OVERLAY_PRE_APPLY、OF_OVERLAY_POST_APPLY或OF_OVERLAY_PRE_REMOVE
+的通知器回调可以存储指向覆盖层中的设备树节点或其内容的指针，但这些指针不能持
+续到OF_OVERLAY_POST_REMOVE的通知器回调。在OF_OVERLAY_POST_REMOVE通
+知器被调用后，包含覆盖层的内存将被kfree()ed。请注意，即使OF_OVERLAY_POST_REMOVE
+的通知器返回错误，内存也会被kfree()ed。
+
+drivers/of/dynamic.c中的变更集通知器是第二种类型的通知器，可以通过应用或移除
+覆盖层来触发。这些通知器不允许在覆盖层或其内容中存储指向设备树节点的指针。当包含
+覆盖层的内存因移除覆盖层而被释放时，覆盖层代码并不能防止这类指针仍然有效。
+
+任何其他保留指向覆盖层节点或数据的指针的代码都被认为是一个错误，因为在移除覆盖层
+后，该指针将指向已释放的内存。
+
+覆盖层的用户必须特别注意系统上发生的整体操作，以确保其他内核代码不保留任何指向覆
+盖层节点或数据的指针。任何无意中使用这种指针的例子是，如果一个驱动或子系统模块在
+应用了覆盖后被加载，并且该驱动或子系统扫描了整个设备树或其大部分，包括覆盖节点。
-- 
cgit v1.2.3


From 9485acfded2024bcd28e9e4d5d503ed7beda0308 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng@loongson.cn>
Date: Tue, 6 Sep 2022 16:26:51 +0800
Subject: docs/zh_CN: add dt kernel-api translation

Translte .../devicetree/kernel-api.rst into Chinese.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/35fd1b5801d7191e078937908008115f8949aac3.1662449105.git.siyanteng@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/devicetree/index.rst        |  5 +-
 .../translations/zh_CN/devicetree/kernel-api.rst   | 58 ++++++++++++++++++++++
 2 files changed, 59 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/translations/zh_CN/devicetree/kernel-api.rst

diff --git a/Documentation/translations/zh_CN/devicetree/index.rst b/Documentation/translations/zh_CN/devicetree/index.rst
index 9d95d2629b38..7451dbfdd3e5 100644
--- a/Documentation/translations/zh_CN/devicetree/index.rst
+++ b/Documentation/translations/zh_CN/devicetree/index.rst
@@ -24,10 +24,7 @@ Open Firmware 和 Devicetree
 
    usage-model
    of_unittest
-
-Todolist:
-
-*   kernel-api
+   kernel-api
 
 Devicetree Overlays
 ===================
diff --git a/Documentation/translations/zh_CN/devicetree/kernel-api.rst b/Documentation/translations/zh_CN/devicetree/kernel-api.rst
new file mode 100644
index 000000000000..6aa3b685494e
--- /dev/null
+++ b/Documentation/translations/zh_CN/devicetree/kernel-api.rst
@@ -0,0 +1,58 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/Devicetree/kernel-api.rst
+
+:翻译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+
+:校译:
+
+
+=================
+内核中的设备树API
+=================
+
+核心函数
+--------
+
+该API在以下内核代码中:
+
+drivers/of/base.c
+
+include/linux/of.h
+
+drivers/of/property.c
+
+include/linux/of_graph.h
+
+drivers/of/address.c
+
+drivers/of/irq.c
+
+drivers/of/fdt.c
+
+驱动模型函数
+------------
+
+该API在以下内核代码中:
+
+include/linux/of_device.h
+
+drivers/of/device.c
+
+include/linux/of_platform.h
+
+drivers/of/platform.c
+
+覆盖和动态DT函数
+----------------
+
+该API在以下内核代码中:
+
+drivers/of/resolver.c
+
+drivers/of/dynamic.c
+
+drivers/of/overlay.c
-- 
cgit v1.2.3


From 8bfdfa0d6b929ede7b6189e0e546ceb6a124d05d Mon Sep 17 00:00:00 2001
From: Shuah Khan <skhan@linuxfoundation.org>
Date: Thu, 1 Sep 2022 15:23:19 -0600
Subject: docs: update mediator information in CoC docs

Update mediator information in the CoC interpretation document.

Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220901212319.56644-1-skhan@linuxfoundation.org
Cc: stable@vger.kernel.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/code-of-conduct-interpretation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst
index e899f14a4ba2..4f8a06b00f60 100644
--- a/Documentation/process/code-of-conduct-interpretation.rst
+++ b/Documentation/process/code-of-conduct-interpretation.rst
@@ -51,7 +51,7 @@ the Technical Advisory Board (TAB) or other maintainers if you're
 uncertain how to handle situations that come up.  It will not be
 considered a violation report unless you want it to be.  If you are
 uncertain about approaching the TAB or any other maintainers, please
-reach out to our conflict mediator, Mishi Choudhary <mishi@linux.com>.
+reach out to our conflict mediator, Joanna Lee <joanna.lee@gesmer.com>.
 
 In the end, "be kind to each other" is really what the end goal is for
 everybody.  We know everyone is human and we all fail at times, but the
-- 
cgit v1.2.3


From 67fe6792a7fb9b7af85d1abf44605c3ab7d3b5f7 Mon Sep 17 00:00:00 2001
From: Salvatore Bonaccorso <carnil@debian.org>
Date: Thu, 1 Sep 2022 20:43:28 +0200
Subject: Documentation: stable: Document alternative for referring upstream
 commit hash

Additionally to the "commit <sha1> upstream." variant, "[ Upstream
commit <sha1> ]" is used as well as alternative to refer to the upstream
commit hash.

Signed-off-by: Salvatore Bonaccorso <carnil@debian.org>
Link: https://lore.kernel.org/r/20220901184328.4075701-1-carnil@debian.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/stable-kernel-rules.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/process/stable-kernel-rules.rst b/Documentation/process/stable-kernel-rules.rst
index c61865e91f52..2fd8aa593a28 100644
--- a/Documentation/process/stable-kernel-rules.rst
+++ b/Documentation/process/stable-kernel-rules.rst
@@ -97,6 +97,12 @@ text, like this:
 
     commit <sha1> upstream.
 
+or alternatively:
+
+.. code-block:: none
+
+    [ Upstream commit <sha1> ]
+
 Additionally, some patches submitted via :ref:`option_1` may have additional
 patch prerequisites which can be cherry-picked. This can be specified in the
 following format in the sign-off area:
-- 
cgit v1.2.3


From 622d6f198737c26a47d892f29d054bf20df5a460 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 31 Aug 2022 17:28:28 -0700
Subject: Documentation: filesystems: correct possessive "its"

Change occurrences of "it's" that are possessive to "its"
so that they don't read as "it is".

For f2fs.rst, reword one description for better clarity.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: Christian Brauner <brauner@kernel.org>
Cc: Seth Forshee <sforshee@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: "Christian Brauner (Microsoft)" <brauner@kernel.org>
Reviewed-by: Chao Yu <chao@kernel.org>
Reviewed-by: Jaegeuk Kim <jaegeuk@kernel.org>
Link: https://lore.kernel.org/r/20220901002828.25102-1-rdunlap@infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/filesystems/f2fs.rst                       | 5 ++---
 Documentation/filesystems/idmappings.rst                 | 2 +-
 Documentation/filesystems/qnx6.rst                       | 2 +-
 Documentation/filesystems/xfs-delayed-logging-design.rst | 6 +++---
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index d0c09663dae8..17df9a02ccff 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -286,9 +286,8 @@ compress_algorithm=%s:%d Control compress algorithm and its compress level, now,
 			 algorithm	level range
 			 lz4		3 - 16
 			 zstd		1 - 22
-compress_log_size=%u	 Support configuring compress cluster size, the size will
-			 be 4KB * (1 << %u), 16KB is minimum size, also it's
-			 default size.
+compress_log_size=%u	 Support configuring compress cluster size. The size will
+			 be 4KB * (1 << %u). The default and minimum sizes are 16KB.
 compress_extension=%s	 Support adding specified extension, so that f2fs can enable
 			 compression on those corresponding files, e.g. if all files
 			 with '.ext' has high compression rate, we can set the '.ext'
diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst
index c1db8748389c..b9b31066aef2 100644
--- a/Documentation/filesystems/idmappings.rst
+++ b/Documentation/filesystems/idmappings.rst
@@ -661,7 +661,7 @@ idmappings::
  mount idmapping:      u0:k10000:r10000
 
 Assume a file owned by ``u1000`` is read from disk. The filesystem maps this id
-to ``k21000`` according to it's idmapping. This is what is stored in the
+to ``k21000`` according to its idmapping. This is what is stored in the
 inode's ``i_uid`` and ``i_gid`` fields.
 
 When the caller queries the ownership of this file via ``stat()`` the kernel
diff --git a/Documentation/filesystems/qnx6.rst b/Documentation/filesystems/qnx6.rst
index fd13433d362c..523b798f04e7 100644
--- a/Documentation/filesystems/qnx6.rst
+++ b/Documentation/filesystems/qnx6.rst
@@ -176,7 +176,7 @@ Then userspace.
 The requirement for a static, fixed preallocated system area comes from how
 qnx6fs deals with writes.
 
-Each superblock got it's own half of the system area. So superblock #1
+Each superblock got its own half of the system area. So superblock #1
 always uses blocks from the lower half while superblock #2 just writes to
 blocks represented by the upper half bitmap system area bits.
 
diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs-delayed-logging-design.rst
index 02b32030bab3..6402ab8e370c 100644
--- a/Documentation/filesystems/xfs-delayed-logging-design.rst
+++ b/Documentation/filesystems/xfs-delayed-logging-design.rst
@@ -551,14 +551,14 @@ Essentially, this shows that an item that is in the AIL can still be modified
 and relogged, so any tracking must be separate to the AIL infrastructure. As
 such, we cannot reuse the AIL list pointers for tracking committed items, nor
 can we store state in any field that is protected by the AIL lock. Hence the
-committed item tracking needs it's own locks, lists and state fields in the log
+committed item tracking needs its own locks, lists and state fields in the log
 item.
 
 Similar to the AIL, tracking of committed items is done through a new list
 called the Committed Item List (CIL).  The list tracks log items that have been
 committed and have formatted memory buffers attached to them. It tracks objects
 in transaction commit order, so when an object is relogged it is removed from
-it's place in the list and re-inserted at the tail. This is entirely arbitrary
+its place in the list and re-inserted at the tail. This is entirely arbitrary
 and done to make it easy for debugging - the last items in the list are the
 ones that are most recently modified. Ordering of the CIL is not necessary for
 transactional integrity (as discussed in the next section) so the ordering is
@@ -884,7 +884,7 @@ pin the object the first time it is inserted into the CIL - if it is already in
 the CIL during a transaction commit, then we do not pin it again. Because there
 can be multiple outstanding checkpoint contexts, we can still see elevated pin
 counts, but as each checkpoint completes the pin count will retain the correct
-value according to it's context.
+value according to its context.
 
 Just to make matters slightly more complex, this checkpoint level context
 for the pin count means that the pinning of an item must take place under the
-- 
cgit v1.2.3


From 06cb31cc761823ef444ba4e1df11347342a6e745 Mon Sep 17 00:00:00 2001
From: Lin Yujun <linyujun809@huawei.com>
Date: Tue, 30 Aug 2022 20:36:14 +0800
Subject: Documentation/hw-vuln: Update spectre doc

commit 7c693f54c873691 ("x86/speculation: Add spectre_v2=ibrs option to support Kernel IBRS")

adds the "ibrs " option  in
Documentation/admin-guide/kernel-parameters.txt but omits it to
Documentation/admin-guide/hw-vuln/spectre.rst, add it.

Signed-off-by: Lin Yujun <linyujun809@huawei.com>
Link: https://lore.kernel.org/r/20220830123614.23007-1-linyujun809@huawei.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/hw-vuln/spectre.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst
index 2ce2a38cdd55..c4dcdb3d0d45 100644
--- a/Documentation/admin-guide/hw-vuln/spectre.rst
+++ b/Documentation/admin-guide/hw-vuln/spectre.rst
@@ -613,6 +613,7 @@ kernel command line.
                 eibrs                   enhanced IBRS
                 eibrs,retpoline         enhanced IBRS + Retpolines
                 eibrs,lfence            enhanced IBRS + LFENCE
+                ibrs                    use IBRS to protect kernel
 
 		Not specifying this option is equivalent to
 		spectre_v2=auto.
-- 
cgit v1.2.3


From 5415673414c99b44db0c917f1400ad3b6d56fef7 Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Tue, 30 Aug 2022 13:23:12 +0800
Subject: docs/zh_CN: Update zh_CN/process/coding-style.rst to 6.0-rc2

* update to commit c04639a7d2fb ("coding-style.rst: trivial: fix
  location of driver model macros")

Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si <siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/Yw2ewM4wfaDDLjTk@bobwxc.mipc
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/process/coding-style.rst    | 274 +++++++++++++++------
 1 file changed, 202 insertions(+), 72 deletions(-)

diff --git a/Documentation/translations/zh_CN/process/coding-style.rst b/Documentation/translations/zh_CN/process/coding-style.rst
index 638d714bec83..fa28ef0a7fee 100644
--- a/Documentation/translations/zh_CN/process/coding-style.rst
+++ b/Documentation/translations/zh_CN/process/coding-style.rst
@@ -1,21 +1,23 @@
 .. include:: ../disclaimer-zh_CN.rst
 
-:Original: :ref:`Documentation/process/coding-style.rst <codingstyle>`
+:Original: Documentation/process/coding-style.rst
 
 .. _cn_codingstyle:
 
-译者::
+:译者:
+ - 张乐 Zhang Le <r0bertz@gentoo.org>
+ - Andy Deng <theandy.deng@gmail.com>
+ - 吴想成 <bobwxc@email.cn>
 
-  中文版维护者： 张乐 Zhang Le <r0bertz@gentoo.org>
-  中文版翻译者： 张乐 Zhang Le <r0bertz@gentoo.org>
-  中文版校译者： 王聪 Wang Cong <xiyou.wangcong@gmail.com>
-                 wheelz <kernel.zeng@gmail.com>
-                 管旭东 Xudong Guan <xudong.guan@gmail.com>
-                 Li Zefan <lizf@cn.fujitsu.com>
-                 Wang Chen <wangchen@cn.fujitsu.com>
+:校译:
+ - 王聪 Wang Cong <xiyou.wangcong@gmail.com>
+ - wheelz <kernel.zeng@gmail.com>
+ - 管旭东 Xudong Guan <xudong.guan@gmail.com>
+ - Li Zefan <lizf@cn.fujitsu.com>
+ - Wang Chen <wangchen@cn.fujitsu.com>
 
 Linux 内核代码风格
-=========================
+==================
 
 这是一个简短的文档，描述了 linux 内核的首选代码风格。代码风格是因人而异的，
 而且我不愿意把自己的观点强加给任何人，但这就像我去做任何事情都必须遵循的原则
@@ -29,7 +31,7 @@ Linux 内核代码风格
 
 
 1) 缩进
---------------
+-------
 
 制表符是 8 个字符，所以缩进也是 8 个字符。有些异端运动试图将缩进变为 4 (甚至
 2！) 字符深，这几乎相当于尝试将圆周率的值定义为 3。
@@ -73,6 +75,22 @@ Linux 内核代码风格
 	if (condition) do_this;
 	  do_something_everytime;
 
+不要使用逗号来避免使用大括号：
+
+.. code-block:: c
+
+	if (condition)
+		do_this(), do_that();
+
+使用大括号包裹多语句：
+
+.. code-block:: c
+
+	if (condition) {
+		do_this();
+		do_that();
+	}
+
 也不要在一行里放多个赋值语句。内核代码风格超级简单。就是避免可能导致别人误读
 的表达式。
 
@@ -83,20 +101,25 @@ Linux 内核代码风格
 
 
 2) 把长的行和字符串打散
-------------------------------
+-----------------------
 
 代码风格的意义就在于使用平常使用的工具来维持代码的可读性和可维护性。
 
 每一行的长度的限制是 80 列，我们强烈建议您遵守这个惯例。
 
 长于 80 列的语句要打散成有意义的片段。除非超过 80 列能显著增加可读性，并且不
-会隐藏信息。子片段要明显短于母片段，并明显靠右。这同样适用于有着很长参数列表
-的函数头。然而，绝对不要打散对用户可见的字符串，例如 printk 信息，因为这样就
+会隐藏信息。
+
+子片段要明显短于母片段，并明显靠右。一种非常常用的样式是将子体与函数左括号对齐。
+
+这同样适用于有着很长参数列表的函数头。
+
+然而，绝对不要打散对用户可见的字符串，例如 printk 信息，因为这样就
 很难对它们 grep。
 
 
 3) 大括号和空格的放置
-------------------------------
+---------------------
 
 C 语言风格中另外一个常见问题是大括号的放置。和缩进大小不同，选择或弃用某种放
 置策略并没有多少技术上的原因，不过首选的方式，就像 Kernighan 和 Ritchie 展示
@@ -132,12 +155,12 @@ C 语言风格中另外一个常见问题是大括号的放置。和缩进大小
 		body of function
 	}
 
-全世界的异端可能会抱怨这个不一致性是... 呃... 不一致的，不过所有思维健全的人
+全世界的异端可能会抱怨这个不一致性是……呃……不一致，不过所有思维健全的人
 都知道 (a) K&R 是 **正确的** 并且 (b) K&R 是正确的。此外，不管怎样函数都是特
 殊的 (C 函数是不能嵌套的)。
 
 注意结束大括号独自占据一行，除非它后面跟着同一个语句的剩余部分，也就是 do 语
-句中的 "while" 或者 if 语句中的 "else"，像这样：
+句中的 ``while`` 或者 if 语句中的 ``else`` ，像这样：
 
 .. code-block:: c
 
@@ -191,7 +214,7 @@ C 语言风格中另外一个常见问题是大括号的放置。和缩进大小
 	}
 
 3.1) 空格
-********************
+*********
 
 Linux 内核的空格使用方式 (主要) 取决于它是用于函数还是关键字。(大多数) 关键字
 后要加一个空格。值得注意的例外是 sizeof, typeof, alignof 和 __attribute__，这
@@ -254,7 +277,7 @@ Linux 内核的空格使用方式 (主要) 取决于它是用于函数还是关
 
 
 4) 命名
-------------------------------
+-------
 
 C 是一个简朴的语言，你的命名也应该这样。和 Modula-2 和 Pascal 程序员不同，
 C 程序员不使用类似 ThisVariableIsATemporaryCounter 这样华丽的名字。C 程序员会
@@ -275,11 +298,31 @@ C 程序员不使用类似 ThisVariableIsATemporaryCounter 这样华丽的名字
 可能的话。类似的， ``tmp`` 可以用来称呼任意类型的临时变量。
 
 如果你怕混淆了你的本地变量名，你就遇到另一个问题了，叫做函数增长荷尔蒙失衡综
-合症。请看第六章 (函数)。
+合征。请看第六章 (函数)。
 
+对于符号名称和文档，避免引入新的“master/slave”（或独立于“master”的“slave”）
+和“blacklist/whitelist”。
+
+“master/slave”推荐替换为：
+    '{primary,main} / {secondary,replica,subordinate}'
+    '{initiator,requester} / {target,responder}'
+    '{controller,host} / {device,worker,proxy}'
+    'leader/follower'
+    'director/performer'
+
+“blacklist/whitelist”推荐替换为：
+    'denylist/allowlist'
+    'blocklist/passlist'
+
+引入新用法的例外情况是：维护用户空间ABI/API，或更新现有（截至2020年）硬件或
+协议规范的代码时要求这些术语。对于新规范，尽可能将术语的规范用法转换为内核
+编码标准。
+
+.. warning::
+	以上主从、黑白名单规则不适用于中文文档，请勿更改中文术语！
 
 5) Typedef
------------
+----------
 
 不要使用类似 ``vps_t`` 之类的东西。
 
@@ -308,7 +351,7 @@ C 程序员不使用类似 ThisVariableIsATemporaryCounter 这样华丽的名字
 
      .. note::
 
-       不透明性和 "访问函数" 本身是不好的。我们使用 pte_t 等类型的原因在于真
+       不透明性和“访问函数”本身是不好的。我们使用 pte_t 等类型的原因在于真
        的是完全没有任何共用的可访问信息。
 
  (b) 清楚的整数类型，如此，这层抽象就可以 **帮助** 消除到底是 ``int`` 还是
@@ -353,7 +396,7 @@ C 程序员不使用类似 ThisVariableIsATemporaryCounter 这样华丽的名字
 
 
 6) 函数
-------------------------------
+-------
 
 函数应该简短而漂亮，并且只完成一件事情。函数应该可以一屏或者两屏显示完 (我们
 都知道 ISO/ANSI 屏幕大小是 80x24)，只做一件事情，而且把它做好。
@@ -383,12 +426,46 @@ C 程序员不使用类似 ThisVariableIsATemporaryCounter 这样华丽的名字
 	}
 	EXPORT_SYMBOL(system_is_up);
 
-在函数原型中，包含函数名和它们的数据类型。虽然 C 语言里没有这样的要求，在
+6.1) 函数原型
+*************
+
+在函数原型中包含参数名和它们的数据类型。虽然 C 语言里没有这样的要求，但在
 Linux 里这是提倡的做法，因为这样可以很简单的给读者提供更多的有价值的信息。
 
+不要在函数声明里使用 ``extern`` 关键字，因为这会导致代码行变长，并且不是严格
+必需的。
+
+写函数原型时，请保持 `元素顺序规则 <https://lore.kernel.org/mm-commits/CAHk-=wiOCLRny5aifWNhr621kYrJwhfURsa0vFPeUEm8mF0ufg@mail.gmail.com/>`_ 。
+例如下列函数声明::
+
+ __init void * __must_check action(enum magic value, size_t size, u8 count,
+				   char *fmt, ...) __printf(4, 5) __malloc;
+
+推荐的函数原型元素顺序是：
+
+- 储存类型（下方的 ``static __always_inline`` ，注意 ``__always_inline``
+  技术上来讲是个属性但被当做 ``inline`` ）
+- 储存类型属性（上方的 ``__init`` ——即节声明，但也像 ``__cold`` ）
+- 返回类型（上方的 ``void *`` ）
+- 返回类型属性（上方的 ``__must_check`` ）
+- 函数名（上方的 ``action`` ）
+- 函数参数（上方的 ``(enum magic value, size_t size, u8 count, char *fmt, ...)`` ，
+  注意必须写上参数名）
+- 函数参数属性（上方的 ``__printf(4, 5)`` ）
+- 函数行为属性（上方的 ``__malloc`` ）
+
+请注意，对于函数 **定义** （即实际函数体），编译器不允许在函数参数之后添加函
+数参数属性。在这种情况下，它们应该跟随存储类型属性（例如，与上面的 **声明**
+示例相比，请注意下面的 ``__printf(4, 5)`` 的位置发生了变化）::
+
+ static __always_inline __init __printf(4, 5) void * __must_check action(enum magic value,
+		size_t size, u8 count, char *fmt, ...) __malloc
+ {
+	...
+ }
 
 7) 集中的函数退出途径
-------------------------------
+---------------------
 
 虽然被某些人声称已经过时，但是 goto 语句的等价物还是经常被编译器所使用，具体
 形式是无条件跳转指令。
@@ -432,7 +509,7 @@ Linux 里这是提倡的做法，因为这样可以很简单的给读者提供
 		return result;
 	}
 
-一个需要注意的常见错误是 ``一个 err 错误`` ，就像这样：
+一个需要注意的常见错误是 ``单 err 错误`` ，就像这样：
 
 .. code-block:: c
 
@@ -456,19 +533,19 @@ Linux 里这是提倡的做法，因为这样可以很简单的给读者提供
 
 
 8) 注释
-------------------------------
+-------
 
 注释是好的，不过有过度注释的危险。永远不要在注释里解释你的代码是如何运作的：
 更好的做法是让别人一看你的代码就可以明白，解释写的很差的代码是浪费时间。
 
-一般的，你想要你的注释告诉别人你的代码做了什么，而不是怎么做的。也请你不要把
+一般来说你用注释告诉别人你的代码做了什么，而不是怎么做的。也请你不要把
 注释放在一个函数体内部：如果函数复杂到你需要独立的注释其中的一部分，你很可能
 需要回到第六章看一看。你可以做一些小注释来注明或警告某些很聪明 (或者槽糕) 的
 做法，但不要加太多。你应该做的，是把注释放在函数的头部，告诉人们它做了什么，
 也可以加上它做这些事情的原因。
 
-当注释内核 API 函数时，请使用 kernel-doc 格式。请看
-Documentation/doc-guide/ 和 scripts/kernel-doc 以获得详细信息。
+当注释内核 API 函数时，请使用 kernel-doc 格式。详见
+Documentation/translations/zh_CN/doc-guide/index.rst 和 scripts/kernel-doc 。
 
 长 (多行) 注释的首选风格是：
 
@@ -500,17 +577,18 @@ Documentation/doc-guide/ 和 scripts/kernel-doc 以获得详细信息。
 
 
 9) 你已经把事情弄糟了
-------------------------------
+---------------------
 
-这没什么，我们都是这样。可能你的使用了很长时间 Unix 的朋友已经告诉你
+这没什么，我们都是这样。可能你长期使用 Unix 的朋友已经告诉你
 ``GNU emacs`` 能自动帮你格式化 C 源代码，而且你也注意到了，确实是这样，不过它
 所使用的默认值和我们想要的相去甚远 (实际上，甚至比随机打的还要差——无数个猴子
-在 GNU emacs 里打字永远不会创造出一个好程序) (译注：Infinite Monkey Theorem)
+在 GNU emacs 里打字永远不会创造出一个好程序)
+*(译注：Infinite Monkey Theorem)*
 
 所以你要么放弃 GNU emacs，要么改变它让它使用更合理的设定。要采用后一个方案，
 你可以把下面这段粘贴到你的 .emacs 文件里。
 
-.. code-block:: none
+.. code-block:: elisp
 
   (defun c-lineup-arglist-tabs-only (ignored)
     "Line up argument lists by tabs, not spaces"
@@ -529,7 +607,7 @@ Documentation/doc-guide/ 和 scripts/kernel-doc 以获得详细信息。
           (c-offsets-alist . (
                   (arglist-close         . c-lineup-arglist-tabs-only)
                   (arglist-cont-nonempty .
-		      (c-lineup-gcc-asm-reg c-lineup-arglist-tabs-only))
+                      (c-lineup-gcc-asm-reg c-lineup-arglist-tabs-only))
                   (arglist-intro         . +)
                   (brace-list-intro      . +)
                   (c                     . c-lineup-C-comments)
@@ -573,9 +651,14 @@ Documentation/doc-guide/ 和 scripts/kernel-doc 以获得详细信息。
 ``indent`` 有很多选项，特别是重新格式化注释的时候，你可能需要看一下它的手册。
 不过记住： ``indent`` 不能修正坏的编程习惯。
 
+请注意，您还可以使用 ``clang-format`` 工具帮助您处理这些规则，快速自动重新格
+式化部分代码，并审阅整个文件以发现代码风格错误、打字错误和可能的改进。它还可
+以方便地排序 ``#include`` ，对齐变量/宏，重排文本和其他类似任务。
+详见 Documentation/process/clang-format.rst 。
+
 
 10) Kconfig 配置文件
-------------------------------
+--------------------
 
 对于遍布源码树的所有 Kconfig* 配置文件来说，它们缩进方式有所不同。紧挨着
 ``config`` 定义的行，用一个制表符缩进，然而 help 信息的缩进则额外增加 2 个空
@@ -598,11 +681,11 @@ Documentation/doc-guide/ 和 scripts/kernel-doc 以获得详细信息。
 	depends on ADFS_FS
 	...
 
-要查看配置文件的完整文档，请看 Documentation/kbuild/kconfig-language.rst。
+要查看配置文件的完整文档，请看 Documentation/kbuild/kconfig-language.rst 。
 
 
 11) 数据结构
-------------------------------
+------------
 
 如果一个数据结构，在创建和销毁它的单线执行环境之外可见，那么它必须要有一个引
 用计数器。内核里没有垃圾收集 (并且内核之外的垃圾收集慢且效率低下)，这意味着你
@@ -626,7 +709,7 @@ mm_count)，和文件系统 (``struct super_block``: s_count 和 s_active) 中
 
 
 12) 宏，枚举和RTL
-------------------------------
+-----------------
 
 用于定义常量的宏的名字及枚举里的标签需要大写。
 
@@ -638,7 +721,7 @@ mm_count)，和文件系统 (``struct super_block``: s_count 和 s_active) 中
 
 宏的名字请用大写字母，不过形如函数的宏的名字可以用小写字母。
 
-一般的，如果能写成内联函数就不要写成像函数的宏。
+通常如果能写成内联函数就不要写成像函数的宏。
 
 含有多个语句的宏应该被包含在一个 do-while 代码块里：
 
@@ -696,18 +779,18 @@ mm_count)，和文件系统 (``struct super_block``: s_count 和 s_active) 中
 		(ret);				\
 	})
 
-ret 是本地变量的通用名字 - __foo_ret 更不容易与一个已存在的变量冲突。
+ret 是本地变量的通用名字—— __foo_ret 更不容易与一个已存在的变量冲突。
 
 cpp 手册对宏的讲解很详细。gcc internals 手册也详细讲解了 RTL，内核里的汇编语
 言经常用到它。
 
 
 13) 打印内核消息
-------------------------------
+----------------
 
-内核开发者应该是受过良好教育的。请一定注意内核信息的拼写，以给人以好的印象。
+内核开发者应该看起来有文化。请一定注意内核信息的拼写，以给人良好的印象。
 不要用不规范的单词比如 ``dont``，而要用 ``do not`` 或者 ``don't`` 。保证这些信
-息简单明了,无歧义。
+息简单明了、无歧义。
 
 内核信息不必以英文句号结束。
 
@@ -724,17 +807,18 @@ dev_info() 等等。对于那些不和某个特定设备相关连的信息，<li
 或设定了 CONFIG_DYNAMIC_DEBUG。实际这同样是为了 dev_dbg()，一个相关约定是在一
 个已经开启了 DEBUG 时，使用 VERBOSE_DEBUG 来添加 dev_vdbg()。
 
-许多子系统拥有 Kconfig 调试选项来开启 -DDEBUG 在对应的 Makefile 里面；在其他
+许多子系统拥有 Kconfig 调试选项来开启对应 Makefile 里面的 -DDEBUG；在其他
 情况下，特殊文件使用 #define DEBUG。当一条调试信息需要被无条件打印时，例如，
 如果已经包含一个调试相关的 #ifdef 条件，printk(KERN_DEBUG ...) 就可被使用。
 
 
 14) 分配内存
-------------------------------
+------------
 
 内核提供了下面的一般用途的内存分配函数：
 kmalloc(), kzalloc(), kmalloc_array(), kcalloc(), vmalloc() 和 vzalloc()。
-请参考 API 文档以获取有关它们的详细信息。
+请参考 API 文档以获取有关它们的详细信息：
+Documentation/translations/zh_CN/core-api/memory-allocation.rst 。
 
 传递结构体大小的首选形式是这样的：
 
@@ -761,11 +845,13 @@ kmalloc(), kzalloc(), kmalloc_array(), kcalloc(), vmalloc() 和 vzalloc()。
 
 	p = kcalloc(n, sizeof(...), ...);
 
-两种形式检查分配大小 n * sizeof(...) 的溢出，如果溢出返回 NULL。
+两种形式都会检查分配 n * sizeof(...) 大小时内存的溢出，如果溢出返回 NULL。
 
+在没有 __GFP_NOWARN 的情况下使用时，这些通用分配函数都会在失败时发起堆栈转储，
+因此当返回NULL时，没有必要发出额外的失败消息。
 
 15) 内联弊病
-------------------------------
+------------
 
 有一个常见的误解是 ``内联`` 是 gcc 提供的可以让代码运行更快的一个选项。虽然使
 用内联函数有时候是恰当的 (比如作为一种替代宏的方式，请看第十二章)，不过很多情
@@ -786,7 +872,7 @@ inline gcc 也可以自动使其内联。而且其他用户可能会要求移除
 
 
 16) 函数返回值及命名
-------------------------------
+--------------------
 
 函数可以返回多种不同类型的值，最常见的一种是表明函数执行成功或者失败的值。这样
 的一个值可以表示为一个错误代码整数 (-Exxx＝失败，0＝成功) 或者一个 ``成功``
@@ -797,7 +883,7 @@ inline gcc 也可以自动使其内联。而且其他用户可能会要求移除
 产生这种 bug，请遵循下面的惯例::
 
 	如果函数的名字是一个动作或者强制性的命令，那么这个函数应该返回错误代
-	码整数。如果是一个判断，那么函数应该返回一个 "成功" 布尔值。
+	码整数。如果是一个判断，那么函数应该返回一个“成功”布尔值。
 
 比如， ``add work`` 是一个命令，所以 add_work() 在成功时返回 0，在失败时返回
 -EBUSY。类似的，因为 ``PCI device present`` 是一个判断，所以 pci_dev_present()
@@ -806,13 +892,35 @@ inline gcc 也可以自动使其内联。而且其他用户可能会要求移除
 所有 EXPORTed 函数都必须遵守这个惯例，所有的公共函数也都应该如此。私有
 (static) 函数不需要如此，但是我们也推荐这样做。
 
-返回值是实际计算结果而不是计算是否成功的标志的函数不受此惯例的限制。一般的，
+返回值是实际计算结果而不是计算是否成功的标志的函数不受此惯例的限制。通常
 他们通过返回一些正常值范围之外的结果来表示出错。典型的例子是返回指针的函数，
 他们使用 NULL 或者 ERR_PTR 机制来报告错误。
 
+17) 使用布尔类型
+----------------
+
+Linux内核布尔（bool）类型是C99 _Bool类型的别名。布尔值只能为0或1，而对布尔的
+隐式或显式转换将自动将值转换为true或false。在使用布尔类型时 **不需要** 构造，
+它会消除一类错误。
+
+使用布尔值时，应使用true和false定义，而不是1和0。
 
-17) 不要重新发明内核宏
-------------------------------
+布尔函数返回类型和堆栈变量总是可以在适当的时候使用。鼓励使用布尔来提高可读性，
+并且布尔值在存储时通常比“int”更好。
+
+如果缓存行布局或值的大小很重要，请不要使用布尔，因为其大小和对齐方式根据编译
+的体系结构而不同。针对对齐和大小进行优化的结构体不应使用布尔。
+
+如果一个结构体有多个true/false值，请考虑将它们合并为具有1比特成员的位域，或使
+用适当的固定宽度类型，如u8。
+
+类似地，对于函数参数，多个true/false值可以合并为单个按位的“标志”参数，如果调
+用点具有裸true/false常量，“标志”参数通常是更具可读性的替代方法。
+
+总之，在结构体和参数中有限地使用布尔可以提高可读性。
+
+18) 不要重新发明内核宏
+----------------------
 
 头文件 include/linux/kernel.h 包含了一些宏，你应该使用它们，而不要自己写一些
 它们的变种。比如，如果你需要计算一个数组的长度，使用这个宏
@@ -832,11 +940,11 @@ inline gcc 也可以自动使其内联。而且其他用户可能会要求移除
 在你的代码里自己重新定义。
 
 
-18) 编辑器模式行和其他需要罗嗦的事情
---------------------------------------------------
+19) 编辑器模式行和其他需要罗嗦的事情
+------------------------------------
 
 有一些编辑器可以解释嵌入在源文件里的由一些特殊标记标明的配置信息。比如，emacs
-能够解释被标记成这样的行：
+能够解析被标记成这样的行：
 
 .. code-block:: c
 
@@ -852,7 +960,7 @@ inline gcc 也可以自动使其内联。而且其他用户可能会要求移除
 	End:
 	*/
 
-Vim 能够解释这样的标记：
+Vim 能够解析这样的标记：
 
 .. code-block:: c
 
@@ -863,8 +971,8 @@ Vim 能够解释这样的标记：
 的模式，或者使用其他可以产生正确的缩进的巧妙方法。
 
 
-19) 内联汇编
-------------------------------
+20) 内联汇编
+------------
 
 在特定架构的代码中，你可能需要内联汇编与 CPU 和平台相关功能连接。需要这么做时
 就不要犹豫。然而，当 C 可以完成工作时，不要平白无故地使用内联汇编。在可能的情
@@ -880,8 +988,8 @@ Vim 能够解释这样的标记：
 移除了。你不必总是这样做，尽管，这不必要的举动会限制优化。
 
 在写一个包含多条指令的单个内联汇编语句时，把每条指令用引号分割而且各占一行，
-除了最后一条指令外，在每个指令结尾加上 \n\t，让汇编输出时可以正确地缩进下一条
-指令：
+除了最后一条指令外，在每个指令结尾加上 ``\n\t`` ，让汇编输出时可以正确地缩进
+下一条指令：
 
 .. code-block:: c
 
@@ -890,10 +998,10 @@ Vim 能够解释这样的标记：
 	     : /* outputs */ : /* inputs */ : /* clobbers */);
 
 
-20) 条件编译
-------------------------------
+21) 条件编译
+------------
 
-只要可能，就不要在 .c 文件里面使用预处理条件 (#if, #ifdef)；这样做让代码更难
+只要可能，就不要在 .c 文件里面使用预处理条件 (#if, #ifdef)；这样做会让代码更难
 阅读并且更难去跟踪逻辑。替代方案是，在头文件中用预处理条件提供给那些 .c 文件
 使用，再给 #else 提供一个空桩 (no-op stub) 版本，然后在 .c 文件内无条件地调用
 那些 (定义在头文件内的) 函数。这样做，编译器会避免为桩函数 (stub) 的调用生成
@@ -904,8 +1012,8 @@ Vim 能够解释这样的标记：
 条件到这个辅助函数内。
 
 如果你有一个在特定配置中，可能变成未使用的函数或变量，编译器会警告它定义了但
-未使用，把它标记为 __maybe_unused 而不是将它包含在一个预处理条件中。(然而，如
-果一个函数或变量总是未使用，就直接删除它。)
+未使用，请把它标记为 __maybe_unused 而不是将它包含在一个预处理条件中。(然而，
+如果一个函数或变量总是未使用，就直接删除它。)
 
 在代码中，尽可能地使用 IS_ENABLED 宏来转化某个 Kconfig 标记为 C 的布尔
 表达式，并在一般的 C 条件中使用它：
@@ -931,23 +1039,45 @@ Vim 能够解释这样的标记：
 	#endif /* CONFIG_SOMETHING */
 
 
-附录 I) 参考
--------------------
+附录 I) 参考资料
+----------------
 
-The C Programming Language, 第二版
+The C Programming Language, 2nd Edition
 作者：Brian W. Kernighan 和 Denni M. Ritchie.
 Prentice Hall, Inc., 1988.
-ISBN 0-13-110362-8 (软皮), 0-13-110370-9 (硬皮).
+ISBN 0-13-110362-8 (平装), 0-13-110370-9 (精装).
+
+.. note::
+
+    《C程序设计语言（第2版）》
+    作者：[美] Brian W. Kernighan / [美] Dennis M. Ritchie
+    译者：徐宝文 / 李志 / 尤晋元（审校）
+    出版社：机械工业出版社，2019
+    ISBN：9787111617945
 
 The Practice of Programming
 作者：Brian W. Kernighan 和 Rob Pike.
 Addison-Wesley, Inc., 1999.
 ISBN 0-201-61586-X.
 
+.. note::
+
+    《程序设计实践》
+    作者：[美] Brian W. Kernighan / [美] Rob Pike
+    出版社：机械工业出版社，2005
+    ISBN：9787111091578
+
+    《程序设计实践》
+    作者：[美] Brian W. Kernighan / Rob Pike
+    译者：裘宗燕
+    出版社：机械工业出版社，2000
+    ISBN：9787111075738
+
 GNU 手册 - 遵循 K&R 标准和此文本 - cpp, gcc, gcc internals and indent,
 都可以从 https://www.gnu.org/manual/ 找到
 
 WG14 是 C 语言的国际标准化工作组，URL: http://www.open-std.org/JTC1/SC22/WG14/
 
-Kernel process/coding-style.rst，作者 greg@kroah.com 发表于 OLS 2002：
+内核文档 Documentation/process/coding-style.rst，
+作者 greg@kroah.com 发表于 OLS 2002：
 http://www.kroah.com/linux/talks/ols_2002_kernel_codingstyle_talk/html/
-- 
cgit v1.2.3


From e2815b71cc48c988474dc2f2fc4725c5913f7155 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 29 Aug 2022 16:29:08 -0700
Subject: Documentation: spufs: correct a duplicate word typo

Fix a typo of "or" which should be "of".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jeremy Kerr <jk@ozlabs.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: Jeremy Kerr <jk@ozlabs.org>
Link: https://lore.kernel.org/r/20220829232908.32437-1-rdunlap@infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/filesystems/spufs/spufs.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/filesystems/spufs/spufs.rst b/Documentation/filesystems/spufs/spufs.rst
index 8a42859bb100..ca0441cbe37e 100644
--- a/Documentation/filesystems/spufs/spufs.rst
+++ b/Documentation/filesystems/spufs/spufs.rst
@@ -227,7 +227,7 @@ Files
               from the data buffer, updating the value of the specified signal
               notification register.  The signal  notification  register  will
               either be replaced with the input data or will be updated to the
-              bitwise OR or the old value and the input data, depending on the
+              bitwise OR of the old value and the input data, depending on the
               contents  of  the  signal1_type,  or  signal2_type respectively,
               file.
 
-- 
cgit v1.2.3


From 3832d1fd84b6220842d92513239005b2c116633b Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hpe.com>
Date: Sat, 27 Aug 2022 14:38:36 -0500
Subject: docs/core-api: expand Fedora instructions for GCC plugins

In Fedora 36, cross-compiling an allmodconfig configuration
for other architectures on x86 fails with this problem:

In file included from ../scripts/gcc-plugins/gcc-common.h:95,
                 from ../scripts/gcc-plugins/latent_entropy_plugin.c:78:
/usr/lib/gcc/aarch64-linux-gnu/12/plugin/include/builtins.h:23:10: fatal
error: mpc.h: No such file or directory
   23 | #include <mpc.h>
      |          ^~~~~~~
compilation terminated.

In that distro, that header file is available in the separate
libmpc-devel package.

Although future versions of Fedora might correctly mark
that dependency, mention this additional package.

To help detect such problems ahead of time, describe the
    gcc -print-file-name=plugin
command that is used by scripts/gcc-plugins/Kconfig to detect
plugins [1].

[1] https://lore.kernel.org/lkml/CAHk-=wjjiYjCp61gdAMpDOsUBU-A2hFFKJoVx5VAC7yV4K6WYg@xxxxxxxxxxxxxx/

Fixes: 43e96ef8b70c50f ("docs/core-api: Add Fedora instructions for GCC plugins");
Signed-off-by: Robert Elliott <elliott@hpe.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220827193836.2582079-1-elliott@hpe.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/kbuild/gcc-plugins.rst | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/Documentation/kbuild/gcc-plugins.rst b/Documentation/kbuild/gcc-plugins.rst
index 0ba76719f1b9..c578c6ba3eb6 100644
--- a/Documentation/kbuild/gcc-plugins.rst
+++ b/Documentation/kbuild/gcc-plugins.rst
@@ -90,7 +90,11 @@ e.g., on Ubuntu for gcc-10::
 
 Or on Fedora::
 
-	dnf install gcc-plugin-devel
+	dnf install gcc-plugin-devel libmpc-devel
+
+Or on Fedora when using cross-compilers that include plugins::
+
+	dnf install libmpc-devel
 
 Enable the GCC plugin infrastructure and some plugin(s) you want to use
 in the kernel config::
@@ -99,6 +103,19 @@ in the kernel config::
 	CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y
 	...
 
+Run gcc (native or cross-compiler) to ensure plugin headers are detected::
+
+	gcc -print-file-name=plugin
+	CROSS_COMPILE=arm-linux-gnu- ${CROSS_COMPILE}gcc -print-file-name=plugin
+
+The word "plugin" means they are not detected::
+
+	plugin
+
+A full path means they are detected::
+
+       /usr/lib/gcc/x86_64-redhat-linux/12/plugin
+
 To compile the minimum tool set including the plugin(s)::
 
 	make scripts
-- 
cgit v1.2.3


From 60e89a10eef2a3052dc35bea62f51acf26525698 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 28 Aug 2022 12:25:01 -0700
Subject: Documentation: fb: udlfb: clean up text and formatting

Clean up punctuation, spelling, and formatting for command line usage
and modprobe config file usage in udlfb.rst.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Bernie Thompson <bernie@plugable.com>
Cc: linux-fbdev@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: Helge Deller <deller@gmx.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-doc@vger.kernel.org
Cc: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://lore.kernel.org/r/20220828192501.14232-1-rdunlap@infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/fb/udlfb.rst | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/Documentation/fb/udlfb.rst b/Documentation/fb/udlfb.rst
index 732b37db3504..99cfbb7a1922 100644
--- a/Documentation/fb/udlfb.rst
+++ b/Documentation/fb/udlfb.rst
@@ -86,17 +86,24 @@ Module Options
 Special configuration for udlfb is usually unnecessary. There are a few
 options, however.
 
-From the command line, pass options to modprobe
-modprobe udlfb fb_defio=0 console=1 shadow=1
+From the command line, pass options to modprobe::
 
-Or modify options on the fly at /sys/module/udlfb/parameters directory via
-sudo nano fb_defio
-change the parameter in place, and save the file.
+  modprobe udlfb fb_defio=0 console=1 shadow=1
 
-Unplug/replug USB device to apply with new settings
+Or change options on the fly by editing
+/sys/module/udlfb/parameters/PARAMETER_NAME ::
 
-Or for permanent option, create file like /etc/modprobe.d/udlfb.conf with text
-options udlfb fb_defio=0 console=1 shadow=1
+  cd /sys/module/udlfb/parameters
+  ls # to see a list of parameter names
+  sudo nano PARAMETER_NAME
+  # change the parameter in place, and save the file.
+
+Unplug/replug USB device to apply with new settings.
+
+Or to apply options permanently, create a modprobe configuration file
+like /etc/modprobe.d/udlfb.conf with text::
+
+  options udlfb fb_defio=0 console=1 shadow=1
 
 Accepted boolean options:
 
-- 
cgit v1.2.3


From b270228753f241aa70ad37134f284566be86ec73 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 27 Aug 2022 17:28:45 -0700
Subject: Documentation: W1: minor typo corrections

Correct one typo/spello and remove one duplicated word in the
W1 documentation.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: linux-doc@vger.kernel.org
Cc: Evgeniy Polyakov <zbr@ioremap.net>
Link: https://lore.kernel.org/r/20220828002845.7022-1-rdunlap@infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/w1/masters/ds2490.rst | 2 +-
 Documentation/w1/w1-generic.rst     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/w1/masters/ds2490.rst b/Documentation/w1/masters/ds2490.rst
index 7e5b50f9c0f5..842e7ae80424 100644
--- a/Documentation/w1/masters/ds2490.rst
+++ b/Documentation/w1/masters/ds2490.rst
@@ -52,7 +52,7 @@ Notes and limitations.
   clear the entire bulk in buffer.  It would be possible to read the
   maximum buffer size to not run into this error condition, only extra
   bytes in the buffer is a logic error in the driver.  The code should
-  should match reads and writes as well as data sizes.  Reads and
+  match reads and writes as well as data sizes.  Reads and
   writes are serialized and the status verifies that the chip is idle
   (and data is available) before the read is executed, so it should
   not happen.
diff --git a/Documentation/w1/w1-generic.rst b/Documentation/w1/w1-generic.rst
index da4e8b4e9b01..99255b6d0e53 100644
--- a/Documentation/w1/w1-generic.rst
+++ b/Documentation/w1/w1-generic.rst
@@ -113,7 +113,7 @@ generally only make sense when searching is disabled, as a search will
 redetect manually removed devices that are present and timeout manually
 added devices that aren't on the bus.
 
-Bus searches occur at an interval, specified as a summ of timeout and
+Bus searches occur at an interval, specified as a sum of timeout and
 timeout_us module parameters (either of which may be 0) for as long as
 w1_master_search remains greater than 0 or is -1.  Each search attempt
 decrements w1_master_search by 1 (down to 0) and increments
-- 
cgit v1.2.3


From c68c0db4b25ff7336471ca32be7f9af5110528bc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 27 Aug 2022 13:32:17 -0700
Subject: usb: chipidea: clarify Documentation/ABI text

Fix grammar and improve readability of chipidea-usb2 text.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Peter Chen <peter.chen@kernel.org>
Cc: linux-usb@vger.kernel.org
Acked-by: Peter Chen <peter.chen@kernel.org>
Link: https://lore.kernel.org/r/20220827203217.7837-1-rdunlap@infradead.org
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/ABI/testing/sysfs-platform-chipidea-usb2 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-platform-chipidea-usb2 b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
index b0f4684a83fe..b9f7d924f28a 100644
--- a/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
+++ b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2
@@ -2,8 +2,8 @@ What:		/sys/bus/platform/devices/ci_hdrc.0/role
 Date:		Mar 2017
 Contact:	Peter Chen <peter.chen@nxp.com>
 Description:
-		It returns string "gadget" or "host" when read it, it indicates
-		current controller role.
+		When read, it returns string "gadget" or "host", indicating
+		the current controller role.
 
-		It will do role switch when write "gadget" or "host" to it.
+		It will do role switch when "gadget" or "host" is written to it.
 		Only controller at dual-role configuration supports writing.
-- 
cgit v1.2.3


From a356c06699e2e9bde64b17fc220de3c01f0c7d20 Mon Sep 17 00:00:00 2001
From: Tuo Cao <91tuocao@gmail.com>
Date: Thu, 15 Sep 2022 23:01:55 +0800
Subject: docs: x86: replace do_IRQ int the entry_64.rst with
 common_interrupt()

do_IRQ has been replaced by common_interrupt in commit
fa5e5c409213 ("x86/entry: Use idtentry for interrupts").

Signed-off-by: Tuo Cao <91tuocao@gmail.com>
Link: https://lore.kernel.org/r/20220915150155.9908-1-91tuocao@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/x86/entry_64.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst
index e433e08f7018..0afdce3c06f4 100644
--- a/Documentation/x86/entry_64.rst
+++ b/Documentation/x86/entry_64.rst
@@ -33,8 +33,8 @@ Some of these entries are:
  - interrupt: An array of entries.  Every IDT vector that doesn't
    explicitly point somewhere else gets set to the corresponding
    value in interrupts.  These point to a whole array of
-   magically-generated functions that make their way to do_IRQ with
-   the interrupt number as a parameter.
+   magically-generated functions that make their way to common_interrupt()
+   with the interrupt number as a parameter.
 
  - APIC interrupts: Various special-purpose interrupts for things
    like TLB shootdown.
-- 
cgit v1.2.3


From 7675ea68d60554e119d7439472af056d07a521ba Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Fri, 19 Aug 2022 14:28:37 +0800
Subject: docs/zh_CN: core-api: Add idr Chinese translation

Translate core-api/idr.rst into Chinese.

Last English version used:

commit 85656ec193e9 ("IDR: Note that the IDR API is deprecated").

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si<siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/9f578ea087df7ef8665fc08541d208e7429176ec.1660881950.git.zhoubinbin@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/translations/zh_CN/core-api/idr.rst  | 80 ++++++++++++++++++++++
 .../translations/zh_CN/core-api/index.rst          |  2 +-
 2 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/core-api/idr.rst

diff --git a/Documentation/translations/zh_CN/core-api/idr.rst b/Documentation/translations/zh_CN/core-api/idr.rst
new file mode 100644
index 000000000000..97a16e76b81b
--- /dev/null
+++ b/Documentation/translations/zh_CN/core-api/idr.rst
@@ -0,0 +1,80 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/core-api/idr.rst
+
+:翻译:
+
+ 周彬彬 Binbin Zhou <zhoubinbin@loongson.cn>
+
+:校译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+ 吴想成 Wu Xiangcheng <bobwxc@email.cn>
+ 时奎亮 Alex Shi <alexs@kernel.org>
+
+======
+ID分配
+======
+
+:作者: Matthew Wilcox
+
+概述
+====
+
+要解决的一个常见问题是分配标识符（IDs）；它通常是标识事物的数字。比如包括文件描述
+符、进程ID、网络协议中的数据包标识符、SCSI标记和设备实例编号。IDR和IDA为这个问题
+提供了一个合理的解决方案，以避免每个人都自创。IDR提供将ID映射到指针的能力，而IDA
+仅提供ID分配，因此内存效率更高。
+
+IDR接口已经被废弃，请使用 ``XArray`` 。
+
+IDR的用法
+=========
+
+首先初始化一个IDR，对于静态分配的IDR使用DEFINE_IDR()，或者对于动态分配的IDR使用
+idr_init()。
+
+您可以调用idr_alloc()来分配一个未使用的ID。通过调用idr_find()查询与该ID相关的指针，
+并通过调用idr_remove()释放该ID。
+
+如果需要更改与一个ID相关联的指针，可以调用idr_replace()。这样做的一个常见原因是通
+过将 ``NULL`` 指针传递给分配函数来保留ID；用保留的ID初始化对象，最后将初始化的对
+象插入IDR。
+
+一些用户需要分配大于 ``INT_MAX`` 的ID。到目前为止，所有这些用户都满足 ``UINT_MAX``
+的限制，他们使用idr_alloc_u32()。如果您需要超出u32的ID，我们将与您合作以满足您的
+需求。
+
+如果需要按顺序分配ID，可以使用idr_alloc_cyclic()。处理较大数量的ID时，IDR的效率会
+降低，所以使用这个函数会有一点代价。
+
+要对IDR使用的所有指针进行操作，您可以使用基于回调的idr_for_each()或迭代器样式的
+idr_for_each_entry()。您可能需要使用idr_for_each_entry_continue()来继续迭代。如果
+迭代器不符合您的需求，您也可以使用idr_get_next()。
+
+当使用完IDR后，您可以调用idr_destroy()来释放IDR占用的内存。这并不会释放IDR指向的
+对象；如果您想这样做，请使用其中一个迭代器来执行此操作。
+
+您可以使用idr_is_empty()来查看当前是否分配了任何ID。
+
+如果在从IDR分配一个新ID时需要带锁，您可能需要传递一组限制性的GFP标志，但这可能导
+致IDR无法分配内存。为了解决该问题，您可以在获取锁之前调用idr_preload()，然后在分
+配之后调用idr_preload_end()。
+
+IDR同步的相关内容请见include/linux/idr.h文件中的“DOC: idr sync”。
+
+IDA的用法
+=========
+
+IDA的用法的相关内容请见lib/idr.c文件中的“DOC: IDA description”。
+
+函数和数据结构
+==============
+
+该API在以下内核代码中:
+
+include/linux/idr.h
+
+lib/idr.c
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 8a94ad87465d..c5ba0a48147c 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -44,12 +44,12 @@
    assoc_array
    xarray
    rbtree
+   idr
 
 Todolist:
 
 
-   idr
    circular-buffers
    generic-radix-tree
    packing
-- 
cgit v1.2.3


From 2e6506c1a5a51eabaf1cc88a0fe6783509c905e8 Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Fri, 19 Aug 2022 14:28:38 +0800
Subject: docs/zh_CN: core-api: Add circular-buffers Chinese translation

Translate core-api/circular-buffers.rst into Chinese.

Last English version used:

commit 714b6904e23e ("doc: Remove ".vnet" from paulmck email addresses").

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si<siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/6b94f233dd4b4a9e6da6fa2f86a9b1d32f104004.1660881950.git.zhoubinbin@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../zh_CN/core-api/circular-buffers.rst            | 210 +++++++++++++++++++++
 .../translations/zh_CN/core-api/index.rst          |   2 +-
 2 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/core-api/circular-buffers.rst

diff --git a/Documentation/translations/zh_CN/core-api/circular-buffers.rst b/Documentation/translations/zh_CN/core-api/circular-buffers.rst
new file mode 100644
index 000000000000..694ad8e61070
--- /dev/null
+++ b/Documentation/translations/zh_CN/core-api/circular-buffers.rst
@@ -0,0 +1,210 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/core-api/circular-buffers.rst
+
+:翻译:
+
+ 周彬彬 Binbin Zhou <zhoubinbin@loongson.cn>
+
+:校译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+ 吴想成 Wu Xiangcheng <bobwxc@email.cn>
+ 时奎亮 Alex Shi <alexs@kernel.org>
+
+==========
+环形缓冲区
+==========
+
+:作者: David Howells <dhowells@redhat.com>
+:作者: Paul E. McKenney <paulmck@linux.ibm.com>
+
+
+Linux 提供了许多可用于实现循环缓冲的特性。有两组这样的特性：
+
+ (1) 用于确定2次方大小的缓冲区信息的便利函数。
+
+ (2) 可以代替缓冲区中对象的生产者和消费者共享锁的内存屏障。
+
+如下所述，要使用这些设施，只需要一个生产者和一个消费者。可以通过序列化来处理多个
+生产者，并通过序列化来处理多个消费者。
+
+.. Contents:
+
+ (*) 什么是环形缓冲区？
+
+ (*) 测量2次幂缓冲区
+
+ (*) 内存屏障与环形缓冲区的结合使用
+     - 生产者
+     - 消费者
+
+ (*) 延伸阅读
+
+
+
+什么是环形缓冲区？
+==================
+
+首先，什么是环形缓冲区？环形缓冲区是具有固定的有限大小的缓冲区，它有两个索引：
+
+ (1) 'head'索引 - 生产者将元素插入缓冲区的位置。
+
+ (2) 'tail'索引 - 消费者在缓冲区中找到下一个元素的位置。
+
+通常，当tail指针等于head指针时，表明缓冲区是空的；而当head指针比tail指针少一个时，
+表明缓冲区是满的。
+
+添加元素时，递增head索引；删除元素时，递增tail索引。tail索引不应该跳过head索引，
+两个索引在到达缓冲区末端时都应该被赋值为0，从而允许海量的数据流过缓冲区。
+
+通常情况下，元素都有相同的单元大小，但这并不是使用以下技术的严格要求。如果要在缓
+冲区中包含多个元素或可变大小的元素，则索引可以增加超过1，前提是两个索引都没有超过
+另一个。然而，实现者必须小心，因为超过一个单位大小的区域可能会覆盖缓冲区的末端并
+且缓冲区会被分成两段。
+
+测量2次幂缓冲区
+===============
+
+计算任意大小的环形缓冲区的占用或剩余容量通常是一个费时的操作，需要使用模（除法）
+指令。但是如果缓冲区的大小为2次幂，则可以使用更快的按位与指令代替。
+
+Linux提供了一组用于处理2次幂环形缓冲区的宏。可以通过以下方式使用::
+
+	#include <linux/circ_buf.h>
+
+这些宏包括：
+
+ (#) 测量缓冲区的剩余容量::
+
+	CIRC_SPACE(head_index, tail_index, buffer_size);
+
+     返回缓冲区[1]中可插入元素的剩余空间大小。
+
+
+ (#) 测量缓冲区中的最大连续立即可用空间::
+
+	CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
+
+     返回缓冲区[1]中剩余的连续空间的大小，元素可以立即插入其中，而不必绕回到缓冲
+     区的开头。
+
+
+ (#) 测量缓冲区的使用数::
+
+	CIRC_CNT(head_index, tail_index, buffer_size);
+
+     返回当前占用缓冲区[2]的元素数量。
+
+
+ (#) 测量缓冲区的连续使用数::
+
+	CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
+
+     返回可以从缓冲区中提取的连续元素[2]的数量，而不必绕回到缓冲区的开头。
+
+这里的每一个宏名义上都会返回一个介于0和buffer_size-1之间的值，但是：
+
+ (1) CIRC_SPACE*()是为了在生产者中使用。对生产者来说，它们将返回一个下限，因为生
+     产者控制着head索引，但消费者可能仍然在另一个CPU上耗尽缓冲区并移动tail索引。
+
+     对消费者来说，它将显示一个上限，因为生产者可能正忙于耗尽空间。
+
+ (2) CIRC_CNT*()是为了在消费者中使用。对消费者来说，它们将返回一个下限，因为消费
+     者控制着tail索引，但生产者可能仍然在另一个CPU上填充缓冲区并移动head索引。
+
+     对于生产者，它将显示一个上限，因为消费者可能正忙于清空缓冲区。
+
+ (3) 对于第三方来说，生产者和消费者对索引的写入顺序是无法保证的，因为它们是独立的，
+     而且可能是在不同的CPU上进行的，所以在这种情况下的结果只是一种猜测，甚至可能
+     是错误的。
+
+内存屏障与环形缓冲区的结合使用
+==============================
+
+通过将内存屏障与环形缓冲区结合使用，可以避免以下需求：
+
+ (1) 使用单个锁来控制对缓冲区两端的访问，从而允许同时填充和清空缓冲区；以及
+
+ (2) 使用原子计数器操作。
+
+这有两个方面：填充缓冲区的生产者和清空缓冲区的消费者。在任何时候，只应有一个生产
+者在填充缓冲区，同样的也只应有一个消费者在清空缓冲区，但双方可以同时操作。
+
+
+生产者
+------
+
+生产者看起来像这样::
+
+	spin_lock(&producer_lock);
+
+	unsigned long head = buffer->head;
+	/* spin_unlock()和下一个spin_lock()提供必要的排序。 */
+	unsigned long tail = READ_ONCE(buffer->tail);
+
+	if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
+		/* 添加一个元素到缓冲区 */
+		struct item *item = buffer[head];
+
+		produce_item(item);
+
+		smp_store_release(buffer->head,
+				  (head + 1) & (buffer->size - 1));
+
+		/* wake_up()将确保在唤醒任何人之前提交head */
+		wake_up(consumer);
+	}
+
+	spin_unlock(&producer_lock);
+
+这将表明CPU必须在head索引使其对消费者可用之前写入新项目的内容，同时CPU必须在唤醒
+消费者之前写入修改后的head索引。
+
+请注意，wake_up()并不保证任何形式的屏障，除非确实唤醒了某些东西。因此我们不能依靠
+它来进行排序。但是数组中始终有一个元素留空，因此生产者必须产生两个元素，然后才可
+能破坏消费者当前正在读取的元素。同时，消费者连续调用之间成对的解锁-加锁提供了索引
+读取（指示消费者已清空给定元素）和生产者对该相同元素的写入之间的必要顺序。
+
+
+消费者
+------
+
+消费者看起来像这样::
+
+	spin_lock(&consumer_lock);
+
+	/* 读取该索引处的内容之前，先读取索引 */
+	unsigned long head = smp_load_acquire(buffer->head);
+	unsigned long tail = buffer->tail;
+
+	if (CIRC_CNT(head, tail, buffer->size) >= 1) {
+
+		/* 从缓冲区中提取一个元素 */
+		struct item *item = buffer[tail];
+
+		consume_item(item);
+
+		/* 在递增tail之前完成对描述符的读取。 */
+		smp_store_release(buffer->tail,
+				  (tail + 1) & (buffer->size - 1));
+	}
+
+	spin_unlock(&consumer_lock);
+
+这表明CPU在读取新元素之前确保索引是最新的，然后在写入新的尾指针之前应确保CPU已完
+成读取该元素，这将擦除该元素。
+
+请注意，使用READ_ONCE()和smp_load_acquire()来读取反向（head）索引。这可以防止编译
+器丢弃并重新加载其缓存值。如果您能确定反向（head）索引将仅使用一次，则这不是必须
+的。smp_load_acquire()还可以强制CPU对后续的内存引用进行排序。类似地，两种算法都使
+用smp_store_release()来写入线程的索引。这记录了我们正在写入可以并发读取的内容的事
+实，以防止编译器破坏存储，并强制对以前的访问进行排序。
+
+
+延伸阅读
+========
+
+关于Linux的内存屏障设施的描述，请查看Documentation/memory-barriers.txt。
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index c5ba0a48147c..4ae39a83aa61 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -45,12 +45,12 @@
    xarray
    rbtree
    idr
+   circular-buffers
 
 Todolist:
 
 
-   circular-buffers
    generic-radix-tree
    packing
    this_cpu_ops
-- 
cgit v1.2.3


From eda91f4090e97dd75d5f4aec82021c097d4b3114 Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Fri, 19 Aug 2022 14:28:39 +0800
Subject: docs/zh_CN: core-api: Add generic-radix-tree Chinese translation

Translate core-api/generic-radix-tree.rst into Chinese.

Last English version used:

commit ba20ba2e3743 ("generic radix trees").

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si<siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/aad94e2a053ae021eb4d63240690b05c2f3e8dec.1660881950.git.zhoubinbin@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../zh_CN/core-api/generic-radix-tree.rst          | 23 ++++++++++++++++++++++
 .../translations/zh_CN/core-api/index.rst          |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/core-api/generic-radix-tree.rst

diff --git a/Documentation/translations/zh_CN/core-api/generic-radix-tree.rst b/Documentation/translations/zh_CN/core-api/generic-radix-tree.rst
new file mode 100644
index 000000000000..eacd1d2ebddc
--- /dev/null
+++ b/Documentation/translations/zh_CN/core-api/generic-radix-tree.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/core-api/generic-radix-tree.rst
+
+:翻译:
+
+ 周彬彬 Binbin Zhou <zhoubinbin@loongson.cn>
+
+===================
+通用基数树/稀疏数组
+===================
+
+通用基数树/稀疏数组的相关内容请见include/linux/generic-radix-tree.h文件中的
+“DOC: Generic radix trees/sparse arrays”。
+
+通用基数树函数
+--------------
+
+该API在以下内核代码中:
+
+include/linux/generic-radix-tree.h
diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index 4ae39a83aa61..c62658941aee 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -46,12 +46,12 @@
    rbtree
    idr
    circular-buffers
+   generic-radix-tree
 
 Todolist:
 
 
-   generic-radix-tree
    packing
    this_cpu_ops
    timekeeping
-- 
cgit v1.2.3


From 6d75bb92f99260af29986657cea1426f4040b280 Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Fri, 19 Aug 2022 14:28:40 +0800
Subject: docs/zh_CN: core-api: Add packing Chinese translation

Translate core-api/packing.rst into Chinese.

Last English version used:

commit 1ec779b9fabc ("docs: packing: move it to core-api book
and adjust markups").

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Reviewed-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si<siyanteng@loongson.cn>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/96b19575ca7e9e23941e8a5ef92120f1bffbc518.1660881950.git.zhoubinbin@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/core-api/index.rst          |   2 +-
 .../translations/zh_CN/core-api/packing.rst        | 160 +++++++++++++++++++++
 2 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/core-api/packing.rst

diff --git a/Documentation/translations/zh_CN/core-api/index.rst b/Documentation/translations/zh_CN/core-api/index.rst
index c62658941aee..37756d240b5e 100644
--- a/Documentation/translations/zh_CN/core-api/index.rst
+++ b/Documentation/translations/zh_CN/core-api/index.rst
@@ -47,12 +47,12 @@
    idr
    circular-buffers
    generic-radix-tree
+   packing
 
 Todolist:
 
 
-   packing
    this_cpu_ops
    timekeeping
    errseq
diff --git a/Documentation/translations/zh_CN/core-api/packing.rst b/Documentation/translations/zh_CN/core-api/packing.rst
new file mode 100644
index 000000000000..c0aab3a349d0
--- /dev/null
+++ b/Documentation/translations/zh_CN/core-api/packing.rst
@@ -0,0 +1,160 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/core-api/packing.rst
+
+:翻译:
+
+ 周彬彬 Binbin Zhou <zhoubinbin@loongson.cn>
+
+:校译:
+
+ 司延腾 Yanteng Si <siyanteng@loongson.cn>
+ 吴想成 Wu Xiangcheng <bobwxc@email.cn>
+ 时奎亮 Alex Shi <alexs@kernel.org>
+
+========================
+通用的位域打包和解包函数
+========================
+
+问题陈述
+--------
+
+使用硬件时，必须在几种与其交互的方法之间进行选择。
+
+可以将指针映射到在硬件设备的内存区上精心设计的结构体，并将其字段作为结构成员（可
+能声明为位域）访问。但是由于CPU和硬件设备之间潜在的字节顺序不匹配，以这种方式编写
+代码会降低其可移植性。
+
+此外，必须密切注意将硬件文档中的寄存器定义转换为结构的位域索引。此外，一些硬件
+（通常是网络设备）倾向于以违反任何合理字边界（有时甚至是64位）的方式对其寄存器字
+段进行分组。这就造成了不得不在结构中定义寄存器字段的“高”和“低”部分的不便。
+
+结构域定义的更可靠的替代方法是通过移动适当数量的位来提取所需的字段。但这仍然不能
+防止字节顺序不匹配，除非所有内存访问都是逐字节执行的。此外，代码很容易变得杂乱无
+章，同时可能会在所需的许多位移操作中丢失一些高层次的想法。
+
+许多驱动程序采用了位移的方法，然后试图用定制的宏来减少杂乱无章的东西，但更多的时
+候，这些宏所采用的捷径依旧妨碍了代码真正的可移植性。
+
+解决方案
+--------
+
+该API涉及2个基本操作：
+
+  - 将一个CPU可使用的数字打包到内存缓冲区中（具有硬件约束/特殊性）。
+  - 将内存缓冲区（具有硬件约束/特殊性）解压缩为一个CPU可使用的数字。
+
+该API提供了对所述硬件约束和特殊性以及CPU字节序的抽象，因此这两者之间可能不匹配。
+
+这些API函数的基本单元是u64。从CPU的角度来看，位63总是意味着字节7的位偏移量7，尽管
+只是逻辑上的。问题是：我们将这个比特放在内存的什么位置？
+
+以下示例介绍了打包u64字段的内存布局。打包缓冲区中的字节偏移量始终默认为0，1...7。
+示例显示的是逻辑字节和位所在的位置。
+
+1. 通常情况下（无特殊性），我们会这样做：
+
+::
+
+  63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+  7                       6                       5                        4
+  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
+  3                       2                       1                        0
+
+也就是说，CPU可使用的u64的MSByte(7)位于内存偏移量0处，而u64的LSByte(0)位于内存偏移量7处。
+
+这对应于大多数人认为的“大端”，其中位i对应于数字2^i。这在代码注释中也称为“逻辑”符号。
+
+
+2. 如果设置了QUIRK_MSB_ON_THE_RIGHT，我们按如下方式操作：
+
+::
+
+  56 57 58 59 60 61 62 63 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 32 33 34 35 36 37 38 39
+  7                       6                        5                       4
+  24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23  8  9 10 11 12 13 14 15  0  1  2  3  4  5  6  7
+  3                       2                        1                       0
+
+也就是说，QUIRK_MSB_ON_THE_RIGHT不会影响字节定位，但会反转字节内的位偏移量。
+
+
+3. 如果设置了QUIRK_LITTLE_ENDIAN，我们按如下方式操作：
+
+::
+
+  39 38 37 36 35 34 33 32 47 46 45 44 43 42 41 40 55 54 53 52 51 50 49 48 63 62 61 60 59 58 57 56
+  4                       5                       6                       7
+  7  6  5  4  3  2  1  0  15 14 13 12 11 10  9  8 23 22 21 20 19 18 17 16 31 30 29 28 27 26 25 24
+  0                       1                       2                       3
+
+因此，QUIRK_LITTLE_ENDIAN意味着在内存区域内，每个4字节的字的每个字节都被放置在与
+该字的边界相比的镜像位置。
+
+
+4. 如果设置了QUIRK_MSB_ON_THE_RIGHT和QUIRK_LITTLE_ENDIAN，我们这样做：
+
+::
+
+  32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
+  4                       5                       6                       7
+  0  1  2  3  4  5  6  7  8   9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+  0                       1                       2                       3
+
+
+5. 如果只设置了QUIRK_LSW32_IS_FIRST，我们这样做：
+
+::
+
+  31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0
+  3                       2                       1                        0
+  63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48 47 46 45 44 43 42 41 40 39 38 37 36 35 34 33 32
+  7                       6                       5                        4
+
+在这种情况下，8字节内存区域解释如下：前4字节对应最不重要的4字节的字，后4字节对应
+更重要的4字节的字。
+
+6. 如果设置了QUIRK_LSW32_IS_FIRST和QUIRK_MSB_ON_THE_RIGHT，我们这样做：
+
+::
+
+  24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23  8  9 10 11 12 13 14 15  0  1  2  3  4  5  6  7
+  3                       2                        1                       0
+  56 57 58 59 60 61 62 63 48 49 50 51 52 53 54 55 40 41 42 43 44 45 46 47 32 33 34 35 36 37 38 39
+  7                       6                        5                       4
+
+
+7. 如果设置了QUIRK_LSW32_IS_FIRST和QUIRK_LITTLE_ENDIAN，则如下所示：
+
+::
+
+  7  6  5  4  3  2  1  0  15 14 13 12 11 10  9  8 23 22 21 20 19 18 17 16 31 30 29 28 27 26 25 24
+  0                       1                       2                       3
+  39 38 37 36 35 34 33 32 47 46 45 44 43 42 41 40 55 54 53 52 51 50 49 48 63 62 61 60 59 58 57 56
+  4                       5                       6                       7
+
+
+8. 如果设置了QUIRK_LSW32_IS_FIRST，QUIRK_LITTLE_ENDIAN和QUIRK_MSB_ON_THE_RIGHT，
+   则如下所示：
+
+::
+
+  0  1  2  3  4  5  6  7  8   9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+  0                       1                       2                       3
+  32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
+  4                       5                       6                       7
+
+
+我们总是认为我们的偏移量好像没有特殊性，然后在访问内存区域之前翻译它们。
+
+预期用途
+--------
+
+选择使用该API的驱动程序首先需要确定上述3种quirk组合（共8种）中的哪一种与硬件文档
+中描述的相匹配。然后，他们应该封装packing()函数，创建一个新的xxx_packing()，使用
+适当的QUIRK_* one-hot 位集合来调用它。
+
+packing()函数返回一个int类型的错误码，以防止程序员使用不正确的API。这些错误预计不
+会在运行时发生，因此xxx_packing()返回void并简单地接受这些错误是合理的。它可以选择
+转储栈或打印错误描述。
-- 
cgit v1.2.3


From 8f0ec4094d01dbc40dcbad93a7ec85675cf035cf Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Thu, 22 Sep 2022 21:03:39 +0800
Subject: docs/zh_CN: Update zh_CN/admin-guide/README.rst to 6.0-rc2

* update to commit ea052e7257bd ("docs: admin-guide: for kernel bugs
  refer to other kernel documentation")

We are in 6.x now ;)

Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si <siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/7f6e0b8961f79befa62e0070f9682ab3abde8622.1663850554.git.bobwxc@email.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/admin-guide/README.rst      | 101 +++++----------------
 1 file changed, 22 insertions(+), 79 deletions(-)

diff --git a/Documentation/translations/zh_CN/admin-guide/README.rst b/Documentation/translations/zh_CN/admin-guide/README.rst
index d20949e8bf6f..e679cbc3c89d 100644
--- a/Documentation/translations/zh_CN/admin-guide/README.rst
+++ b/Documentation/translations/zh_CN/admin-guide/README.rst
@@ -6,10 +6,10 @@
 
  吴想成 Wu XiangCheng <bobwxc@email.cn>
 
-Linux内核5.x版本 <http://kernel.org/>
+Linux内核6.x版本 <http://kernel.org/>
 =========================================
 
-以下是Linux版本5的发行注记。仔细阅读它们，
+以下是Linux版本6的发行注记。仔细阅读它们，
 它们会告诉你这些都是什么，解释如何安装内核，以及遇到问题时该如何做。
 
 什么是Linux？
@@ -61,27 +61,27 @@ Linux内核5.x版本 <http://kernel.org/>
  - 如果您要安装完整的源代码，请把内核tar档案包放在您有权限的目录中（例如您
    的主目录）并将其解包::
 
-     xz -cd linux-5.x.tar.xz | tar xvf -
+     xz -cd linux-6.x.tar.xz | tar xvf -
 
    将“X”替换成最新内核的版本号。
 
    【不要】使用 /usr/src/linux 目录！这里有一组库头文件使用的内核头文件
    （通常是不完整的）。它们应该与库匹配，而不是被内核的变化搞得一团糟。
 
- - 您还可以通过打补丁在5.x版本之间升级。补丁以xz格式分发。要通过打补丁进行
-   安装，请获取所有较新的补丁文件，进入内核源代码（linux-5.x）的目录并
+ - 您还可以通过打补丁在6.x版本之间升级。补丁以xz格式分发。要通过打补丁进行
+   安装，请获取所有较新的补丁文件，进入内核源代码（linux-6.x）的目录并
    执行::
 
-     xz -cd ../patch-5.x.xz | patch -p1
+     xz -cd ../patch-6.x.xz | patch -p1
 
    请【按顺序】替换所有大于当前源代码树版本的“x”，这样就可以了。您可能想要
    删除备份文件（文件名类似xxx~ 或 xxx.orig)，并确保没有失败的补丁（文件名
    类似xxx# 或 xxx.rej）。如果有，不是你就是我犯了错误。
 
-   与5.x内核的补丁不同，5.x.y内核（也称为稳定版内核）的补丁不是增量的，而是
-   直接应用于基本的5.x内核。例如，如果您的基本内核是5.0，并且希望应用5.0.3
-   补丁，则不应先应用5.0.1和5.0.2的补丁。类似地，如果您运行的是5.0.2内核，
-   并且希望跳转到5.0.3，那么在应用5.0.3补丁之前，必须首先撤销5.0.2补丁
+   与6.x内核的补丁不同，6.x.y内核（也称为稳定版内核）的补丁不是增量的，而是
+   直接应用于基本的6.x内核。例如，如果您的基本内核是6.0，并且希望应用6.0.3
+   补丁，则不应先应用6.0.1和6.0.2的补丁。类似地，如果您运行的是6.0.2内核，
+   并且希望跳转到6.0.3，那么在应用6.0.3补丁之前，必须首先撤销6.0.2补丁
    （即patch -R）。更多关于这方面的内容，请阅读
    :ref:`Documentation/process/applying-patches.rst <applying_patches>` 。
 
@@ -103,7 +103,7 @@ Linux内核5.x版本 <http://kernel.org/>
 软件要求
 ---------
 
-   编译和运行5.x内核需要各种软件包的最新版本。请参考
+   编译和运行6.x内核需要各种软件包的最新版本。请参考
    :ref:`Documentation/process/changes.rst <changes>`
    来了解最低版本要求以及如何升级软件包。请注意，使用过旧版本的这些包可能会
    导致很难追踪的间接错误，因此不要以为在生成或操作过程中出现明显问题时可以
@@ -116,12 +116,12 @@ Linux内核5.x版本 <http://kernel.org/>
    ``make O=output/dir`` 选项可以为输出文件（包括 .config）指定备用位置。
    例如::
 
-     kernel source code: /usr/src/linux-5.x
+     kernel source code: /usr/src/linux-6.x
      build directory:    /home/name/build/kernel
 
    要配置和构建内核，请使用::
 
-     cd /usr/src/linux-5.x
+     cd /usr/src/linux-6.x
      make O=/home/name/build/kernel menuconfig
      make O=/home/name/build/kernel
      sudo make O=/home/name/build/kernel modules_install install
@@ -227,8 +227,6 @@ Linux内核5.x版本 <http://kernel.org/>
  - 确保您至少有gcc 5.1可用。
    有关更多信息，请参阅 :ref:`Documentation/process/changes.rst <changes>` 。
 
-   请注意，您仍然可以使用此内核运行a.out用户程序。
-
  - 执行 ``make`` 来创建压缩内核映像。如果您安装了lilo以适配内核makefile，
    那么也可以进行 ``make install`` ，但是您可能需要先检查特定的lilo设置。
 
@@ -282,67 +280,12 @@ Linux内核5.x版本 <http://kernel.org/>
 若遇到问题
 -----------
 
- - 如果您发现了一些可能由于内核缺陷所导致的问题，请检查MAINTAINERS（维护者）
-   文件看看是否有人与令您遇到麻烦的内核部分相关。如果无人在此列出，那么第二
-   个最好的方案就是把它们发给我（torvalds@linux-foundation.org），也可能发送
-   到任何其他相关的邮件列表或新闻组。
-
- - 在所有的缺陷报告中，【请】告诉我们您在说什么内核，如何复现问题，以及您的
-   设置是什么的（使用您的常识）。如果问题是新的，请告诉我；如果问题是旧的，
-   请尝试告诉我您什么时候首次注意到它。
-
- - 如果缺陷导致如下消息::
-
-     unable to handle kernel paging request at address C0000010
-     Oops: 0002
-     EIP:   0010:XXXXXXXX
-     eax: xxxxxxxx   ebx: xxxxxxxx   ecx: xxxxxxxx   edx: xxxxxxxx
-     esi: xxxxxxxx   edi: xxxxxxxx   ebp: xxxxxxxx
-     ds: xxxx  es: xxxx  fs: xxxx  gs: xxxx
-     Pid: xx, process nr: xx
-     xx xx xx xx xx xx xx xx xx xx
-
-   或者类似的内核调试信息显示在屏幕上或在系统日志里，请【如实】复制它。
-   可能对你来说转储（dump）看起来不可理解，但它确实包含可能有助于调试问题的
-   信息。转储上方的文本也很重要：它说明了内核转储代码的原因（在上面的示例中，
-   是由于内核指针错误）。更多关于如何理解转储的信息，请参见
-   Documentation/admin-guide/bug-hunting.rst。
-
- - 如果使用 CONFIG_KALLSYMS 编译内核，则可以按原样发送转储，否则必须使用
-   ``ksymoops`` 程序来理解转储（但通常首选使用CONFIG_KALLSYMS编译）。
-   此实用程序可从
-   https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ 下载。
-   或者，您可以手动执行转储查找：
-
- - 在调试像上面这样的转储时，如果您可以查找EIP值的含义，这将非常有帮助。
-   十六进制值本身对我或其他任何人都没有太大帮助：它会取决于特定的内核设置。
-   您应该做的是从EIP行获取十六进制值（忽略 ``0010:`` ），然后在内核名字列表
-   中查找它，以查看哪个内核函数包含有问题的地址。
-
-   要找到内核函数名，您需要找到与显示症状的内核相关联的系统二进制文件。就是
-   文件“linux/vmlinux”。要提取名字列表并将其与内核崩溃中的EIP进行匹配，
-   请执行::
-
-     nm vmlinux | sort | less
-
-   这将为您提供一个按升序排序的内核地址列表，从中很容易找到包含有问题的地址
-   的函数。请注意，内核调试消息提供的地址不一定与函数地址完全匹配（事实上，
-   这是不可能的），因此您不能只“grep”列表：不过列表将为您提供每个内核函数
-   的起点，因此通过查找起始地址低于你正在搜索的地址，但后一个函数的高于的
-   函数，你会找到您想要的。实际上，在您的问题报告中加入一些“上下文”可能是
-   一个好主意，给出相关的上下几行。
-
-   如果您由于某些原因无法完成上述操作（如您使用预编译的内核映像或类似的映像），
-   请尽可能多地告诉我您的相关设置信息，这会有所帮助。有关详细信息请阅读
-   ‘Documentation/admin-guide/reporting-issues.rst’。
-
- - 或者，您可以在正在运行的内核上使用gdb（只读的；即不能更改值或设置断点）。
-   为此，请首先使用-g编译内核；适当地编辑arch/x86/Makefile，然后执行 ``make
-   clean`` 。您还需要启用CONFIG_PROC_FS（通过 ``make config`` ）。
-
-   使用新内核重新启动后，执行 ``gdb vmlinux /proc/kcore`` 。现在可以使用所有
-   普通的gdb命令。查找系统崩溃点的命令是 ``l *0xXXXXXXXX`` （将xxx替换为EIP
-   值）。
-
-   用gdb无法调试一个当前未运行的内核是由于gdb（错误地）忽略了编译内核的起始
-   偏移量。
+如果您发现了一些可能由于内核缺陷所导致的问题，请参阅：
+Documentation/translations/zh_CN/admin-guide/reporting-issues.rst 。
+
+想要理解内核错误报告，请参阅：
+Documentation/translations/zh_CN/admin-guide/bug-hunting.rst 。
+
+更多用GDB调试内核的信息，请参阅：
+Documentation/translations/zh_CN/dev-tools/gdb-kernel-debugging.rst
+和 Documentation/dev-tools/kgdb.rst 。
-- 
cgit v1.2.3


From 0a13b6c3c546e3b6df75639cfe09954a1a17dd63 Mon Sep 17 00:00:00 2001
From: Wu XiangCheng <bobwxc@email.cn>
Date: Thu, 22 Sep 2022 21:04:22 +0800
Subject: docs/zh_CN: Add new translation of admin-guide/bootconfig.rst

The last English version used:
  commit 2f51efc6b71d ("docs: bootconfig: Add how to embed the
  bootconfig into kernel")

Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Reviewed-by: Yanteng Si <siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/386249dc333a3e40b80c3a9483d60d2bfd24a6c4.1663850554.git.bobwxc@email.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../translations/zh_CN/admin-guide/bootconfig.rst  | 293 +++++++++++++++++++++
 .../translations/zh_CN/admin-guide/index.rst       |   2 +-
 2 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/translations/zh_CN/admin-guide/bootconfig.rst

diff --git a/Documentation/translations/zh_CN/admin-guide/bootconfig.rst b/Documentation/translations/zh_CN/admin-guide/bootconfig.rst
new file mode 100644
index 000000000000..072d17f5f199
--- /dev/null
+++ b/Documentation/translations/zh_CN/admin-guide/bootconfig.rst
@@ -0,0 +1,293 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: ../disclaimer-zh_CN.rst
+
+:Original: Documentation/admin-guide/bootconfig.rst
+
+:译者: 吴想成 Wu XiangCheng <bobwxc@email.cn>
+
+========
+引导配置
+========
+
+:作者: Masami Hiramatsu <mhiramat@kernel.org>
+
+概述
+====
+
+引导配置扩展了现有的内核命令行，以一种更有效率的方式在引导内核时进一步支持
+键值数据。这允许管理员传递一份结构化关键字的配置文件。
+
+配置文件语法
+============
+
+引导配置文件的语法采用非常简单的键值结构。每个关键字由点连接的单词组成，键
+和值由 ``=`` 连接。值以分号（ ``;`` ）或换行符（ ``\n`` ）结尾。数组值中每
+个元素由逗号（ ``,`` ）分隔。::
+
+  KEY[.WORD[...]] = VALUE[, VALUE2[...]][;]
+
+与内核命令行语法不同，逗号和 ``=`` 周围允许有空格。
+
+关键字只允许包含字母、数字、连字符（ ``-`` ）和下划线（ ``_`` ）。值可包含
+可打印字符和空格，但分号（ ``;`` ）、换行符（ ``\n`` ）、逗号（ ``,`` ）、
+井号（ ``#`` ）和右大括号（ ``}`` ）等分隔符除外。
+
+如果你需要在值中使用这些分隔符，可以用双引号（ ``"VALUE"`` ）或单引号
+（ ``'VALUE'`` ）括起来。注意，引号无法转义。
+
+键的值可以为空或不存在。这些键用于检查该键是否存在（类似布尔值）。
+
+键值语法
+--------
+
+引导配置文件语法允许用户通过大括号合并键名部分相同的关键字。例如::
+
+ foo.bar.baz = value1
+ foo.bar.qux.quux = value2
+
+也可以写成::
+
+ foo.bar {
+    baz = value1
+    qux.quux = value2
+ }
+
+或者更紧凑一些，写成::
+
+ foo.bar { baz = value1; qux.quux = value2 }
+
+在这两种样式中，引导解析时相同的关键字都会自动合并。因此可以追加类似的树或
+键值。
+
+相同关键字的值
+--------------
+
+禁止两个或多个值或数组共享同一个关键字。例如::
+
+ foo = bar, baz
+ foo = qux  # !错误! 我们不可以重定义相同的关键字
+
+如果你想要更新值，必须显式使用覆盖操作符 ``:=`` 。例如::
+
+ foo = bar, baz
+ foo := qux
+
+这样 ``foo`` 关键字的值就变成了 ``qux`` 。这对于通过添加（部分）自定义引导
+配置来覆盖默认值非常有用，免于解析默认引导配置。
+
+如果你想对现有关键字追加值作为数组成员，可以使用 ``+=`` 操作符。例如::
+
+ foo = bar, baz
+ foo += qux
+
+这样， ``foo`` 关键字就同时拥有了 ``bar`` ， ``baz`` 和 ``qux`` 。
+
+此外，父关键字下可同时存在值和子关键字。
+例如，下列配置是可行的。::
+
+ foo = value1
+ foo.bar = value2
+ foo := value3 # 这会更新foo的值。
+
+注意，裸值不能直接放进结构化关键字中，必须在大括号外定义它。例如::
+
+ foo {
+     bar = value1
+     bar {
+         baz = value2
+         qux = value3
+     }
+ }
+
+同时，关键字下值节点的顺序是固定的。如果值和子关键字同时存在，值永远是该关
+键字的第一个子节点。因此如果用户先指定子关键字，如::
+
+ foo.bar = value1
+ foo = value2
+
+则在程序（和/proc/bootconfig）中，它会按如下显示::
+
+ foo = value2
+ foo.bar = value1
+
+注释
+----
+
+配置语法接受shell脚本风格的注释。注释以井号（ ``#`` ）开始，到换行符
+（ ``\n`` ）结束。
+
+::
+
+ # comment line
+ foo = value # value is set to foo.
+ bar = 1, # 1st element
+       2, # 2nd element
+       3  # 3rd element
+
+会被解析为::
+
+ foo = value
+ bar = 1, 2, 3
+
+注意你不能把注释放在值和分隔符（ ``,`` 或 ``;`` ）之间。如下配置语法是错误的::
+
+ key = 1 # comment
+       ,2
+
+
+/proc/bootconfig
+================
+
+/proc/bootconfig是引导配置的用户空间接口。与/proc/cmdline不同，此文件内容以
+键值列表样式显示。
+每个键值对一行，样式如下::
+
+ KEY[.WORDS...] = "[VALUE]"[,"VALUE2"...]
+
+
+用引导配置引导内核
+==================
+
+用引导配置引导内核有两种方法：将引导配置附加到initrd镜像或直接嵌入内核中。
+
+*initrd: initial RAM disk，初始内存磁盘*
+
+将引导配置附加到initrd
+----------------------
+
+由于默认情况下引导配置文件是用initrd加载的，因此它将被添加到initrd（initramfs）
+镜像文件的末尾，其中包含填充、大小、校验值和12字节幻数，如下所示::
+
+ [initrd][bootconfig][padding][size(le32)][checksum(le32)][#BOOTCONFIG\n]
+
+大小和校验值为小端序存放的32位无符号值。
+
+当引导配置被加到initrd镜像时，整个文件大小会对齐到4字节。空字符（ ``\0`` ）
+会填补对齐空隙。因此 ``size`` 就是引导配置文件的长度+填充的字节。
+
+Linux内核在内存中解码initrd镜像的最后部分以获取引导配置数据。由于这种“背负式”
+的方法，只要引导加载器传递了正确的initrd文件大小，就无需更改或更新引导加载器
+和内核镜像本身。如果引导加载器意外传递了更长的大小，内核将无法找到引导配置数
+据。
+
+Linux内核在tools/bootconfig下提供了 ``bootconfig`` 命令来完成此操作，管理员
+可以用它从initrd镜像中删除或追加配置文件。你可以用以下命令来构建它::
+
+ # make -C tools/bootconfig
+
+要向initrd镜像添加你的引导配置文件，请按如下命令操作（旧数据会自动移除）::
+
+ # tools/bootconfig/bootconfig -a your-config /boot/initrd.img-X.Y.Z
+
+要从镜像中移除配置，可以使用-d选项::
+
+ # tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z
+
+然后在内核命令行上添加 ``bootconfig`` 告诉内核去initrd文件末尾寻找内核配置。
+
+将引导配置嵌入内核
+------------------
+
+如果你不能使用initrd，也可以通过Kconfig选项将引导配置文件嵌入内核中。在此情
+况下，你需要用以下选项重新编译内核::
+
+ CONFIG_BOOT_CONFIG_EMBED=y
+ CONFIG_BOOT_CONFIG_EMBED_FILE="/引导配置/文件/的/路径"
+
+``CONFIG_BOOT_CONFIG_EMBED_FILE`` 需要从源码树或对象树开始的引导配置文件的
+绝对/相对路径。内核会将其嵌入作为默认引导配置。
+
+与将引导配置附加到initrd一样，你也需要在内核命令行上添加 ``bootconfig`` 告诉
+内核去启用内嵌的引导配置。
+
+注意，即使你已经设置了此选项，仍可用附加到initrd的其他引导配置覆盖内嵌的引导
+配置。
+
+通过引导配置传递内核参数
+========================
+
+除了内核命令行，引导配置也可以用于传递内核参数。所有 ``kernel`` 关键字下的键
+值对都将直接传递给内核命令行。此外， ``init`` 下的键值对将通过命令行传递给
+init进程。参数按以下顺序与用户给定的内核命令行字符串相连，因此命令行参数可以
+覆盖引导配置参数（这取决于子系统如何处理参数，但通常前面的参数将被后面的参数
+覆盖）::
+
+ [bootconfig params][cmdline params] -- [bootconfig init params][cmdline init params]
+
+如果引导配置文件给出的kernel/init参数是::
+
+ kernel {
+   root = 01234567-89ab-cdef-0123-456789abcd
+ }
+ init {
+  splash
+ }
+
+这将被复制到内核命令行字符串中，如下所示::
+
+ root="01234567-89ab-cdef-0123-456789abcd" -- splash
+
+如果用户给出的其他命令行是::
+
+ ro bootconfig -- quiet
+
+则最后的内核命令行如下::
+
+ root="01234567-89ab-cdef-0123-456789abcd" ro bootconfig -- splash quiet
+
+
+配置文件的限制
+==============
+
+当前最大的配置大小是32KB，关键字总数（不是键值条目）必须少于1024个节点。
+注意：这不是条目数而是节点数，条目必须消耗超过2个节点（一个关键字和一个值）。
+所以从理论上讲最多512个键值对。如果关键字平均包含3个单词，则可有256个键值对。
+在大多数情况下，配置项的数量将少于100个条目，小于8KB，因此这应该足够了。如果
+节点数超过1024，解析器将返回错误，即使文件大小小于32KB。（请注意，此最大尺寸
+不包括填充的空字符。）
+无论如何，因为 ``bootconfig`` 命令在附加启动配置到initrd映像时会验证它，用户
+可以在引导之前注意到它。
+
+
+引导配置API
+===========
+
+用户可以查询或遍历键值对，也可以查找（前缀）根关键字节点，并在查找该节点下的
+键值。
+
+如果您有一个关键字字符串，则可以直接使用 xbc_find_value() 查询该键的值。如果
+你想知道引导配置里有哪些关键字，可以使用 xbc_for_each_key_value() 迭代键值对。
+请注意，您需要使用 xbc_array_for_each_value() 访问数组的值，例如::
+
+ vnode = NULL;
+ xbc_find_value("key.word", &vnode);
+ if (vnode && xbc_node_is_array(vnode))
+    xbc_array_for_each_value(vnode, value) {
+      printk("%s ", value);
+    }
+
+如果您想查找具有前缀字符串的键，可以使用 xbc_find_node() 通过前缀字符串查找
+节点，然后用 xbc_node_for_each_key_value() 迭代前缀节点下的键。
+
+但最典型的用法是获取前缀下的命名值或前缀下的命名数组，例如::
+
+ root = xbc_find_node("key.prefix");
+ value = xbc_node_find_value(root, "option", &vnode);
+ ...
+ xbc_node_for_each_array_value(root, "array-option", value, anode) {
+    ...
+ }
+
+这将访问值“key.prefix.option”的值和“key.prefix.array-option”的数组。
+
+锁是不需要的，因为在初始化之后配置只读。如果需要修改，必须复制所有数据和关键字。
+
+
+函数与结构体
+============
+
+相关定义的kernel-doc参见：
+
+ - include/linux/bootconfig.h
+ - lib/bootconfig.c
diff --git a/Documentation/translations/zh_CN/admin-guide/index.rst b/Documentation/translations/zh_CN/admin-guide/index.rst
index 2f6970d0a032..ac2960da33e6 100644
--- a/Documentation/translations/zh_CN/admin-guide/index.rst
+++ b/Documentation/translations/zh_CN/admin-guide/index.rst
@@ -63,6 +63,7 @@ Todolist:
 .. toctree::
    :maxdepth: 1
 
+   bootconfig
    clearing-warn-once
    cpu-load
    cputopology
@@ -80,7 +81,6 @@ Todolist:
 *   binderfs
 *   binfmt-misc
 *   blockdev/index
-*   bootconfig
 *   braille-console
 *   btmrvl
 *   cgroup-v1/index
-- 
cgit v1.2.3


From 16461c66de0b559ee0b19e429561cae9cae72e8c Mon Sep 17 00:00:00 2001
From: Hoi Pok Wu <wuhoipok@gmail.com>
Date: Thu, 22 Sep 2022 11:06:45 +0800
Subject: docs: hugetlbpage.rst: fix a typo of hugepage size

should be kB instead of Kb

Signed-off-by: Hoi Pok Wu <wuhoipok@gmail.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Link: https://lore.kernel.org/r/20220922030645.9719-1-wuhoipok@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/admin-guide/mm/hugetlbpage.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst
index 8e2727dc18d4..19f27c0d92e0 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -65,7 +65,7 @@ HugePages_Surp
 	may be temporarily larger than the maximum number of surplus huge
 	pages when the system is under memory pressure.
 Hugepagesize
-	is the default hugepage size (in Kb).
+	is the default hugepage size (in kB).
 Hugetlb
         is the total amount of memory (in kB), consumed by huge
         pages of all sizes.
-- 
cgit v1.2.3


From a9a7da031dcaeb5064df6605535cf360276b911a Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Thu, 22 Sep 2022 23:37:37 +0800
Subject: Documentation: devres: add missing PINCTRL helpers

Add devm_pinctrl_get_select() and devm_pinctrl_register_and_init() to
devres.rst.

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Link: https://lore.kernel.org/r/20220922153737.2863951-1-yangyingliang@huawei.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/driver-api/driver-model/devres.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 55272942e721..755aecdd2f2d 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -392,7 +392,9 @@ PHY
 PINCTRL
   devm_pinctrl_get()
   devm_pinctrl_put()
+  devm_pinctrl_get_select()
   devm_pinctrl_register()
+  devm_pinctrl_register_and_init()
   devm_pinctrl_unregister()
 
 POWER
-- 
cgit v1.2.3


From 49beeea7ebdb1d86ba8465e5021d72ad9e9474b8 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Fri, 23 Sep 2022 22:18:03 +0800
Subject: Documentation: devres: add missing SPI helper

Add devm_spi_alloc_master() and devm_spi_alloc_slave() to devres.rst.
They are introduced by
commit 5e844cc37a5c ("spi: Introduce device-managed SPI controller allocation").

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Link: https://lore.kernel.org/r/20220923141803.75734-1-yangyingliang@huawei.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/driver-api/driver-model/devres.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 755aecdd2f2d..6f190967ba2e 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -429,6 +429,8 @@ SLAVE DMA ENGINE
   devm_acpi_dma_controller_register()
 
 SPI
+  devm_spi_alloc_master()
+  devm_spi_alloc_slave()
   devm_spi_register_master()
 
 WATCHDOG
-- 
cgit v1.2.3


From bf21f3f8d0336ec28d9ab09ad274ac0bd71b7cf8 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Tue, 27 Sep 2022 08:29:42 -0700
Subject: MIPS: Lantiq: vmmc: fix compile break introduced by gpiod patch

"MIPS: Lantiq: switch vmmc to use gpiod API" patch introduced compile
errors, this patch fixes them.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/lantiq/xway/vmmc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/mips/lantiq/xway/vmmc.c b/arch/mips/lantiq/xway/vmmc.c
index a1947306ac18..2796e87dfcae 100644
--- a/arch/mips/lantiq/xway/vmmc.c
+++ b/arch/mips/lantiq/xway/vmmc.c
@@ -29,6 +29,7 @@ static int vmmc_probe(struct platform_device *pdev)
 	struct gpio_desc *gpio;
 	int gpio_count;
 	dma_addr_t dma;
+	int error;
 
 	cp1_base =
 		(void *) CPHYSADDR(dma_alloc_coherent(&pdev->dev, CP1_SIZE,
@@ -38,14 +39,15 @@ static int vmmc_probe(struct platform_device *pdev)
 	while (gpio_count > 0) {
 		gpio = devm_gpiod_get_index(&pdev->dev,
 					    NULL, --gpio_count, GPIOD_OUT_HIGH);
-		if (IS_ERR(gpio)) {
+		error = PTR_ERR_OR_ZERO(gpio);
+		if (error) {
 			dev_err(&pdev->dev,
 				"failed to request GPIO idx %d: %d\n",
-				gpio_count, PTR_ERR(gpio);
+				gpio_count, error);
 			continue;
 		}
 
-		gpio_consumer_set_name(gpio, "vmmc-relay");
+		gpiod_set_consumer_name(gpio, "vmmc-relay");
 	}
 
 	dev_info(&pdev->dev, "reserved %dMB at 0x%p", CP1_SIZE >> 20, cp1_base);
-- 
cgit v1.2.3


From efd608fa7403ba106412b437f873929e2c862e28 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Wed, 21 Sep 2022 18:09:32 +0000
Subject: x86/alternative: Fix race in try_get_desc()

I encountered some occasional crashes of poke_int3_handler() when
kprobes are set, while accessing desc->vec.

The text poke mechanism claims to have an RCU-like behavior, but it
does not appear that there is any quiescent state to ensure that
nobody holds reference to desc. As a result, the following race
appears to be possible, which can lead to memory corruption.

  CPU0					CPU1
  ----					----
  text_poke_bp_batch()
  -> smp_store_release(&bp_desc, &desc)

  [ notice that desc is on
    the stack			]

					poke_int3_handler()

					[ int3 might be kprobe's
					  so sync events are do not
					  help ]

					-> try_get_desc(descp=&bp_desc)
					   desc = __READ_ONCE(bp_desc)

					   if (!desc) [false, success]
  WRITE_ONCE(bp_desc, NULL);
  atomic_dec_and_test(&desc.refs)

  [ success, desc space on the stack
    is being reused and might have
    non-zero value. ]
					arch_atomic_inc_not_zero(&desc->refs)

					[ might succeed since desc points to
					  stack memory that was freed and might
					  be reused. ]

Fix this issue with small backportable patch. Instead of trying to
make RCU-like behavior for bp_desc, just eliminate the unnecessary
level of indirection of bp_desc, and hold the whole descriptor as a
global.  Anyhow, there is only a single descriptor at any given
moment.

Fixes: 1f676247f36a4 ("x86/alternatives: Implement a better poke_int3_handler() completion scheme")
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@kernel.org
Link: https://lkml.kernel.org/r/20220920224743.3089-1-namit@vmware.com
---
 arch/x86/kernel/alternative.c | 45 ++++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 62f6b8b7c4a5..4f3204364caa 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1319,22 +1319,23 @@ struct bp_patching_desc {
 	atomic_t refs;
 };
 
-static struct bp_patching_desc *bp_desc;
+static struct bp_patching_desc bp_desc;
 
 static __always_inline
-struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+struct bp_patching_desc *try_get_desc(void)
 {
-	/* rcu_dereference */
-	struct bp_patching_desc *desc = __READ_ONCE(*descp);
+	struct bp_patching_desc *desc = &bp_desc;
 
-	if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
+	if (!arch_atomic_inc_not_zero(&desc->refs))
 		return NULL;
 
 	return desc;
 }
 
-static __always_inline void put_desc(struct bp_patching_desc *desc)
+static __always_inline void put_desc(void)
 {
+	struct bp_patching_desc *desc = &bp_desc;
+
 	smp_mb__before_atomic();
 	arch_atomic_dec(&desc->refs);
 }
@@ -1367,15 +1368,15 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 
 	/*
 	 * Having observed our INT3 instruction, we now must observe
-	 * bp_desc:
+	 * bp_desc with non-zero refcount:
 	 *
-	 *	bp_desc = desc			INT3
+	 *	bp_desc.refs = 1		INT3
 	 *	WMB				RMB
-	 *	write INT3			if (desc)
+	 *	write INT3			if (bp_desc.refs != 0)
 	 */
 	smp_rmb();
 
-	desc = try_get_desc(&bp_desc);
+	desc = try_get_desc();
 	if (!desc)
 		return 0;
 
@@ -1429,7 +1430,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
 	ret = 1;
 
 out_put:
-	put_desc(desc);
+	put_desc();
 	return ret;
 }
 
@@ -1460,18 +1461,20 @@ static int tp_vec_nr;
  */
 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
 {
-	struct bp_patching_desc desc = {
-		.vec = tp,
-		.nr_entries = nr_entries,
-		.refs = ATOMIC_INIT(1),
-	};
 	unsigned char int3 = INT3_INSN_OPCODE;
 	unsigned int i;
 	int do_sync;
 
 	lockdep_assert_held(&text_mutex);
 
-	smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
+	bp_desc.vec = tp;
+	bp_desc.nr_entries = nr_entries;
+
+	/*
+	 * Corresponds to the implicit memory barrier in try_get_desc() to
+	 * ensure reading a non-zero refcount provides up to date bp_desc data.
+	 */
+	atomic_set_release(&bp_desc.refs, 1);
 
 	/*
 	 * Corresponding read barrier in int3 notifier for making sure the
@@ -1559,12 +1562,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 		text_poke_sync();
 
 	/*
-	 * Remove and synchronize_rcu(), except we have a very primitive
-	 * refcount based completion.
+	 * Remove and wait for refs to be zero.
 	 */
-	WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
-	if (!atomic_dec_and_test(&desc.refs))
-		atomic_cond_read_acquire(&desc.refs, !VAL);
+	if (!atomic_dec_and_test(&bp_desc.refs))
+		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
-- 
cgit v1.2.3


From 7516777434570833469252c16e5fd7d8f73c45be Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Fri, 26 Aug 2022 19:30:20 +0800
Subject: drm/amdgpu: avoid gfx register accessing during gfxoff

Make sure gfxoff is disabled before gfx register accessing.

Signed-off-by: Evan Quan <evan.quan@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index f6b1bb40e503..fbb13c9147f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -5260,6 +5260,8 @@ static void gfx_v11_0_update_spm_vmid(struct amdgpu_device *adev, unsigned vmid)
 {
 	u32 reg, data;
 
+	amdgpu_gfx_off_ctrl(adev, false);
+
 	reg = SOC15_REG_OFFSET(GC, 0, regRLC_SPM_MC_CNTL);
 	if (amdgpu_sriov_is_pp_one_vf(adev))
 		data = RREG32_NO_KIQ(reg);
@@ -5273,6 +5275,8 @@ static void gfx_v11_0_update_spm_vmid(struct amdgpu_device *adev, unsigned vmid)
 		WREG32_SOC15_NO_KIQ(GC, 0, regRLC_SPM_MC_CNTL, data);
 	else
 		WREG32_SOC15(GC, 0, regRLC_SPM_MC_CNTL, data);
+
+	amdgpu_gfx_off_ctrl(adev, true);
 }
 
 static const struct amdgpu_rlc_funcs gfx_v11_0_rlc_funcs = {
-- 
cgit v1.2.3


From 3c480531a4b1652561fd16824c16be8b8a3ba8f1 Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Wed, 3 Aug 2022 16:46:47 +0800
Subject: drm/amd/pm: enable gfxoff feature for SMU 13.0.0

The feature is ready with latest 78.58.0 PMFW.

Signed-off-by: Evan Quan <evan.quan@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 096327513dd0..03b732bf8cd0 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -239,6 +239,7 @@ smu_v13_0_0_get_allowed_feature_mask(struct smu_context *smu,
 				  uint32_t *feature_mask, uint32_t num)
 {
 	struct amdgpu_device *adev = smu->adev;
+	u32 smu_version;
 
 	if (num > 2)
 		return -EINVAL;
@@ -262,10 +263,11 @@ smu_v13_0_0_get_allowed_feature_mask(struct smu_context *smu,
 	if (adev->pm.pp_feature & PP_SOCCLK_DPM_MASK)
 		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_SOCCLK_BIT);
 
-#if 0
-	if (adev->pm.pp_feature & PP_GFXOFF_MASK)
+	/* PMFW 78.58 contains a critical fix for gfxoff feature */
+	smu_cmn_get_smc_version(smu, NULL, &smu_version);
+	if ((smu_version >= 0x004e3a00) &&
+	    (adev->pm.pp_feature & PP_GFXOFF_MASK))
 		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_GFXOFF_BIT);
-#endif
 
 	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_THROTTLERS_BIT);
 	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_FAN_CONTROL_BIT);
-- 
cgit v1.2.3


From b9caeadf5cb89566412b94b14a1583419aeba45e Mon Sep 17 00:00:00 2001
From: Evan Quan <evan.quan@amd.com>
Date: Thu, 1 Sep 2022 11:45:02 +0800
Subject: drm/amd/pm: use adverse selection for dpm features unsupported by
 driver

It's vbios and pmfw instead of driver who decide whether some dpm features
is supported or not. Driver just de-selects those features which are not
permitted on user's request. Thus, we use adverse selects model.

Signed-off-by: Evan Quan <evan.quan@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c   | 85 ++++++----------------
 1 file changed, 24 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 03b732bf8cd0..1d454485e0d9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -244,79 +244,42 @@ smu_v13_0_0_get_allowed_feature_mask(struct smu_context *smu,
 	if (num > 2)
 		return -EINVAL;
 
-	memset(feature_mask, 0, sizeof(uint32_t) * num);
+	memset(feature_mask, 0xff, sizeof(uint32_t) * num);
 
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_FW_DATA_READ_BIT);
-
-	if (adev->pm.pp_feature & PP_SCLK_DPM_MASK) {
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_GFXCLK_BIT);
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_GFX_IMU_BIT);
+	if (!(adev->pm.pp_feature & PP_SCLK_DPM_MASK)) {
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_DPM_GFXCLK_BIT);
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_GFX_IMU_BIT);
 	}
 
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_MM_DPM_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_VCN_BIT);
-
-	if ((adev->pg_flags & AMD_PG_SUPPORT_ATHUB) &&
-	    (adev->pg_flags & AMD_PG_SUPPORT_MMHUB))
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_ATHUB_MMHUB_PG_BIT);
+	if (!(adev->pg_flags & AMD_PG_SUPPORT_ATHUB) ||
+	    !(adev->pg_flags & AMD_PG_SUPPORT_MMHUB))
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_ATHUB_MMHUB_PG_BIT);
 
-	if (adev->pm.pp_feature & PP_SOCCLK_DPM_MASK)
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_SOCCLK_BIT);
+	if (!(adev->pm.pp_feature & PP_SOCCLK_DPM_MASK))
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_DPM_SOCCLK_BIT);
 
 	/* PMFW 78.58 contains a critical fix for gfxoff feature */
 	smu_cmn_get_smc_version(smu, NULL, &smu_version);
-	if ((smu_version >= 0x004e3a00) &&
-	    (adev->pm.pp_feature & PP_GFXOFF_MASK))
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_GFXOFF_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_THROTTLERS_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_FAN_CONTROL_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DF_CSTATE_BIT);
-
-	if (adev->pm.pp_feature & PP_MCLK_DPM_MASK) {
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_UCLK_BIT);
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_VMEMP_SCALING_BIT);
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_VDDIO_MEM_SCALING_BIT);
+	if ((smu_version < 0x004e3a00) ||
+	     !(adev->pm.pp_feature & PP_GFXOFF_MASK))
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_GFXOFF_BIT);
+
+	if (!(adev->pm.pp_feature & PP_MCLK_DPM_MASK)) {
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_DPM_UCLK_BIT);
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_VMEMP_SCALING_BIT);
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_VDDIO_MEM_SCALING_BIT);
 	}
 
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_MEM_TEMP_READ_BIT);
-
-	if (adev->pm.pp_feature & PP_SCLK_DEEP_SLEEP_MASK)
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_GFXCLK_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_SOCCLK_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_SOC_MPCLK_DS_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_BACO_MPCLK_DS_BIT);
+	if (!(adev->pm.pp_feature & PP_SCLK_DEEP_SLEEP_MASK))
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_DS_GFXCLK_BIT);
 
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_FCLK_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_DCN_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_DCFCLK_BIT);
-
-	if (adev->pm.pp_feature & PP_PCIE_DPM_MASK) {
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_LINK_BIT);
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_LCLK_BIT);
+	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) {
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_DPM_LINK_BIT);
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_DS_LCLK_BIT);
 	}
 
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_BACO_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_MP0CLK_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_FW_DSTATE_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_OUT_OF_BAND_MONITOR_BIT);
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_SOC_CG_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_FCLK_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_FW_CTF_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DS_UCLK_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_VR0HOT_BIT);
-
-	*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_DPM_GFX_POWER_OPTIMIZER_BIT);
-
-	if (adev->pm.pp_feature & PP_ULV_MASK)
-		*(uint64_t *)feature_mask |= FEATURE_MASK(FEATURE_GFX_ULV_BIT);
+	if (!(adev->pm.pp_feature & PP_ULV_MASK))
+		*(uint64_t *)feature_mask &= ~FEATURE_MASK(FEATURE_GFX_ULV_BIT);
 
 	return 0;
 }
-- 
cgit v1.2.3


From a9b4700279f21c5c1a3425630dea86d338ab13b2 Mon Sep 17 00:00:00 2001
From: Graham Sider <Graham.Sider@amd.com>
Date: Tue, 20 Sep 2022 17:16:13 -0400
Subject: drm/amdkfd: fix MQD init for GFX11 in init_mqd

Set remaining compute_static_thread_mgmt_se* accordingly.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
index b8e14c2cc295..3ae350220d42 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
@@ -126,6 +126,10 @@ static void init_mqd(struct mqd_manager *mm, void **mqd,
 	m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
 	m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF;
+	m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF;
 
 	m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
 			0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
-- 
cgit v1.2.3


From 91ef6cfd30bf75763f41e7a4bc119883fe190606 Mon Sep 17 00:00:00 2001
From: Graham Sider <Graham.Sider@amd.com>
Date: Mon, 19 Sep 2022 13:57:14 -0400
Subject: drm/amdgpu: pass queue size and is_aql_queue to MES

Update mes_v11_api_def.h add_queue API with is_aql_queue parameter. Also
re-use gds_size for the queue size (unused for KFD). MES requires the
queue size in order to compute the actual wptr offset within the queue
RB since it increases monotonically for AQL queues.

v2: Make is_aql_queue assign clearer

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h               | 2 ++
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c                | 4 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
 drivers/gpu/drm/amd/include/mes_v11_api_def.h         | 3 ++-
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 7b46f6bf4187..ad980f4b66e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -222,6 +222,8 @@ struct mes_add_queue_input {
 	uint64_t	tba_addr;
 	uint64_t	tma_addr;
 	uint32_t	is_kfd_process;
+	uint32_t	is_aql_queue;
+	uint32_t	queue_size;
 };
 
 struct mes_remove_queue_input {
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index cc3fdbbcd314..f92744b8d79d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -185,6 +185,10 @@ static int mes_v11_0_add_hw_queue(struct amdgpu_mes *mes,
 	mes_add_queue_pkt.is_kfd_process = input->is_kfd_process;
 	mes_add_queue_pkt.trap_en = 1;
 
+	/* For KFD, gds_size is re-used for queue size (needed in MES for AQL queues) */
+	mes_add_queue_pkt.is_aql_queue = input->is_aql_queue;
+	mes_add_queue_pkt.gds_size = input->queue_size;
+
 	return mes_v11_0_submit_pkt_and_poll_completion(mes,
 			&mes_add_queue_pkt, sizeof(mes_add_queue_pkt),
 			offsetof(union MESAPI__ADD_QUEUE, api_status));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e83725a28106..007a3db69df1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -205,6 +205,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	}
 
 	queue_input.is_kfd_process = 1;
+	queue_input.is_aql_queue = (q->properties.format == KFD_QUEUE_FORMAT_AQL);
+	queue_input.queue_size = q->properties.queue_size >> 2;
 
 	queue_input.paging = false;
 	queue_input.tba_addr = qpd->tba_addr;
diff --git a/drivers/gpu/drm/amd/include/mes_v11_api_def.h b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
index 50bfa513cb35..7e85cdc5bd34 100644
--- a/drivers/gpu/drm/amd/include/mes_v11_api_def.h
+++ b/drivers/gpu/drm/amd/include/mes_v11_api_def.h
@@ -269,7 +269,8 @@ union MESAPI__ADD_QUEUE {
 			uint32_t map_kiq_utility_queue  : 1;
 			uint32_t is_kfd_process		: 1;
 			uint32_t trap_en		: 1;
-			uint32_t reserved		: 21;
+			uint32_t is_aql_queue		: 1;
+			uint32_t reserved		: 20;
 		};
 		struct MES_API_STATUS		api_status;
 		uint64_t                        tma_addr;
-- 
cgit v1.2.3


From 664883ddde67971d59764f2dda855183ecf8bc46 Mon Sep 17 00:00:00 2001
From: Graham Sider <Graham.Sider@amd.com>
Date: Fri, 23 Sep 2022 10:07:15 -0400
Subject: drm/amdkfd: fix dropped interrupt in kfd_int_process_v11

Shader wave interrupts were getting dropped in event_interrupt_wq_v11
if the PRIV bit was set to 1. This would often lead to a hang. Until
debugger logic is upstreamed, expand comment to stop early return.

Signed-off-by: Graham Sider <Graham.Sider@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
index a6fcbeeb7428..0d53f6067422 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
@@ -350,11 +350,11 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev,
 				print_sq_intr_info_inst(context_id0, context_id1);
 				sq_int_priv = REG_GET_FIELD(context_id0,
 						SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
-				if (sq_int_priv /*&& (kfd_set_dbg_ev_from_interrupt(dev, pasid,
+				/*if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid,
 						KFD_CTXID0_DOORBELL_ID(context_id0),
 						KFD_CTXID0_TRAP_CODE(context_id0),
-						NULL, 0))*/)
-					return;
+						NULL, 0)))
+					return;*/
 				break;
 			case SQ_INTERRUPT_WORD_ENCODING_ERROR:
 				print_sq_intr_info_error(context_id0, context_id1);
-- 
cgit v1.2.3


From 783d446749c09eb34178cb8600a9a93d3b99e35d Mon Sep 17 00:00:00 2001
From: Taimur Hassan <Syed.Hassan@amd.com>
Date: Tue, 13 Sep 2022 16:04:06 -0400
Subject: drm/amd/display: Avoid avoid unnecessary pixel rate divider
 programming

[Why]
Programming pixel rate divider when FIFO is enabled can cause FIFO error.

[How]
Skip divider programming when divider values are the same to prevent FIFO
error.

Reviewed-by: Nicholas Kazlauskas <Nicholas.Kazlauskas@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Taimur Hassan <Syed.Hassan@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c    | 47 ++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c
index 232cc15979dd..60540864e028 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c
@@ -45,6 +45,48 @@
 #define DC_LOGGER \
 	dccg->ctx->logger
 
+static void dccg314_get_pixel_rate_div(
+		struct dccg *dccg,
+		uint32_t otg_inst,
+		enum pixel_rate_div *k1,
+		enum pixel_rate_div *k2)
+{
+	struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
+	uint32_t val_k1 = PIXEL_RATE_DIV_NA, val_k2 = PIXEL_RATE_DIV_NA;
+
+	*k1 = PIXEL_RATE_DIV_NA;
+	*k2 = PIXEL_RATE_DIV_NA;
+
+	switch (otg_inst) {
+	case 0:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG0_PIXEL_RATE_DIVK1, &val_k1,
+			OTG0_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	case 1:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG1_PIXEL_RATE_DIVK1, &val_k1,
+			OTG1_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	case 2:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG2_PIXEL_RATE_DIVK1, &val_k1,
+			OTG2_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	case 3:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG3_PIXEL_RATE_DIVK1, &val_k1,
+			OTG3_PIXEL_RATE_DIVK2, &val_k1);
+		break;
+	default:
+		BREAK_TO_DEBUGGER();
+		return;
+	}
+
+	*k1 = (enum pixel_rate_div)val_k1;
+	*k2 = (enum pixel_rate_div)val_k2;
+}
+
 static void dccg314_set_pixel_rate_div(
 		struct dccg *dccg,
 		uint32_t otg_inst,
@@ -52,6 +94,11 @@ static void dccg314_set_pixel_rate_div(
 		enum pixel_rate_div k2)
 {
 	struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
+	enum pixel_rate_div cur_k1 = PIXEL_RATE_DIV_NA, cur_k2 = PIXEL_RATE_DIV_NA;
+
+	dccg314_get_pixel_rate_div(dccg, otg_inst, &cur_k1, &cur_k2);
+	if (k1 == PIXEL_RATE_DIV_NA || k2 == PIXEL_RATE_DIV_NA || (k1 == cur_k1 && k2 == cur_k2))
+		return;
 
 	switch (otg_inst) {
 	case 0:
-- 
cgit v1.2.3


From b6d1c39bc637fe4879d610e6687b1d4224c4b2fb Mon Sep 17 00:00:00 2001
From: Alvin Lee <Alvin.Lee2@amd.com>
Date: Tue, 13 Sep 2022 11:06:31 -0400
Subject: drm/amd/display: Update DCN32 to use new SR latencies

[Description]
Update to new SR latencies for DCN32

Reviewed-by: Nevenko Stupar <Nevenko.Stupar@amd.com>
Reviewed-by: Jun Lei <Jun.Lei@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Alvin Lee <Alvin.Lee2@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
index f43686997917..c3cc06876210 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
@@ -121,8 +121,8 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_2_soc = {
 		},
 	},
 	.num_states = 1,
-	.sr_exit_time_us = 20.16,
-	.sr_enter_plus_exit_time_us = 27.13,
+	.sr_exit_time_us = 42.97,
+	.sr_enter_plus_exit_time_us = 49.94,
 	.sr_exit_z8_time_us = 285.0,
 	.sr_enter_plus_exit_z8_time_us = 320,
 	.writeback_latency_us = 12.0,
-- 
cgit v1.2.3


From dd37fba90e72a48998d9f0eb5497088ea2b02ad1 Mon Sep 17 00:00:00 2001
From: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Date: Wed, 7 Sep 2022 10:11:34 -0400
Subject: drm/amd/display: Do DIO FIFO enable after DP video stream enable

[Why]
Avoids a race condition where DIO FIFO can underflow due to no incoming
data available.

[How]
Shift the FIFO enable below stream enable.

Make sure fullness level is written before the DIO reset takes place
and that we're not doing it twice.

Reviewed-by: Syed Hassan <Syed.Hassan@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c   | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c
index 06d8638db696..52b71b2fef67 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c
@@ -56,7 +56,8 @@ static void enc314_enable_fifo(struct stream_encoder *enc)
 
 	/* TODO: Confirm if we need to wait for DIG_SYMCLK_FE_ON */
 	REG_WAIT(DIG_FE_CNTL, DIG_SYMCLK_FE_ON, 1, 10, 5000);
-	REG_UPDATE_2(DIG_FIFO_CTRL0, DIG_FIFO_RESET, 1, DIG_FIFO_READ_START_LEVEL, 0x7);
+	REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_READ_START_LEVEL, 0x7);
+	REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_RESET, 1);
 	REG_WAIT(DIG_FIFO_CTRL0, DIG_FIFO_RESET_DONE, 1, 10, 5000);
 	REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_RESET, 0);
 	REG_WAIT(DIG_FIFO_CTRL0, DIG_FIFO_RESET_DONE, 0, 10, 5000);
@@ -316,15 +317,11 @@ static void enc314_stream_encoder_dp_unblank(
 	/* switch DP encoder to CRTC data, but reset it the fifo first. It may happen
 	 * that it overflows during mode transition, and sometimes doesn't recover.
 	 */
-	REG_UPDATE(DIG_FIFO_CTRL0, DIG_FIFO_READ_START_LEVEL, 0x7);
 	REG_UPDATE(DP_STEER_FIFO, DP_STEER_FIFO_RESET, 1);
 	udelay(10);
 
 	REG_UPDATE(DP_STEER_FIFO, DP_STEER_FIFO_RESET, 0);
 
-	/* DIG Resync FIFO now needs to be explicitly enabled. */
-	enc314_enable_fifo(enc);
-
 	/* wait 100us for DIG/DP logic to prime
 	 * (i.e. a few video lines)
 	 */
@@ -340,6 +337,12 @@ static void enc314_stream_encoder_dp_unblank(
 
 	REG_UPDATE(DP_VID_STREAM_CNTL, DP_VID_STREAM_ENABLE, true);
 
+	/*
+	 * DIG Resync FIFO now needs to be explicitly enabled.
+	 * This should come after DP_VID_STREAM_ENABLE per HW docs.
+	 */
+	enc314_enable_fifo(enc);
+
 	dp_source_sequence_trace(link, DPCD_SOURCE_SEQ_AFTER_ENABLE_DP_VID_STREAM);
 }
 
-- 
cgit v1.2.3


From ec669ef2e2cf277f3e73d842bb3bda1c3ea2ea73 Mon Sep 17 00:00:00 2001
From: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Date: Thu, 8 Sep 2022 14:17:01 -0400
Subject: drm/amd/display: Wrap OTG disable workaround with FIFO control

[Why]
The DIO FIFO will underflow if we turn off the OTG before we turn
off the FIFO.

Since this happens as part of the OTG workaround and we don't reset
the FIFO afterwards we see the error persist.

[How]
Add disable FIFO before the disable CRTC and enable FIFO after enabling
the CRTC.

Reviewed-by: Syed Hassan <Syed.Hassan@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c    | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c
index 8559dcd80af0..4a15aa7a375f 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_clk_mgr.c
@@ -130,11 +130,20 @@ static void dcn314_disable_otg_wa(struct clk_mgr *clk_mgr_base, struct dc_state
 		if (pipe->top_pipe || pipe->prev_odm_pipe)
 			continue;
 		if (pipe->stream && (pipe->stream->dpms_off || dc_is_virtual_signal(pipe->stream->signal))) {
+			struct stream_encoder *stream_enc = pipe->stream_res.stream_enc;
+
 			if (disable) {
+				if (stream_enc && stream_enc->funcs->disable_fifo)
+					pipe->stream_res.stream_enc->funcs->disable_fifo(stream_enc);
+
 				pipe->stream_res.tg->funcs->immediate_disable_crtc(pipe->stream_res.tg);
 				reset_sync_context_for_pipe(dc, context, i);
-			} else
+			} else {
 				pipe->stream_res.tg->funcs->enable_crtc(pipe->stream_res.tg);
+
+				if (stream_enc && stream_enc->funcs->enable_fifo)
+					pipe->stream_res.stream_enc->funcs->enable_fifo(stream_enc);
+			}
 		}
 	}
 }
-- 
cgit v1.2.3


From 85096a73f4dc223b5848e41ce40afc5ee913079e Mon Sep 17 00:00:00 2001
From: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Date: Thu, 8 Sep 2022 11:11:47 -0400
Subject: drm/amd/display: Add explicit FIFO disable for DP blank

[Why]
We rely on DMCUB to do this when disabling the link but it should
actually come before we disable the DP VID stream.

If we don't then the FIFO can end up with underflow that persists
the next time it's enabled.

[How]
Add a DCN314 specific blank sequence that will disable the DIG FIFO
first.

Reviewed-by: Syed Hassan <Syed.Hassan@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c    | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c
index 52b71b2fef67..8c0ab013764e 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dio_stream_encoder.c
@@ -262,6 +262,16 @@ static bool is_two_pixels_per_containter(const struct dc_crtc_timing *timing)
 	return two_pix;
 }
 
+void enc314_stream_encoder_dp_blank(
+	struct dc_link *link,
+	struct stream_encoder *enc)
+{
+	/* New to DCN314 - disable the FIFO before VID stream disable. */
+	enc314_disable_fifo(enc);
+
+	enc1_stream_encoder_dp_blank(link, enc);
+}
+
 static void enc314_stream_encoder_dp_unblank(
 		struct dc_link *link,
 		struct stream_encoder *enc,
@@ -411,7 +421,7 @@ static const struct stream_encoder_funcs dcn314_str_enc_funcs = {
 	.stop_dp_info_packets =
 		enc1_stream_encoder_stop_dp_info_packets,
 	.dp_blank =
-		enc1_stream_encoder_dp_blank,
+		enc314_stream_encoder_dp_blank,
 	.dp_unblank =
 		enc314_stream_encoder_dp_unblank,
 	.audio_mute_control = enc3_audio_mute_control,
-- 
cgit v1.2.3


From 1ec5c0e1289c70c8e7dd93a988d945ad5bc3f4b1 Mon Sep 17 00:00:00 2001
From: Aric Cyr <aric.cyr@amd.com>
Date: Wed, 14 Sep 2022 12:54:52 -0400
Subject: drm/amd/display: Fix audio on display after unplugging another

Revert "dc: skip audio setup when audio stream is enabled"

This reverts commit 65fbfb02c2734cacffec5e3f492e1b4f1dabcf98

[why]
We have minimal pipe split transition method to avoid pipe
allocation outage.However, this method will invoke audio setup
which cause audio output stuck once pipe reallocate.

[how]
skip audio setup for pipelines which audio stream has been enabled

Reviewed-by: Martin Leung <Martin.Leung@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Aric Cyr <aric.cyr@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
index aea49334021c..38a67051d470 100644
--- a/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
@@ -2164,8 +2164,7 @@ static void dce110_setup_audio_dto(
 			continue;
 		if (pipe_ctx->stream->signal != SIGNAL_TYPE_HDMI_TYPE_A)
 			continue;
-		if (pipe_ctx->stream_res.audio != NULL &&
-			pipe_ctx->stream_res.audio->enabled == false) {
+		if (pipe_ctx->stream_res.audio != NULL) {
 			struct audio_output audio_output;
 
 			build_audio_output(context, pipe_ctx, &audio_output);
@@ -2205,8 +2204,7 @@ static void dce110_setup_audio_dto(
 			if (!dc_is_dp_signal(pipe_ctx->stream->signal))
 				continue;
 
-			if (pipe_ctx->stream_res.audio != NULL &&
-				pipe_ctx->stream_res.audio->enabled == false) {
+			if (pipe_ctx->stream_res.audio != NULL) {
 				struct audio_output audio_output;
 
 				build_audio_output(context, pipe_ctx, &audio_output);
-- 
cgit v1.2.3


From 391ce355e72bd5713afc85ec5acc955294499dcf Mon Sep 17 00:00:00 2001
From: Taimur Hassan <Syed.Hassan@amd.com>
Date: Wed, 14 Sep 2022 16:25:31 -0400
Subject: drm/amd/display: Fix typo in get_pixel_rate_div

[Why & How]
Some FIFO errors still occur due to reading wrong pixel rate divider.
Fix typo to prevent FIFO error.

Reviewed-by: Nicholas Kazlauskas <Nicholas.Kazlauskas@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Taimur Hassan <Syed.Hassan@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c
index 60540864e028..fb729674953b 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn314/dcn314_dccg.c
@@ -76,7 +76,7 @@ static void dccg314_get_pixel_rate_div(
 	case 3:
 		REG_GET_2(OTG_PIXEL_RATE_DIV,
 			OTG3_PIXEL_RATE_DIVK1, &val_k1,
-			OTG3_PIXEL_RATE_DIVK2, &val_k1);
+			OTG3_PIXEL_RATE_DIVK2, &val_k2);
 		break;
 	default:
 		BREAK_TO_DEBUGGER();
-- 
cgit v1.2.3


From 8dd2e87dbd49ba0b6afc1dd22a5930d797cf423f Mon Sep 17 00:00:00 2001
From: Eric Bernstein <eric.bernstein@amd.com>
Date: Mon, 21 Mar 2022 10:42:34 -0400
Subject: drm/amd/display: Remove assert for odm transition case

Remove assert that will hit during odm transition case, since this is a
valid case.

Signed-off-by: Eric Bernstein <eric.bernstein@amd.com>
Reviewed-by: Rodrigo Siqueira <Rodrigo.Siqueira@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c
index 99eb239bbc7b..9aebc1be2f59 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_hubbub.c
@@ -98,9 +98,13 @@ static void dcn32_program_det_size(struct hubbub *hubbub, int hubp_inst, unsigne
 	default:
 		break;
 	}
-	/* Should never be hit, if it is we have an erroneous hw config*/
-	ASSERT(hubbub2->det0_size + hubbub2->det1_size + hubbub2->det2_size
-			+ hubbub2->det3_size + hubbub2->compbuf_size_segments <= hubbub2->crb_size_segs);
+	if (hubbub2->det0_size + hubbub2->det1_size + hubbub2->det2_size
+			+ hubbub2->det3_size + hubbub2->compbuf_size_segments > hubbub2->crb_size_segs) {
+		/* This may happen during seamless transition from ODM 2:1 to ODM4:1 */
+		DC_LOG_WARNING("CRB Config Warning: DET size (%d,%d,%d,%d) + Compbuf size (%d) >  CRB segments (%d)\n",
+						hubbub2->det0_size, hubbub2->det1_size, hubbub2->det2_size, hubbub2->det3_size,
+						hubbub2->compbuf_size_segments, hubbub2->crb_size_segs);
+	}
 }
 
 static void dcn32_program_compbuf_size(struct hubbub *hubbub, unsigned int compbuf_size_kb, bool safe_to_increase)
-- 
cgit v1.2.3


From a28acf7091babf4248b042326946cee7736f796a Mon Sep 17 00:00:00 2001
From: Taimur Hassan <Syed.Hassan@amd.com>
Date: Tue, 13 Sep 2022 18:35:20 -0400
Subject: drm/amd/display: Avoid unnecessary pixel rate divider programming

[Why]
Programming pixel rate divider when FIFO is enabled can cause FIFO error.

[How]
Skip divider programming when divider values are the same to prevent FIFO
error.

Reviewed-by: Alvin Lee <alvin.lee2@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Taimur Hassan <Syed.Hassan@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dccg.c | 53 +++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dccg.c b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dccg.c
index 0d5e8a441512..6640d0ac4304 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dccg.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn32/dcn32_dccg.c
@@ -42,6 +42,48 @@
 #define DC_LOGGER \
 	dccg->ctx->logger
 
+static void dccg32_get_pixel_rate_div(
+		struct dccg *dccg,
+		uint32_t otg_inst,
+		enum pixel_rate_div *k1,
+		enum pixel_rate_div *k2)
+{
+	struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
+	uint32_t val_k1 = PIXEL_RATE_DIV_NA, val_k2 = PIXEL_RATE_DIV_NA;
+
+	*k1 = PIXEL_RATE_DIV_NA;
+	*k2 = PIXEL_RATE_DIV_NA;
+
+	switch (otg_inst) {
+	case 0:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG0_PIXEL_RATE_DIVK1, &val_k1,
+			OTG0_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	case 1:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG1_PIXEL_RATE_DIVK1, &val_k1,
+			OTG1_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	case 2:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG2_PIXEL_RATE_DIVK1, &val_k1,
+			OTG2_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	case 3:
+		REG_GET_2(OTG_PIXEL_RATE_DIV,
+			OTG3_PIXEL_RATE_DIVK1, &val_k1,
+			OTG3_PIXEL_RATE_DIVK2, &val_k2);
+		break;
+	default:
+		BREAK_TO_DEBUGGER();
+		return;
+	}
+
+	*k1 = (enum pixel_rate_div)val_k1;
+	*k2 = (enum pixel_rate_div)val_k2;
+}
+
 static void dccg32_set_pixel_rate_div(
 		struct dccg *dccg,
 		uint32_t otg_inst,
@@ -50,6 +92,17 @@ static void dccg32_set_pixel_rate_div(
 {
 	struct dcn_dccg *dccg_dcn = TO_DCN_DCCG(dccg);
 
+	enum pixel_rate_div cur_k1 = PIXEL_RATE_DIV_NA, cur_k2 = PIXEL_RATE_DIV_NA;
+
+	// Don't program 0xF into the register field. Not valid since
+	// K1 / K2 field is only 1 / 2 bits wide
+	if (k1 == PIXEL_RATE_DIV_NA || k2 == PIXEL_RATE_DIV_NA)
+		return;
+
+	dccg32_get_pixel_rate_div(dccg, otg_inst, &cur_k1, &cur_k2);
+	if (k1 == cur_k1 && k2 == cur_k2)
+		return;
+
 	switch (otg_inst) {
 	case 0:
 		REG_UPDATE_2(OTG_PIXEL_RATE_DIV,
-- 
cgit v1.2.3


From 2d3907c152611a0d65efe54b93972320dcce1565 Mon Sep 17 00:00:00 2001
From: Samson Tam <samson.tam@amd.com>
Date: Fri, 9 Sep 2022 17:16:32 -0400
Subject: drm/amd/display: fill in clock values when DPM is not enabled

[Why]
For individual feature testing, PMFW may not report all clock
values back. Driver will default them to 0 but this will
cause the BB table to be skipped and default to one state
with max clocks.

[How]
Add helper function to scan through initial clock values and
populate them with default clock limits so that BB table
can be built.
Add dpm_enabled flag to check when DPM is not enabled and
to trigger helper function.

Reviewed-by: Jun Lei <Jun.Lei@amd.com>
Acked-by: Jasdeep Dhillon <jdhillon@amd.com>
Signed-off-by: Samson Tam <samson.tam@amd.com>
Tested-by: Daniel Wheeler <daniel.wheeler@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 .../amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c   | 14 ++++++++
 .../gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c   | 39 ++++++++++++++++++++++
 .../gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.h   |  2 ++
 .../drm/amd/display/dc/inc/hw/clk_mgr_internal.h   |  2 ++
 4 files changed, 57 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c
index c6785969eb1a..f0f3f66629cc 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn32/dcn32_clk_mgr.c
@@ -156,12 +156,14 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base)
 {
 	struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base);
 	unsigned int num_levels;
+	unsigned int num_dcfclk_levels, num_dtbclk_levels, num_dispclk_levels;
 
 	memset(&(clk_mgr_base->clks), 0, sizeof(struct dc_clocks));
 	clk_mgr_base->clks.p_state_change_support = true;
 	clk_mgr_base->clks.prev_p_state_change_support = true;
 	clk_mgr_base->clks.fclk_prev_p_state_change_support = true;
 	clk_mgr->smu_present = false;
+	clk_mgr->dpm_present = false;
 
 	if (!clk_mgr_base->bw_params)
 		return;
@@ -179,6 +181,7 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base)
 	dcn32_init_single_clock(clk_mgr, PPCLK_DCFCLK,
 			&clk_mgr_base->bw_params->clk_table.entries[0].dcfclk_mhz,
 			&num_levels);
+	num_dcfclk_levels = num_levels;
 
 	/* SOCCLK */
 	dcn32_init_single_clock(clk_mgr, PPCLK_SOCCLK,
@@ -189,11 +192,16 @@ void dcn32_init_clocks(struct clk_mgr *clk_mgr_base)
 		dcn32_init_single_clock(clk_mgr, PPCLK_DTBCLK,
 				&clk_mgr_base->bw_params->clk_table.entries[0].dtbclk_mhz,
 				&num_levels);
+	num_dtbclk_levels = num_levels;
 
 	/* DISPCLK */
 	dcn32_init_single_clock(clk_mgr, PPCLK_DISPCLK,
 			&clk_mgr_base->bw_params->clk_table.entries[0].dispclk_mhz,
 			&num_levels);
+	num_dispclk_levels = num_levels;
+
+	if (num_dcfclk_levels && num_dtbclk_levels && num_dispclk_levels)
+		clk_mgr->dpm_present = true;
 
 	if (clk_mgr_base->ctx->dc->debug.min_disp_clk_khz) {
 		unsigned int i;
@@ -658,6 +666,12 @@ static void dcn32_get_memclk_states_from_smu(struct clk_mgr *clk_mgr_base)
 			&num_levels);
 	clk_mgr_base->bw_params->clk_table.num_entries = num_levels ? num_levels : 1;
 
+	if (clk_mgr->dpm_present && !num_levels)
+		clk_mgr->dpm_present = false;
+
+	if (!clk_mgr->dpm_present)
+		dcn32_patch_dpm_table(clk_mgr_base->bw_params);
+
 	DC_FP_START();
 	/* Refresh bounding box */
 	clk_mgr_base->ctx->dc->res_pool->funcs->update_bw_bounding_box(
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
index c3cc06876210..e573e706430d 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.c
@@ -1926,6 +1926,45 @@ static void remove_entry_from_table_at_index(struct _vcs_dpi_voltage_scaling_st
 	memset(&table[--(*num_entries)], 0, sizeof(struct _vcs_dpi_voltage_scaling_st));
 }
 
+void dcn32_patch_dpm_table(struct clk_bw_params *bw_params)
+{
+	int i;
+	unsigned int max_dcfclk_mhz = 0, max_dispclk_mhz = 0, max_dppclk_mhz = 0,
+			max_phyclk_mhz = 0, max_dtbclk_mhz = 0, max_fclk_mhz = 0, max_uclk_mhz = 0;
+
+	for (i = 0; i < MAX_NUM_DPM_LVL; i++) {
+		if (bw_params->clk_table.entries[i].dcfclk_mhz > max_dcfclk_mhz)
+			max_dcfclk_mhz = bw_params->clk_table.entries[i].dcfclk_mhz;
+		if (bw_params->clk_table.entries[i].fclk_mhz > max_fclk_mhz)
+			max_fclk_mhz = bw_params->clk_table.entries[i].fclk_mhz;
+		if (bw_params->clk_table.entries[i].memclk_mhz > max_uclk_mhz)
+			max_uclk_mhz = bw_params->clk_table.entries[i].memclk_mhz;
+		if (bw_params->clk_table.entries[i].dispclk_mhz > max_dispclk_mhz)
+			max_dispclk_mhz = bw_params->clk_table.entries[i].dispclk_mhz;
+		if (bw_params->clk_table.entries[i].dppclk_mhz > max_dppclk_mhz)
+			max_dppclk_mhz = bw_params->clk_table.entries[i].dppclk_mhz;
+		if (bw_params->clk_table.entries[i].phyclk_mhz > max_phyclk_mhz)
+			max_phyclk_mhz = bw_params->clk_table.entries[i].phyclk_mhz;
+		if (bw_params->clk_table.entries[i].dtbclk_mhz > max_dtbclk_mhz)
+			max_dtbclk_mhz = bw_params->clk_table.entries[i].dtbclk_mhz;
+	}
+
+	/* Scan through clock values we currently have and if they are 0,
+	 *  then populate it with dcn3_2_soc.clock_limits[] value.
+	 *
+	 * Do it for DCFCLK, DISPCLK, DTBCLK and UCLK as any of those being
+	 *  0, will cause it to skip building the clock table.
+	 */
+	if (max_dcfclk_mhz == 0)
+		bw_params->clk_table.entries[0].dcfclk_mhz = dcn3_2_soc.clock_limits[0].dcfclk_mhz;
+	if (max_dispclk_mhz == 0)
+		bw_params->clk_table.entries[0].dispclk_mhz = dcn3_2_soc.clock_limits[0].dispclk_mhz;
+	if (max_dtbclk_mhz == 0)
+		bw_params->clk_table.entries[0].dtbclk_mhz = dcn3_2_soc.clock_limits[0].dtbclk_mhz;
+	if (max_uclk_mhz == 0)
+		bw_params->clk_table.entries[0].memclk_mhz = dcn3_2_soc.clock_limits[0].dram_speed_mts / 16;
+}
+
 static int build_synthetic_soc_states(struct clk_bw_params *bw_params,
 		struct _vcs_dpi_voltage_scaling_st *table, unsigned int *num_entries)
 {
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.h b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.h
index 6ce221098979..e1b79e2aab8c 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.h
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn32/dcn32_fpu.h
@@ -77,4 +77,6 @@ int dcn32_find_dummy_latency_index_for_fw_based_mclk_switch(struct dc *dc,
 							    int pipe_cnt,
 							    int vlevel);
 
+void dcn32_patch_dpm_table(struct clk_bw_params *bw_params);
+
 #endif
diff --git a/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr_internal.h b/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr_internal.h
index 68c2ed434d2c..cff5fd55a0ad 100644
--- a/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr_internal.h
+++ b/drivers/gpu/drm/amd/display/dc/inc/hw/clk_mgr_internal.h
@@ -340,6 +340,8 @@ struct clk_mgr_internal {
 	bool smu_present;
 	void *wm_range_table;
 	long long wm_range_table_addr;
+
+	bool dpm_present;
 };
 
 struct clk_mgr_internal_funcs {
-- 
cgit v1.2.3


From b3e45b18e5c40f1f7e5f6383953343f96f963b13 Mon Sep 17 00:00:00 2001
From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
Date: Thu, 15 Sep 2022 15:19:20 +0800
Subject: drm/amdgpu: Correct the position in patch_cond_exec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current position calulated in gfx_v9_0_ring_emit_patch_cond_exec
underflows when the wptr is divisible by ring->buf_mask + 1.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index fc9c1043244c..037af8352677 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5597,7 +5597,7 @@ static void gfx_v9_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
 	BUG_ON(offset > ring->buf_mask);
 	BUG_ON(ring->ring[offset] != 0x55aa55aa);
 
-	cur = (ring->wptr & ring->buf_mask) - 1;
+	cur = (ring->wptr - 1) & ring->buf_mask;
 	if (likely(cur > offset))
 		ring->ring[offset] = cur - offset;
 	else
-- 
cgit v1.2.3


From 11e38360cc15e2d5995449d19030b13a67d73cb3 Mon Sep 17 00:00:00 2001
From: "Jiadong.Zhu" <Jiadong.Zhu@amd.com>
Date: Fri, 23 Sep 2022 21:17:53 +0800
Subject: drm/amdgpu: Remove fence_process in count_emitted
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function amdgpu_fence_count_emitted used in work_hander should not call
amdgpu_fence_process which must be used in irq handler.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jiadong.Zhu <Jiadong.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 8adeb7469f1e..d0d99ed607dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -400,7 +400,6 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring)
 	/* We are not protected by ring lock when reading the last sequence
 	 * but it's ok to report slightly wrong fence count here.
 	 */
-	amdgpu_fence_process(ring);
 	emitted = 0x100000000ull;
 	emitted -= atomic_read(&ring->fence_drv.last_seq);
 	emitted += READ_ONCE(ring->fence_drv.sync_seq);
-- 
cgit v1.2.3


From 3b7329cf5a767c1be38352d43066012e220ad43c Mon Sep 17 00:00:00 2001
From: Bokun Zhang <Bokun.Zhang@amd.com>
Date: Wed, 28 Sep 2022 00:30:04 +0800
Subject: drm/amdgpu: Add amdgpu suspend-resume code path under SRIOV

- Under SRIOV, we need to send REQ_GPU_FINI to the hypervisor
  during the suspend time. Furthermore, we cannot request a
  mode 1 reset under SRIOV as VF. Therefore, we will skip it
  as it is called in suspend_noirq() function.

- In the resume code path, we need to send REQ_GPU_INIT to the
  hypervisor and also resume PSP IP block under SRIOV.

Signed-off-by: Bokun Zhang <Bokun.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Cc: stable@vger.kernel.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c   |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 27 ++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index 130060834b4e..48bd660ddb85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -1050,6 +1050,10 @@ bool amdgpu_acpi_should_gpu_reset(struct amdgpu_device *adev)
 {
 	if (adev->flags & AMD_IS_APU)
 		return false;
+
+	if (amdgpu_sriov_vf(adev))
+		return false;
+
 	return pm_suspend_target_state != PM_SUSPEND_TO_IDLE;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index be7aff2d4a57..25e1f5ed7ead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3152,7 +3152,8 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
 			continue;
 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
-		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
+		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
 
 			r = adev->ip_blocks[i].version->funcs->resume(adev);
 			if (r) {
@@ -4064,12 +4065,20 @@ static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 {
 	struct amdgpu_device *adev = drm_to_adev(dev);
+	int r = 0;
 
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
 
 	adev->in_suspend = true;
 
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_fini_data_exchange(adev);
+		r = amdgpu_virt_request_full_gpu(adev, false);
+		if (r)
+			return r;
+	}
+
 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
 		DRM_WARN("smart shift update failed\n");
 
@@ -4093,6 +4102,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 
 	amdgpu_device_ip_suspend_phase2(adev);
 
+	if (amdgpu_sriov_vf(adev))
+		amdgpu_virt_release_full_gpu(adev, false);
+
 	return 0;
 }
 
@@ -4111,6 +4123,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	struct amdgpu_device *adev = drm_to_adev(dev);
 	int r = 0;
 
+	if (amdgpu_sriov_vf(adev)) {
+		r = amdgpu_virt_request_full_gpu(adev, true);
+		if (r)
+			return r;
+	}
+
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
 
@@ -4125,6 +4143,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	}
 
 	r = amdgpu_device_ip_resume(adev);
+
+	/* no matter what r is, always need to properly release full GPU */
+	if (amdgpu_sriov_vf(adev)) {
+		amdgpu_virt_init_data_exchange(adev);
+		amdgpu_virt_release_full_gpu(adev, true);
+	}
+
 	if (r) {
 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
 		return r;
-- 
cgit v1.2.3


From db5db1a00d0816207be3a0166fcb4f523eaf3b52 Mon Sep 17 00:00:00 2001
From: Angus Chen <angus.chen@jaguarmicro.com>
Date: Fri, 23 Sep 2022 17:10:13 +0800
Subject: vdpa/ifcvf: fix the calculation of queuepair

The q_pair_id to address a queue pair in the lm bar should be
calculated by queue_id / 2 rather than queue_id / nr_vring.

Fixes: 2ddae773c93b ("vDPA/ifcvf: detect and use the onboard number of queues directly")
Signed-off-by: Angus Chen <angus.chen@jaguarmicro.com>
Reviewed-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Zhu Lingshan <lingshan.zhu@intel.com>
Message-Id: <20220923091013.191-1-angus.chen@jaguarmicro.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/ifcvf/ifcvf_base.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c
index 75a703b803a2..3e4486bfa0b7 100644
--- a/drivers/vdpa/ifcvf/ifcvf_base.c
+++ b/drivers/vdpa/ifcvf/ifcvf_base.c
@@ -323,7 +323,7 @@ u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid)
 	u32 q_pair_id;
 
 	ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
-	q_pair_id = qid / hw->nr_vring;
+	q_pair_id = qid / 2;
 	avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
 	last_avail_idx = vp_ioread16(avail_idx_addr);
 
@@ -337,7 +337,7 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num)
 	u32 q_pair_id;
 
 	ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg;
-	q_pair_id = qid / hw->nr_vring;
+	q_pair_id = qid / 2;
 	avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2];
 	hw->vring[qid].last_avail_idx = num;
 	vp_iowrite16(num, avail_idx_addr);
-- 
cgit v1.2.3


From 1bedcf22c081a6e9943f09937b2da8d3ef52d20d Mon Sep 17 00:00:00 2001
From: lei he <helei.sig11@bytedance.com>
Date: Mon, 19 Sep 2022 15:51:58 +0800
Subject: virtio-crypto: fix memory-leak

Fix memory-leak for virtio-crypto akcipher request, this problem is
introduced by 59ca6c93387d3(virtio-crypto: implement RSA algorithm).
The leak can be reproduced and tested with the following script
inside virtual machine:

#!/bin/bash

LOOP_TIMES=10000

# required module: pkcs8_key_parser, virtio_crypto
modprobe pkcs8_key_parser # if CONFIG_PKCS8_PRIVATE_KEY_PARSER=m
modprobe virtio_crypto # if CONFIG_CRYPTO_DEV_VIRTIO=m
rm -rf /tmp/data
dd if=/dev/random of=/tmp/data count=1 bs=230

# generate private key and self-signed cert
openssl req -nodes -x509 -newkey rsa:2048 -keyout key.pem \
		-outform der -out cert.der  \
		-subj "/C=CN/ST=GD/L=SZ/O=vihoo/OU=dev/CN=always.com/emailAddress=yy@always.com"
# convert private key from pem to der
openssl pkcs8 -in key.pem -topk8 -nocrypt -outform DER -out key.der

# add key
PRIV_KEY_ID=`cat key.der | keyctl padd asymmetric test_priv_key @s`
echo "priv key id = "$PRIV_KEY_ID
PUB_KEY_ID=`cat cert.der | keyctl padd asymmetric test_pub_key @s`
echo "pub key id = "$PUB_KEY_ID

# query key
keyctl pkey_query $PRIV_KEY_ID 0
keyctl pkey_query $PUB_KEY_ID 0

# here we only run pkey_encrypt becasuse it is the fastest interface
function bench_pub() {
	keyctl pkey_encrypt $PUB_KEY_ID 0 /tmp/data enc=pkcs1 >/tmp/enc.pub
}

# do bench_pub in loop to obtain the memory leak
for (( i = 0; i < ${LOOP_TIMES}; ++i )); do
	bench_pub
done

Signed-off-by: lei he <helei.sig11@bytedance.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Gonglei <arei.gonglei@huawei.com>
Message-Id: <20220919075158.3625-1-helei.sig11@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/crypto/virtio/virtio_crypto_akcipher_algs.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
index 2a60d0525cde..168195672e2e 100644
--- a/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
+++ b/drivers/crypto/virtio/virtio_crypto_akcipher_algs.c
@@ -56,6 +56,10 @@ static void virtio_crypto_akcipher_finalize_req(
 	struct virtio_crypto_akcipher_request *vc_akcipher_req,
 	struct akcipher_request *req, int err)
 {
+	kfree(vc_akcipher_req->src_buf);
+	kfree(vc_akcipher_req->dst_buf);
+	vc_akcipher_req->src_buf = NULL;
+	vc_akcipher_req->dst_buf = NULL;
 	virtcrypto_clear_request(&vc_akcipher_req->base);
 
 	crypto_finalize_akcipher_request(vc_akcipher_req->base.dataq->engine, req, err);
-- 
cgit v1.2.3


From dbe449d8f8f2ef91af729f0adcc9bef785873168 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Tue, 30 Aug 2022 19:05:49 +0800
Subject: virtio_test: fixup for vq reset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix virtio test compilation failure caused by vq reset.

../../drivers/virtio/virtio_ring.c: In function ‘vring_create_virtqueue_packed’:
../../drivers/virtio/virtio_ring.c:1999:8: error: ‘struct virtqueue’ has no member named ‘reset’
 1999 |  vq->vq.reset = false;
      |        ^
../../drivers/virtio/virtio_ring.c: In function ‘__vring_new_virtqueue’:
../../drivers/virtio/virtio_ring.c:2493:8: error: ‘struct virtqueue’ has no member named ‘reset’
 2493 |  vq->vq.reset = false;
      |        ^
../../drivers/virtio/virtio_ring.c: In function ‘virtqueue_resize’:
../../drivers/virtio/virtio_ring.c:2587:18: error: ‘struct virtqueue’ has no member named ‘num_max’
 2587 |  if (num > vq->vq.num_max)
      |                  ^
../../drivers/virtio/virtio_ring.c:2596:11: error: ‘struct virtio_device’ has no member named ‘config’
 2596 |  if (!vdev->config->disable_vq_and_reset)
      |           ^~
../../drivers/virtio/virtio_ring.c:2599:11: error: ‘struct virtio_device’ has no member named ‘config’
 2599 |  if (!vdev->config->enable_vq_after_reset)
      |           ^~
../../drivers/virtio/virtio_ring.c:2602:12: error: ‘struct virtio_device’ has no member named ‘config’
 2602 |  err = vdev->config->disable_vq_and_reset(_vq);
      |            ^~
../../drivers/virtio/virtio_ring.c:2614:10: error: ‘struct virtio_device’ has no member named ‘config’
 2614 |  if (vdev->config->enable_vq_after_reset(_vq))
      |          ^~
make: *** [<builtin>: virtio_ring.o] Error 1

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Message-Id: <20220830110549.103168-1-xuanzhuo@linux.alibaba.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tools/virtio/linux/virtio.h        | 3 +++
 tools/virtio/linux/virtio_config.h | 5 +++++
 2 files changed, 8 insertions(+)

diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
index 363b98228301..5d3440f474dd 100644
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -14,6 +14,7 @@ struct virtio_device {
 	u64 features;
 	struct list_head vqs;
 	spinlock_t vqs_list_lock;
+	const struct virtio_config_ops *config;
 };
 
 struct virtqueue {
@@ -23,7 +24,9 @@ struct virtqueue {
 	struct virtio_device *vdev;
         unsigned int index;
         unsigned int num_free;
+	unsigned int num_max;
 	void *priv;
+	bool reset;
 };
 
 /* Interfaces exported by virtio_ring. */
diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h
index f2640e505c4e..2a8a70e2a950 100644
--- a/tools/virtio/linux/virtio_config.h
+++ b/tools/virtio/linux/virtio_config.h
@@ -3,6 +3,11 @@
 #include <linux/virtio.h>
 #include <uapi/linux/virtio_config.h>
 
+struct virtio_config_ops {
+	int (*disable_vq_and_reset)(struct virtqueue *vq);
+	int (*enable_vq_after_reset)(struct virtqueue *vq);
+};
+
 /*
  * __virtio_test_bit - helper to test feature bits. For use by transports.
  *                     Devices should normally use virtio_has_feature,
-- 
cgit v1.2.3


From 37fafe6b61e4f15d977982635bb785f4e605f7cd Mon Sep 17 00:00:00 2001
From: Suwan Kim <suwan.kim027@gmail.com>
Date: Wed, 31 Aug 2022 00:01:53 +0900
Subject: virtio-blk: Fix WARN_ON_ONCE in virtio_queue_rq()

If a request fails at virtio_queue_rqs(), it is inserted to requeue_list
and passed to virtio_queue_rq(). Then blk_mq_start_request() can be called
again at virtio_queue_rq() and trigger WARN_ON_ONCE like below trace because
request state was already set to MQ_RQ_IN_FLIGHT in virtio_queue_rqs()
despite the failure.

[    1.890468] ------------[ cut here ]------------
[    1.890776] WARNING: CPU: 2 PID: 122 at block/blk-mq.c:1143
blk_mq_start_request+0x8a/0xe0
[    1.891045] Modules linked in:
[    1.891250] CPU: 2 PID: 122 Comm: journal-offline Not tainted 5.19.0+ #44
[    1.891504] Hardware name: ChromiumOS crosvm, BIOS 0
[    1.891739] RIP: 0010:blk_mq_start_request+0x8a/0xe0
[    1.891961] Code: 12 80 74 22 48 8b 4b 10 8b 89 64 01 00 00 8b 53
20 83 fa ff 75 08 ba 00 00 00 80 0b 53 24 c1 e1 10 09 d1 89 48 34 5b
41 5e c3 <0f> 0b eb b8 65 8b 05 2b 39 b6 7e 89 c0 48 0f a3 05 39 77 5b
01 0f
[    1.892443] RSP: 0018:ffffc900002777b0 EFLAGS: 00010202
[    1.892673] RAX: 0000000000000000 RBX: ffff888004bc0000 RCX: 0000000000000000
[    1.892952] RDX: 0000000000000000 RSI: ffff888003d7c200 RDI: ffff888004bc0000
[    1.893228] RBP: 0000000000000000 R08: 0000000000000001 R09: ffff888004bc0100
[    1.893506] R10: ffffffffffffffff R11: ffffffff8185ca10 R12: ffff888004bc0000
[    1.893797] R13: ffffc90000277900 R14: ffff888004ab2340 R15: ffff888003d86e00
[    1.894060] FS:  00007ffa143a4640(0000) GS:ffff88807dd00000(0000)
knlGS:0000000000000000
[    1.894412] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    1.894682] CR2: 00005648577d9088 CR3: 00000000053da004 CR4: 0000000000170ee0
[    1.894953] Call Trace:
[    1.895139]  <TASK>
[    1.895303]  virtblk_prep_rq+0x1e5/0x280
[    1.895509]  virtio_queue_rq+0x5c/0x310
[    1.895710]  ? virtqueue_add_sgs+0x95/0xb0
[    1.895905]  ? _raw_spin_unlock_irqrestore+0x16/0x30
[    1.896133]  ? virtio_queue_rqs+0x340/0x390
[    1.896453]  ? sbitmap_get+0xfa/0x220
[    1.896678]  __blk_mq_issue_directly+0x41/0x180
[    1.896906]  blk_mq_plug_issue_direct+0xd8/0x2c0
[    1.897115]  blk_mq_flush_plug_list+0x115/0x180
[    1.897342]  blk_add_rq_to_plug+0x51/0x130
[    1.897543]  blk_mq_submit_bio+0x3a1/0x570
[    1.897750]  submit_bio_noacct_nocheck+0x418/0x520
[    1.897985]  ? submit_bio_noacct+0x1e/0x260
[    1.897989]  ext4_bio_write_page+0x222/0x420
[    1.898000]  mpage_process_page_bufs+0x178/0x1c0
[    1.899451]  mpage_prepare_extent_to_map+0x2d2/0x440
[    1.899603]  ext4_writepages+0x495/0x1020
[    1.899733]  do_writepages+0xcb/0x220
[    1.899871]  ? __seccomp_filter+0x171/0x7e0
[    1.900006]  file_write_and_wait_range+0xcd/0xf0
[    1.900167]  ext4_sync_file+0x72/0x320
[    1.900308]  __x64_sys_fsync+0x66/0xa0
[    1.900449]  do_syscall_64+0x31/0x50
[    1.900595]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[    1.900747] RIP: 0033:0x7ffa16ec96ea
[    1.900883] Code: b8 4a 00 00 00 0f 05 48 3d 00 f0 ff ff 77 41 c3
48 83 ec 18 89 7c 24 0c e8 e3 02 f8 ff 8b 7c 24 0c 89 c2 b8 4a 00 00
00 0f 05 <48> 3d 00 f0 ff ff 77 36 89 d7 89 44 24 0c e8 43 03 f8 ff 8b
44 24
[    1.901302] RSP: 002b:00007ffa143a3ac0 EFLAGS: 00000293 ORIG_RAX:
000000000000004a
[    1.901499] RAX: ffffffffffffffda RBX: 0000560277ec6fe0 RCX: 00007ffa16ec96ea
[    1.901696] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000016
[    1.901884] RBP: 0000560277ec5910 R08: 0000000000000000 R09: 00007ffa143a4640
[    1.902082] R10: 00007ffa16e4d39e R11: 0000000000000293 R12: 00005602773f59e0
[    1.902459] R13: 0000000000000000 R14: 00007fffbfc007ff R15: 00007ffa13ba4000
[    1.902763]  </TASK>
[    1.902877] ---[ end trace 0000000000000000 ]---

To avoid calling blk_mq_start_request() twice, This patch moves the
execution of blk_mq_start_request() to the end of virtblk_prep_rq().
And instead of requeuing failed request to plug list in the error path of
virtblk_add_req_batch(), it uses blk_mq_requeue_request() to change failed
request state to MQ_RQ_IDLE. Then virtblk can safely handle the request
on the next trial.

Fixes: 0e9911fa768f ("virtio-blk: support mq_ops->queue_rqs()")
Reported-by: Alexandre Courbot <acourbot@chromium.org>
Tested-by: Alexandre Courbot <acourbot@chromium.org>
Signed-off-by: Suwan Kim <suwan.kim027@gmail.com>
Message-Id: <20220830150153.12627-1-suwan.kim027@gmail.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Pankaj Raghav <p.raghav@samsung.com>
---
 drivers/block/virtio_blk.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 30255fcaf181..dd9a05174726 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -322,14 +322,14 @@ static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(status))
 		return status;
 
-	blk_mq_start_request(req);
-
 	vbr->sg_table.nents = virtblk_map_data(hctx, req, vbr);
 	if (unlikely(vbr->sg_table.nents < 0)) {
 		virtblk_cleanup_cmd(req);
 		return BLK_STS_RESOURCE;
 	}
 
+	blk_mq_start_request(req);
+
 	return BLK_STS_OK;
 }
 
@@ -391,8 +391,7 @@ static bool virtblk_prep_rq_batch(struct request *req)
 }
 
 static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
-					struct request **rqlist,
-					struct request **requeue_list)
+					struct request **rqlist)
 {
 	unsigned long flags;
 	int err;
@@ -408,7 +407,7 @@ static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
 		if (err) {
 			virtblk_unmap_data(req, vbr);
 			virtblk_cleanup_cmd(req);
-			rq_list_add(requeue_list, req);
+			blk_mq_requeue_request(req, true);
 		}
 	}
 
@@ -436,7 +435,7 @@ static void virtio_queue_rqs(struct request **rqlist)
 
 		if (!next || req->mq_hctx != next->mq_hctx) {
 			req->rq_next = NULL;
-			kick = virtblk_add_req_batch(vq, rqlist, &requeue_list);
+			kick = virtblk_add_req_batch(vq, rqlist);
 			if (kick)
 				virtqueue_notify(vq->vq);
 
-- 
cgit v1.2.3


From 46f8a29272e51b6df7393d58fc5cb8967397ef2b Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Wed, 31 Aug 2022 17:49:23 +0200
Subject: vduse: prevent uninitialized memory accesses

If the VDUSE application provides a smaller config space
than the driver expects, the driver may use uninitialized
memory from the stack.

This patch prevents it by initializing the buffer passed by
the driver to store the config value.

This fix addresses CVE-2022-2308.

Cc: stable@vger.kernel.org # v5.15+
Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace")
Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20220831154923.97809-1-maxime.coquelin@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 41c0b29739f1..35dceee3ed56 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -673,10 +673,15 @@ static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset,
 {
 	struct vduse_dev *dev = vdpa_to_vduse(vdpa);
 
-	if (offset > dev->config_size ||
-	    len > dev->config_size - offset)
+	/* Initialize the buffer in case of partial copy. */
+	memset(buf, 0, len);
+
+	if (offset > dev->config_size)
 		return;
 
+	if (len > dev->config_size - offset)
+		len = dev->config_size - offset;
+
 	memcpy(buf, dev->config + offset, len);
 }
 
-- 
cgit v1.2.3


From a43ae8057cc154fd26a3a23c0e8643bef104d995 Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Mon, 12 Sep 2022 15:50:19 +0300
Subject: vdpa/mlx5: Fix MQ to support non power of two num queues

RQT objects require that a power of two value be configured for both
rqt_max_size and rqt_actual size.

For create_rqt, make sure to round up to the power of two the value of
given by the user who created the vdpa device and given by
ndev->rqt_size. The actual size is also rounded up to the power of two
using the current number of VQs given by ndev->cur_num_vqs.

Same goes with modify_rqt where we need to make sure act size is power
of two based on the new number of QPs.

Without this patch, attempt to create a device with non power of two QPs
would result in error from firmware.

Fixes: 52893733f2c5 ("vdpa/mlx5: Add multiqueue support")
Signed-off-by: Eli Cohen <elic@nvidia.com>
Message-Id: <20220912125019.833708-1-elic@nvidia.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index ed100a35e596..90913365def4 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1320,6 +1320,8 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *
 
 static int create_rqt(struct mlx5_vdpa_net *ndev)
 {
+	int rqt_table_size = roundup_pow_of_two(ndev->rqt_size);
+	int act_sz = roundup_pow_of_two(ndev->cur_num_vqs / 2);
 	__be32 *list;
 	void *rqtc;
 	int inlen;
@@ -1327,7 +1329,7 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
 	int i, j;
 	int err;
 
-	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + ndev->rqt_size * MLX5_ST_SZ_BYTES(rq_num);
+	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + rqt_table_size * MLX5_ST_SZ_BYTES(rq_num);
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1336,12 +1338,12 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
-	MLX5_SET(rqtc, rqtc, rqt_max_size, ndev->rqt_size);
+	MLX5_SET(rqtc, rqtc, rqt_max_size, rqt_table_size);
 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
-	for (i = 0, j = 0; i < ndev->rqt_size; i++, j += 2)
+	for (i = 0, j = 0; i < act_sz; i++, j += 2)
 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
 
-	MLX5_SET(rqtc, rqtc, rqt_actual_size, ndev->rqt_size);
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
 	kfree(in);
 	if (err)
@@ -1354,6 +1356,7 @@ static int create_rqt(struct mlx5_vdpa_net *ndev)
 
 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
 {
+	int act_sz = roundup_pow_of_two(num / 2);
 	__be32 *list;
 	void *rqtc;
 	int inlen;
@@ -1361,7 +1364,7 @@ static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
 	int i, j;
 	int err;
 
-	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + ndev->rqt_size * MLX5_ST_SZ_BYTES(rq_num);
+	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + act_sz * MLX5_ST_SZ_BYTES(rq_num);
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
 		return -ENOMEM;
@@ -1372,10 +1375,10 @@ static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
 
 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
-	for (i = 0, j = 0; i < ndev->rqt_size; i++, j += 2)
+	for (i = 0, j = 0; i < act_sz; i++, j = j + 2)
 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
 
-	MLX5_SET(rqtc, rqtc, rqt_actual_size, ndev->rqt_size);
+	MLX5_SET(rqtc, rqtc, rqt_actual_size, act_sz);
 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
 	kfree(in);
 	if (err)
-- 
cgit v1.2.3


From 116523c8fac05d1d26f748fee7919a4ec5df67ea Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 26 Sep 2022 15:44:42 -0700
Subject: Bluetooth: hci_core: Fix not handling link timeouts propertly

Change that introduced the use of __check_timeout did not account for
link types properly, it always assumes ACL_LINK is used thus causing
hdev->acl_last_tx to be used even in case of LE_LINK and then again
uses ACL_LINK with hci_link_tx_to.

To fix this __check_timeout now takes the link type as parameter and
then procedure to use the right last_tx based on the link type and pass
it to hci_link_tx_to.

Fixes: 1b1d29e51499 ("Bluetooth: Make use of __check_timeout on hci_sched_le")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Tested-by: David Beinder <david@beinder.at>
---
 net/bluetooth/hci_core.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 66c7cdba0d32..063fbb8e07ca 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3485,15 +3485,27 @@ static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb)
 	return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len);
 }
 
-static void __check_timeout(struct hci_dev *hdev, unsigned int cnt)
+static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type)
 {
-	if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
-		/* ACL tx timeout must be longer than maximum
-		 * link supervision timeout (40.9 seconds) */
-		if (!cnt && time_after(jiffies, hdev->acl_last_tx +
-				       HCI_ACL_TX_TIMEOUT))
-			hci_link_tx_to(hdev, ACL_LINK);
+	unsigned long last_tx;
+
+	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+		return;
+
+	switch (type) {
+	case LE_LINK:
+		last_tx = hdev->le_last_tx;
+		break;
+	default:
+		last_tx = hdev->acl_last_tx;
+		break;
 	}
+
+	/* tx timeout must be longer than maximum link supervision timeout
+	 * (40.9 seconds)
+	 */
+	if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT))
+		hci_link_tx_to(hdev, type);
 }
 
 /* Schedule SCO */
@@ -3551,7 +3563,7 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev)
 	struct sk_buff *skb;
 	int quote;
 
-	__check_timeout(hdev, cnt);
+	__check_timeout(hdev, cnt, ACL_LINK);
 
 	while (hdev->acl_cnt &&
 	       (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) {
@@ -3594,8 +3606,6 @@ static void hci_sched_acl_blk(struct hci_dev *hdev)
 	int quote;
 	u8 type;
 
-	__check_timeout(hdev, cnt);
-
 	BT_DBG("%s", hdev->name);
 
 	if (hdev->dev_type == HCI_AMP)
@@ -3603,6 +3613,8 @@ static void hci_sched_acl_blk(struct hci_dev *hdev)
 	else
 		type = ACL_LINK;
 
+	__check_timeout(hdev, cnt, type);
+
 	while (hdev->block_cnt > 0 &&
 	       (chan = hci_chan_sent(hdev, type, &quote))) {
 		u32 priority = (skb_peek(&chan->data_q))->priority;
@@ -3676,7 +3688,7 @@ static void hci_sched_le(struct hci_dev *hdev)
 
 	cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt;
 
-	__check_timeout(hdev, cnt);
+	__check_timeout(hdev, cnt, LE_LINK);
 
 	tmp = cnt;
 	while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
-- 
cgit v1.2.3


From b66c874fdb6653aefb3019aeaa9b9f8c1aadd6a0 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Wed, 27 Jul 2022 17:58:20 +0200
Subject: kallsyms: use `ARRAY_SIZE` instead of hardcoded size

This removes one place where the `500` constant is hardcoded.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Geert Stappers <stappers@stappers.nl>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/kallsyms.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index f18e6dfc68c5..8551513f9311 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -206,7 +206,7 @@ static struct sym_entry *read_symbol(FILE *in)
 
 	rc = fscanf(in, "%llx %c %499s\n", &addr, &type, name);
 	if (rc != 3) {
-		if (rc != EOF && fgets(name, 500, in) == NULL)
+		if (rc != EOF && fgets(name, ARRAY_SIZE(name), in) == NULL)
 			fprintf(stderr, "Read error or end of file.\n");
 		return NULL;
 	}
-- 
cgit v1.2.3


From b471927ebf9bb54ba6e99f20848d70193e645eed Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Wed, 27 Jul 2022 20:54:19 +0200
Subject: kallsyms: avoid hardcoding buffer size

This introduces `KSYM_NAME_LEN_BUFFER` in place of the previously
hardcoded size of the input buffer.

It will also make it easier to update the size in a single place
in a later patch.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/kallsyms.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 8551513f9311..25e2fe5fbcd4 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -27,8 +27,14 @@
 
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
 
+#define _stringify_1(x)	#x
+#define _stringify(x)	_stringify_1(x)
+
 #define KSYM_NAME_LEN		128
 
+/* A substantially bigger size than the current maximum. */
+#define KSYM_NAME_LEN_BUFFER	499
+
 struct sym_entry {
 	unsigned long long addr;
 	unsigned int len;
@@ -198,13 +204,13 @@ static void check_symbol_range(const char *sym, unsigned long long addr,
 
 static struct sym_entry *read_symbol(FILE *in)
 {
-	char name[500], type;
+	char name[KSYM_NAME_LEN_BUFFER+1], type;
 	unsigned long long addr;
 	unsigned int len;
 	struct sym_entry *sym;
 	int rc;
 
-	rc = fscanf(in, "%llx %c %499s\n", &addr, &type, name);
+	rc = fscanf(in, "%llx %c %" _stringify(KSYM_NAME_LEN_BUFFER) "s\n", &addr, &type, name);
 	if (rc != 3) {
 		if (rc != EOF && fgets(name, ARRAY_SIZE(name), in) == NULL)
 			fprintf(stderr, "Read error or end of file.\n");
-- 
cgit v1.2.3


From 6e8c5bbd5e83e649251c198e743c8b9e7c48372b Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Wed, 27 Jul 2022 21:41:56 +0200
Subject: kallsyms: add static relationship between `KSYM_NAME_LEN{,_BUFFER}`

This adds a static assert to ensure `KSYM_NAME_LEN_BUFFER`
gets updated when `KSYM_NAME_LEN` changes.

The relationship used is one that keeps the new size (512+1)
close to the original buffer size (500).

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/kallsyms.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 25e2fe5fbcd4..411ff5058b51 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -32,8 +32,18 @@
 
 #define KSYM_NAME_LEN		128
 
-/* A substantially bigger size than the current maximum. */
-#define KSYM_NAME_LEN_BUFFER	499
+/*
+ * A substantially bigger size than the current maximum.
+ *
+ * It cannot be defined as an expression because it gets stringified
+ * for the fscanf() format string. Therefore, a _Static_assert() is
+ * used instead to maintain the relationship with KSYM_NAME_LEN.
+ */
+#define KSYM_NAME_LEN_BUFFER	512
+_Static_assert(
+	KSYM_NAME_LEN_BUFFER == KSYM_NAME_LEN * 4,
+	"Please keep KSYM_NAME_LEN_BUFFER in sync with KSYM_NAME_LEN"
+);
 
 struct sym_entry {
 	unsigned long long addr;
-- 
cgit v1.2.3


From 73bbb94466fd3f8b313eeb0b0467314a262dddb3 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Mon, 5 Apr 2021 04:58:39 +0200
Subject: kallsyms: support "big" kernel symbols

Rust symbols can become quite long due to namespacing introduced
by modules, types, traits, generics, etc.

Increasing to 255 is not enough in some cases, therefore
introduce longer lengths to the symbol table.

In order to avoid increasing all lengths to 2 bytes (since most
of them are small, including many Rust ones), use ULEB128 to
keep smaller symbols in 1 byte, with the rest in 2 bytes.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 kernel/kallsyms.c  | 26 ++++++++++++++++++++++----
 scripts/kallsyms.c | 29 ++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3e7e2c2ad2f7..fc5e26348d25 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -50,12 +50,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off,
 	data = &kallsyms_names[off];
 	len = *data;
 	data++;
+	off++;
+
+	/* If MSB is 1, it is a "big" symbol, so needs an additional byte. */
+	if ((len & 0x80) != 0) {
+		len = (len & 0x7F) | (*data << 7);
+		data++;
+		off++;
+	}
 
 	/*
 	 * Update the offset to return the offset for the next symbol on
 	 * the compressed stream.
 	 */
-	off += len + 1;
+	off += len;
 
 	/*
 	 * For every byte on the compressed symbol data, copy the table
@@ -108,7 +116,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
 static unsigned int get_symbol_offset(unsigned long pos)
 {
 	const u8 *name;
-	int i;
+	int i, len;
 
 	/*
 	 * Use the closest marker we have. We have markers every 256 positions,
@@ -122,8 +130,18 @@ static unsigned int get_symbol_offset(unsigned long pos)
 	 * so we just need to add the len to the current pointer for every
 	 * symbol we wish to skip.
 	 */
-	for (i = 0; i < (pos & 0xFF); i++)
-		name = name + (*name) + 1;
+	for (i = 0; i < (pos & 0xFF); i++) {
+		len = *name;
+
+		/*
+		 * If MSB is 1, it is a "big" symbol, so we need to look into
+		 * the next byte (and skip it, too).
+		 */
+		if ((len & 0x80) != 0)
+			len = ((len & 0x7F) | (name[1] << 7)) + 1;
+
+		name = name + len + 1;
+	}
 
 	return name - kallsyms_names;
 }
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 411ff5058b51..6502c4001f01 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -487,12 +487,35 @@ static void write_src(void)
 		if ((i & 0xFF) == 0)
 			markers[i >> 8] = off;
 
-		printf("\t.byte 0x%02x", table[i]->len);
+		/* There cannot be any symbol of length zero. */
+		if (table[i]->len == 0) {
+			fprintf(stderr, "kallsyms failure: "
+				"unexpected zero symbol length\n");
+			exit(EXIT_FAILURE);
+		}
+
+		/* Only lengths that fit in up-to-two-byte ULEB128 are supported. */
+		if (table[i]->len > 0x3FFF) {
+			fprintf(stderr, "kallsyms failure: "
+				"unexpected huge symbol length\n");
+			exit(EXIT_FAILURE);
+		}
+
+		/* Encode length with ULEB128. */
+		if (table[i]->len <= 0x7F) {
+			/* Most symbols use a single byte for the length. */
+			printf("\t.byte 0x%02x", table[i]->len);
+			off += table[i]->len + 1;
+		} else {
+			/* "Big" symbols use two bytes. */
+			printf("\t.byte 0x%02x, 0x%02x",
+				(table[i]->len & 0x7F) | 0x80,
+				(table[i]->len >> 7) & 0x7F);
+			off += table[i]->len + 2;
+		}
 		for (k = 0; k < table[i]->len; k++)
 			printf(", 0x%02x", table[i]->sym[k]);
 		printf("\n");
-
-		off += table[i]->len + 1;
 	}
 	printf("\n");
 
-- 
cgit v1.2.3


From b8a94bfb33952bb17fbc65f8903d242a721c533d Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Mon, 5 Apr 2021 05:03:50 +0200
Subject: kallsyms: increase maximum kernel symbol length to 512

Rust symbols can become quite long due to namespacing introduced
by modules, types, traits, generics, etc. For instance,
the following code:

    pub mod my_module {
        pub struct MyType;
        pub struct MyGenericType<T>(T);

        pub trait MyTrait {
            fn my_method() -> u32;
        }

        impl MyTrait for MyGenericType<MyType> {
            fn my_method() -> u32 {
                42
            }
        }
    }

generates a symbol of length 96 when using the upcoming v0 mangling scheme:

    _RNvXNtCshGpAVYOtgW1_7example9my_moduleINtB2_13MyGenericTypeNtB2_6MyTypeENtB2_7MyTrait9my_method

At the moment, Rust symbols may reach up to 300 in length.
Setting 512 as the maximum seems like a reasonable choice to
keep some headroom.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 include/linux/kallsyms.h            | 2 +-
 kernel/livepatch/core.c             | 4 ++--
 scripts/kallsyms.c                  | 4 ++--
 tools/include/linux/kallsyms.h      | 2 +-
 tools/lib/perf/include/perf/event.h | 2 +-
 tools/lib/symbol/kallsyms.h         | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index ad39636e0c3f..649faac31ddb 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -15,7 +15,7 @@
 
 #include <asm/sections.h>
 
-#define KSYM_NAME_LEN 128
+#define KSYM_NAME_LEN 512
 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \
 			(KSYM_NAME_LEN - 1) + \
 			2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index bc475e62279d..ec06ce59d728 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -213,7 +213,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
 	 * we use the smallest/strictest upper bound possible (56, based on
 	 * the current definition of MODULE_NAME_LEN) to prevent overflows.
 	 */
-	BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+	BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512);
 
 	relas = (Elf_Rela *) relasec->sh_addr;
 	/* For each rela in this klp relocation section */
@@ -227,7 +227,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
 
 		/* Format: .klp.sym.sym_objname.sym_name,sympos */
 		cnt = sscanf(strtab + sym->st_name,
-			     ".klp.sym.%55[^.].%127[^,],%lu",
+			     ".klp.sym.%55[^.].%511[^,],%lu",
 			     sym_objname, sym_name, &sympos);
 		if (cnt != 3) {
 			pr_err("symbol %s has an incorrectly formatted name\n",
diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index 6502c4001f01..c4793301a27e 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -30,7 +30,7 @@
 #define _stringify_1(x)	#x
 #define _stringify(x)	_stringify_1(x)
 
-#define KSYM_NAME_LEN		128
+#define KSYM_NAME_LEN		512
 
 /*
  * A substantially bigger size than the current maximum.
@@ -39,7 +39,7 @@
  * for the fscanf() format string. Therefore, a _Static_assert() is
  * used instead to maintain the relationship with KSYM_NAME_LEN.
  */
-#define KSYM_NAME_LEN_BUFFER	512
+#define KSYM_NAME_LEN_BUFFER	2048
 _Static_assert(
 	KSYM_NAME_LEN_BUFFER == KSYM_NAME_LEN * 4,
 	"Please keep KSYM_NAME_LEN_BUFFER in sync with KSYM_NAME_LEN"
diff --git a/tools/include/linux/kallsyms.h b/tools/include/linux/kallsyms.h
index efb6c3f5f2a9..5a37ccbec54f 100644
--- a/tools/include/linux/kallsyms.h
+++ b/tools/include/linux/kallsyms.h
@@ -6,7 +6,7 @@
 #include <stdio.h>
 #include <unistd.h>
 
-#define KSYM_NAME_LEN 128
+#define KSYM_NAME_LEN 512
 
 struct module;
 
diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h
index 93bf93a59c99..d8ae4e944467 100644
--- a/tools/lib/perf/include/perf/event.h
+++ b/tools/lib/perf/include/perf/event.h
@@ -97,7 +97,7 @@ struct perf_record_throttle {
 };
 
 #ifndef KSYM_NAME_LEN
-#define KSYM_NAME_LEN 256
+#define KSYM_NAME_LEN 512
 #endif
 
 struct perf_record_ksymbol {
diff --git a/tools/lib/symbol/kallsyms.h b/tools/lib/symbol/kallsyms.h
index 72ab9870454b..542f9b059c3b 100644
--- a/tools/lib/symbol/kallsyms.h
+++ b/tools/lib/symbol/kallsyms.h
@@ -7,7 +7,7 @@
 #include <linux/types.h>
 
 #ifndef KSYM_NAME_LEN
-#define KSYM_NAME_LEN 256
+#define KSYM_NAME_LEN 512
 #endif
 
 static inline u8 kallsyms2elf_binding(char type)
-- 
cgit v1.2.3


From 12f577216a490b7afe809f53a11ecc734a3a9287 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 16:52:41 +0200
Subject: rust: add C helpers

Introduces the source file that will contain forwarders to C macros
and inlined functions.

Initially this only contains a single helper, but will gain more as
more functionality is added to the `kernel` crate in the future.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Geoffrey Thomas <geofft@ldpreload.com>
Signed-off-by: Geoffrey Thomas <geofft@ldpreload.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Sven Van Asbroeck <thesven73@gmail.com>
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Maciej Falkowski <m.falkowski@samsung.com>
Signed-off-by: Maciej Falkowski <m.falkowski@samsung.com>
Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/helpers.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 rust/helpers.c

diff --git a/rust/helpers.c b/rust/helpers.c
new file mode 100644
index 000000000000..b4f15eee2ffd
--- /dev/null
+++ b/rust/helpers.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Non-trivial C macros cannot be used in Rust. Similarly, inlined C functions
+ * cannot be called either. This file explicitly creates functions ("helpers")
+ * that wrap those so that they can be called from Rust.
+ *
+ * Even though Rust kernel modules should never use directly the bindings, some
+ * of these helpers need to be exported because Rust generics and inlined
+ * functions may not get their code generated in the crate where they are
+ * defined. Other helpers, called from non-inline functions, may not be
+ * exported, in principle. However, in general, the Rust compiler does not
+ * guarantee codegen will be performed for a non-inline function either.
+ * Therefore, this file exports all the helpers. In the future, this may be
+ * revisited to reduce the number of exports after the compiler is informed
+ * about the places codegen is required.
+ *
+ * All symbols are exported as GPL-only to guarantee no GPL-only feature is
+ * accidentally exposed.
+ */
+
+#include <linux/bug.h>
+#include <linux/build_bug.h>
+
+__noreturn void rust_helper_BUG(void)
+{
+	BUG();
+}
+EXPORT_SYMBOL_GPL(rust_helper_BUG);
+
+/*
+ * We use `bindgen`'s `--size_t-is-usize` option to bind the C `size_t` type
+ * as the Rust `usize` type, so we can use it in contexts where Rust
+ * expects a `usize` like slice (array) indices. `usize` is defined to be
+ * the same as C's `uintptr_t` type (can hold any pointer) but not
+ * necessarily the same as `size_t` (can hold the size of any single
+ * object). Most modern platforms use the same concrete integer type for
+ * both of them, but in case we find ourselves on a platform where
+ * that's not true, fail early instead of risking ABI or
+ * integer-overflow issues.
+ *
+ * If your platform fails this assertion, it means that you are in
+ * danger of integer-overflow bugs (even if you attempt to remove
+ * `--size_t-is-usize`). It may be easiest to change the kernel ABI on
+ * your platform such that `size_t` matches `uintptr_t` (i.e., to increase
+ * `size_t`, because `uintptr_t` has to be at least as big as `size_t`).
+ */
+static_assert(
+	sizeof(size_t) == sizeof(uintptr_t) &&
+	__alignof__(size_t) == __alignof__(uintptr_t),
+	"Rust code expects C `size_t` to match Rust `usize`"
+);
-- 
cgit v1.2.3


From 753dece88d70a23b015e01674a662e683235c08f Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Fri, 6 May 2022 17:52:44 +0200
Subject: rust: import upstream `alloc` crate

This is a subset of the Rust standard library `alloc` crate,
version 1.62.0, licensed under "Apache-2.0 OR MIT", from:

    https://github.com/rust-lang/rust/tree/1.62.0/library/alloc/src

The files are copied as-is, with no modifications whatsoever
(not even adding the SPDX identifiers).

For copyright details, please see:

    https://github.com/rust-lang/rust/blob/1.62.0/COPYRIGHT

The next patch modifies these files as needed for use within
the kernel. This patch split allows reviewers to double-check
the import and to clearly see the differences introduced.

Vendoring `alloc`, at least for the moment, allows us to have fallible
allocations support (i.e. the `try_*` versions of methods which return
a `Result` instead of panicking) early on. It also gives a bit more
freedom to experiment with new interfaces and to iterate quickly.

Eventually, the goal is to have everything the kernel needs in
upstream `alloc` and drop it from the kernel tree.

For a summary of work on `alloc` happening upstream, please see:

    https://github.com/Rust-for-Linux/linux/issues/408

The following script may be used to verify the contents:

    for path in $(cd rust/alloc/ && find . -type f -name '*.rs'); do
        curl --silent --show-error --location \
            https://github.com/rust-lang/rust/raw/1.62.0/library/alloc/src/$path \
            | diff --unified rust/alloc/$path - && echo $path: OK
    done

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/alloc/alloc.rs            |  438 ++++++
 rust/alloc/borrow.rs           |  496 +++++++
 rust/alloc/boxed.rs            | 2024 ++++++++++++++++++++++++++
 rust/alloc/collections/mod.rs  |  154 ++
 rust/alloc/lib.rs              |  236 +++
 rust/alloc/raw_vec.rs          |  518 +++++++
 rust/alloc/slice.rs            | 1202 ++++++++++++++++
 rust/alloc/vec/drain.rs        |  184 +++
 rust/alloc/vec/drain_filter.rs |  143 ++
 rust/alloc/vec/into_iter.rs    |  362 +++++
 rust/alloc/vec/is_zero.rs      |  118 ++
 rust/alloc/vec/mod.rs          | 3115 ++++++++++++++++++++++++++++++++++++++++
 rust/alloc/vec/partial_eq.rs   |   47 +
 13 files changed, 9037 insertions(+)
 create mode 100644 rust/alloc/alloc.rs
 create mode 100644 rust/alloc/borrow.rs
 create mode 100644 rust/alloc/boxed.rs
 create mode 100644 rust/alloc/collections/mod.rs
 create mode 100644 rust/alloc/lib.rs
 create mode 100644 rust/alloc/raw_vec.rs
 create mode 100644 rust/alloc/slice.rs
 create mode 100644 rust/alloc/vec/drain.rs
 create mode 100644 rust/alloc/vec/drain_filter.rs
 create mode 100644 rust/alloc/vec/into_iter.rs
 create mode 100644 rust/alloc/vec/is_zero.rs
 create mode 100644 rust/alloc/vec/mod.rs
 create mode 100644 rust/alloc/vec/partial_eq.rs

diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs
new file mode 100644
index 000000000000..6162b5c6d4c9
--- /dev/null
+++ b/rust/alloc/alloc.rs
@@ -0,0 +1,438 @@
+//! Memory allocation APIs
+
+#![stable(feature = "alloc_module", since = "1.28.0")]
+
+#[cfg(not(test))]
+use core::intrinsics;
+use core::intrinsics::{min_align_of_val, size_of_val};
+
+use core::ptr::Unique;
+#[cfg(not(test))]
+use core::ptr::{self, NonNull};
+
+#[stable(feature = "alloc_module", since = "1.28.0")]
+#[doc(inline)]
+pub use core::alloc::*;
+
+use core::marker::Destruct;
+
+#[cfg(test)]
+mod tests;
+
+extern "Rust" {
+    // These are the magic symbols to call the global allocator.  rustc generates
+    // them to call `__rg_alloc` etc. if there is a `#[global_allocator]` attribute
+    // (the code expanding that attribute macro generates those functions), or to call
+    // the default implementations in libstd (`__rdl_alloc` etc. in `library/std/src/alloc.rs`)
+    // otherwise.
+    // The rustc fork of LLVM also special-cases these function names to be able to optimize them
+    // like `malloc`, `realloc`, and `free`, respectively.
+    #[rustc_allocator]
+    #[rustc_allocator_nounwind]
+    fn __rust_alloc(size: usize, align: usize) -> *mut u8;
+    #[rustc_allocator_nounwind]
+    fn __rust_dealloc(ptr: *mut u8, size: usize, align: usize);
+    #[rustc_allocator_nounwind]
+    fn __rust_realloc(ptr: *mut u8, old_size: usize, align: usize, new_size: usize) -> *mut u8;
+    #[rustc_allocator_nounwind]
+    fn __rust_alloc_zeroed(size: usize, align: usize) -> *mut u8;
+}
+
+/// The global memory allocator.
+///
+/// This type implements the [`Allocator`] trait by forwarding calls
+/// to the allocator registered with the `#[global_allocator]` attribute
+/// if there is one, or the `std` crate’s default.
+///
+/// Note: while this type is unstable, the functionality it provides can be
+/// accessed through the [free functions in `alloc`](self#functions).
+#[unstable(feature = "allocator_api", issue = "32838")]
+#[derive(Copy, Clone, Default, Debug)]
+#[cfg(not(test))]
+pub struct Global;
+
+#[cfg(test)]
+pub use std::alloc::Global;
+
+/// Allocate memory with the global allocator.
+///
+/// This function forwards calls to the [`GlobalAlloc::alloc`] method
+/// of the allocator registered with the `#[global_allocator]` attribute
+/// if there is one, or the `std` crate’s default.
+///
+/// This function is expected to be deprecated in favor of the `alloc` method
+/// of the [`Global`] type when it and the [`Allocator`] trait become stable.
+///
+/// # Safety
+///
+/// See [`GlobalAlloc::alloc`].
+///
+/// # Examples
+///
+/// ```
+/// use std::alloc::{alloc, dealloc, Layout};
+///
+/// unsafe {
+///     let layout = Layout::new::<u16>();
+///     let ptr = alloc(layout);
+///
+///     *(ptr as *mut u16) = 42;
+///     assert_eq!(*(ptr as *mut u16), 42);
+///
+///     dealloc(ptr, layout);
+/// }
+/// ```
+#[stable(feature = "global_alloc", since = "1.28.0")]
+#[must_use = "losing the pointer will leak memory"]
+#[inline]
+pub unsafe fn alloc(layout: Layout) -> *mut u8 {
+    unsafe { __rust_alloc(layout.size(), layout.align()) }
+}
+
+/// Deallocate memory with the global allocator.
+///
+/// This function forwards calls to the [`GlobalAlloc::dealloc`] method
+/// of the allocator registered with the `#[global_allocator]` attribute
+/// if there is one, or the `std` crate’s default.
+///
+/// This function is expected to be deprecated in favor of the `dealloc` method
+/// of the [`Global`] type when it and the [`Allocator`] trait become stable.
+///
+/// # Safety
+///
+/// See [`GlobalAlloc::dealloc`].
+#[stable(feature = "global_alloc", since = "1.28.0")]
+#[inline]
+pub unsafe fn dealloc(ptr: *mut u8, layout: Layout) {
+    unsafe { __rust_dealloc(ptr, layout.size(), layout.align()) }
+}
+
+/// Reallocate memory with the global allocator.
+///
+/// This function forwards calls to the [`GlobalAlloc::realloc`] method
+/// of the allocator registered with the `#[global_allocator]` attribute
+/// if there is one, or the `std` crate’s default.
+///
+/// This function is expected to be deprecated in favor of the `realloc` method
+/// of the [`Global`] type when it and the [`Allocator`] trait become stable.
+///
+/// # Safety
+///
+/// See [`GlobalAlloc::realloc`].
+#[stable(feature = "global_alloc", since = "1.28.0")]
+#[must_use = "losing the pointer will leak memory"]
+#[inline]
+pub unsafe fn realloc(ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+    unsafe { __rust_realloc(ptr, layout.size(), layout.align(), new_size) }
+}
+
+/// Allocate zero-initialized memory with the global allocator.
+///
+/// This function forwards calls to the [`GlobalAlloc::alloc_zeroed`] method
+/// of the allocator registered with the `#[global_allocator]` attribute
+/// if there is one, or the `std` crate’s default.
+///
+/// This function is expected to be deprecated in favor of the `alloc_zeroed` method
+/// of the [`Global`] type when it and the [`Allocator`] trait become stable.
+///
+/// # Safety
+///
+/// See [`GlobalAlloc::alloc_zeroed`].
+///
+/// # Examples
+///
+/// ```
+/// use std::alloc::{alloc_zeroed, dealloc, Layout};
+///
+/// unsafe {
+///     let layout = Layout::new::<u16>();
+///     let ptr = alloc_zeroed(layout);
+///
+///     assert_eq!(*(ptr as *mut u16), 0);
+///
+///     dealloc(ptr, layout);
+/// }
+/// ```
+#[stable(feature = "global_alloc", since = "1.28.0")]
+#[must_use = "losing the pointer will leak memory"]
+#[inline]
+pub unsafe fn alloc_zeroed(layout: Layout) -> *mut u8 {
+    unsafe { __rust_alloc_zeroed(layout.size(), layout.align()) }
+}
+
+#[cfg(not(test))]
+impl Global {
+    #[inline]
+    fn alloc_impl(&self, layout: Layout, zeroed: bool) -> Result<NonNull<[u8]>, AllocError> {
+        match layout.size() {
+            0 => Ok(NonNull::slice_from_raw_parts(layout.dangling(), 0)),
+            // SAFETY: `layout` is non-zero in size,
+            size => unsafe {
+                let raw_ptr = if zeroed { alloc_zeroed(layout) } else { alloc(layout) };
+                let ptr = NonNull::new(raw_ptr).ok_or(AllocError)?;
+                Ok(NonNull::slice_from_raw_parts(ptr, size))
+            },
+        }
+    }
+
+    // SAFETY: Same as `Allocator::grow`
+    #[inline]
+    unsafe fn grow_impl(
+        &self,
+        ptr: NonNull<u8>,
+        old_layout: Layout,
+        new_layout: Layout,
+        zeroed: bool,
+    ) -> Result<NonNull<[u8]>, AllocError> {
+        debug_assert!(
+            new_layout.size() >= old_layout.size(),
+            "`new_layout.size()` must be greater than or equal to `old_layout.size()`"
+        );
+
+        match old_layout.size() {
+            0 => self.alloc_impl(new_layout, zeroed),
+
+            // SAFETY: `new_size` is non-zero as `old_size` is greater than or equal to `new_size`
+            // as required by safety conditions. Other conditions must be upheld by the caller
+            old_size if old_layout.align() == new_layout.align() => unsafe {
+                let new_size = new_layout.size();
+
+                // `realloc` probably checks for `new_size >= old_layout.size()` or something similar.
+                intrinsics::assume(new_size >= old_layout.size());
+
+                let raw_ptr = realloc(ptr.as_ptr(), old_layout, new_size);
+                let ptr = NonNull::new(raw_ptr).ok_or(AllocError)?;
+                if zeroed {
+                    raw_ptr.add(old_size).write_bytes(0, new_size - old_size);
+                }
+                Ok(NonNull::slice_from_raw_parts(ptr, new_size))
+            },
+
+            // SAFETY: because `new_layout.size()` must be greater than or equal to `old_size`,
+            // both the old and new memory allocation are valid for reads and writes for `old_size`
+            // bytes. Also, because the old allocation wasn't yet deallocated, it cannot overlap
+            // `new_ptr`. Thus, the call to `copy_nonoverlapping` is safe. The safety contract
+            // for `dealloc` must be upheld by the caller.
+            old_size => unsafe {
+                let new_ptr = self.alloc_impl(new_layout, zeroed)?;
+                ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_mut_ptr(), old_size);
+                self.deallocate(ptr, old_layout);
+                Ok(new_ptr)
+            },
+        }
+    }
+}
+
+#[unstable(feature = "allocator_api", issue = "32838")]
+#[cfg(not(test))]
+unsafe impl Allocator for Global {
+    #[inline]
+    fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+        self.alloc_impl(layout, false)
+    }
+
+    #[inline]
+    fn allocate_zeroed(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+        self.alloc_impl(layout, true)
+    }
+
+    #[inline]
+    unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+        if layout.size() != 0 {
+            // SAFETY: `layout` is non-zero in size,
+            // other conditions must be upheld by the caller
+            unsafe { dealloc(ptr.as_ptr(), layout) }
+        }
+    }
+
+    #[inline]
+    unsafe fn grow(
+        &self,
+        ptr: NonNull<u8>,
+        old_layout: Layout,
+        new_layout: Layout,
+    ) -> Result<NonNull<[u8]>, AllocError> {
+        // SAFETY: all conditions must be upheld by the caller
+        unsafe { self.grow_impl(ptr, old_layout, new_layout, false) }
+    }
+
+    #[inline]
+    unsafe fn grow_zeroed(
+        &self,
+        ptr: NonNull<u8>,
+        old_layout: Layout,
+        new_layout: Layout,
+    ) -> Result<NonNull<[u8]>, AllocError> {
+        // SAFETY: all conditions must be upheld by the caller
+        unsafe { self.grow_impl(ptr, old_layout, new_layout, true) }
+    }
+
+    #[inline]
+    unsafe fn shrink(
+        &self,
+        ptr: NonNull<u8>,
+        old_layout: Layout,
+        new_layout: Layout,
+    ) -> Result<NonNull<[u8]>, AllocError> {
+        debug_assert!(
+            new_layout.size() <= old_layout.size(),
+            "`new_layout.size()` must be smaller than or equal to `old_layout.size()`"
+        );
+
+        match new_layout.size() {
+            // SAFETY: conditions must be upheld by the caller
+            0 => unsafe {
+                self.deallocate(ptr, old_layout);
+                Ok(NonNull::slice_from_raw_parts(new_layout.dangling(), 0))
+            },
+
+            // SAFETY: `new_size` is non-zero. Other conditions must be upheld by the caller
+            new_size if old_layout.align() == new_layout.align() => unsafe {
+                // `realloc` probably checks for `new_size <= old_layout.size()` or something similar.
+                intrinsics::assume(new_size <= old_layout.size());
+
+                let raw_ptr = realloc(ptr.as_ptr(), old_layout, new_size);
+                let ptr = NonNull::new(raw_ptr).ok_or(AllocError)?;
+                Ok(NonNull::slice_from_raw_parts(ptr, new_size))
+            },
+
+            // SAFETY: because `new_size` must be smaller than or equal to `old_layout.size()`,
+            // both the old and new memory allocation are valid for reads and writes for `new_size`
+            // bytes. Also, because the old allocation wasn't yet deallocated, it cannot overlap
+            // `new_ptr`. Thus, the call to `copy_nonoverlapping` is safe. The safety contract
+            // for `dealloc` must be upheld by the caller.
+            new_size => unsafe {
+                let new_ptr = self.allocate(new_layout)?;
+                ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_mut_ptr(), new_size);
+                self.deallocate(ptr, old_layout);
+                Ok(new_ptr)
+            },
+        }
+    }
+}
+
+/// The allocator for unique pointers.
+#[cfg(all(not(no_global_oom_handling), not(test)))]
+#[lang = "exchange_malloc"]
+#[inline]
+unsafe fn exchange_malloc(size: usize, align: usize) -> *mut u8 {
+    let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
+    match Global.allocate(layout) {
+        Ok(ptr) => ptr.as_mut_ptr(),
+        Err(_) => handle_alloc_error(layout),
+    }
+}
+
+#[cfg_attr(not(test), lang = "box_free")]
+#[inline]
+#[rustc_const_unstable(feature = "const_box", issue = "92521")]
+// This signature has to be the same as `Box`, otherwise an ICE will happen.
+// When an additional parameter to `Box` is added (like `A: Allocator`), this has to be added here as
+// well.
+// For example if `Box` is changed to  `struct Box<T: ?Sized, A: Allocator>(Unique<T>, A)`,
+// this function has to be changed to `fn box_free<T: ?Sized, A: Allocator>(Unique<T>, A)` as well.
+pub(crate) const unsafe fn box_free<T: ?Sized, A: ~const Allocator + ~const Destruct>(
+    ptr: Unique<T>,
+    alloc: A,
+) {
+    unsafe {
+        let size = size_of_val(ptr.as_ref());
+        let align = min_align_of_val(ptr.as_ref());
+        let layout = Layout::from_size_align_unchecked(size, align);
+        alloc.deallocate(From::from(ptr.cast()), layout)
+    }
+}
+
+// # Allocation error handler
+
+#[cfg(not(no_global_oom_handling))]
+extern "Rust" {
+    // This is the magic symbol to call the global alloc error handler.  rustc generates
+    // it to call `__rg_oom` if there is a `#[alloc_error_handler]`, or to call the
+    // default implementations below (`__rdl_oom`) otherwise.
+    fn __rust_alloc_error_handler(size: usize, align: usize) -> !;
+}
+
+/// Abort on memory allocation error or failure.
+///
+/// Callers of memory allocation APIs wishing to abort computation
+/// in response to an allocation error are encouraged to call this function,
+/// rather than directly invoking `panic!` or similar.
+///
+/// The default behavior of this function is to print a message to standard error
+/// and abort the process.
+/// It can be replaced with [`set_alloc_error_hook`] and [`take_alloc_error_hook`].
+///
+/// [`set_alloc_error_hook`]: ../../std/alloc/fn.set_alloc_error_hook.html
+/// [`take_alloc_error_hook`]: ../../std/alloc/fn.take_alloc_error_hook.html
+#[stable(feature = "global_alloc", since = "1.28.0")]
+#[rustc_const_unstable(feature = "const_alloc_error", issue = "92523")]
+#[cfg(all(not(no_global_oom_handling), not(test)))]
+#[cold]
+pub const fn handle_alloc_error(layout: Layout) -> ! {
+    const fn ct_error(_: Layout) -> ! {
+        panic!("allocation failed");
+    }
+
+    fn rt_error(layout: Layout) -> ! {
+        unsafe {
+            __rust_alloc_error_handler(layout.size(), layout.align());
+        }
+    }
+
+    unsafe { core::intrinsics::const_eval_select((layout,), ct_error, rt_error) }
+}
+
+// For alloc test `std::alloc::handle_alloc_error` can be used directly.
+#[cfg(all(not(no_global_oom_handling), test))]
+pub use std::alloc::handle_alloc_error;
+
+#[cfg(all(not(no_global_oom_handling), not(test)))]
+#[doc(hidden)]
+#[allow(unused_attributes)]
+#[unstable(feature = "alloc_internals", issue = "none")]
+pub mod __alloc_error_handler {
+    use crate::alloc::Layout;
+
+    // called via generated `__rust_alloc_error_handler`
+
+    // if there is no `#[alloc_error_handler]`
+    #[rustc_std_internal_symbol]
+    pub unsafe extern "C-unwind" fn __rdl_oom(size: usize, _align: usize) -> ! {
+        panic!("memory allocation of {size} bytes failed")
+    }
+
+    // if there is an `#[alloc_error_handler]`
+    #[rustc_std_internal_symbol]
+    pub unsafe extern "C-unwind" fn __rg_oom(size: usize, align: usize) -> ! {
+        let layout = unsafe { Layout::from_size_align_unchecked(size, align) };
+        extern "Rust" {
+            #[lang = "oom"]
+            fn oom_impl(layout: Layout) -> !;
+        }
+        unsafe { oom_impl(layout) }
+    }
+}
+
+/// Specialize clones into pre-allocated, uninitialized memory.
+/// Used by `Box::clone` and `Rc`/`Arc::make_mut`.
+pub(crate) trait WriteCloneIntoRaw: Sized {
+    unsafe fn write_clone_into_raw(&self, target: *mut Self);
+}
+
+impl<T: Clone> WriteCloneIntoRaw for T {
+    #[inline]
+    default unsafe fn write_clone_into_raw(&self, target: *mut Self) {
+        // Having allocated *first* may allow the optimizer to create
+        // the cloned value in-place, skipping the local and move.
+        unsafe { target.write(self.clone()) };
+    }
+}
+
+impl<T: Copy> WriteCloneIntoRaw for T {
+    #[inline]
+    unsafe fn write_clone_into_raw(&self, target: *mut Self) {
+        // We can always copy in-place, without ever involving a local value.
+        unsafe { target.copy_from_nonoverlapping(self, 1) };
+    }
+}
diff --git a/rust/alloc/borrow.rs b/rust/alloc/borrow.rs
new file mode 100644
index 000000000000..cb4e438f8bea
--- /dev/null
+++ b/rust/alloc/borrow.rs
@@ -0,0 +1,496 @@
+//! A module for working with borrowed data.
+
+#![stable(feature = "rust1", since = "1.0.0")]
+
+use core::cmp::Ordering;
+use core::hash::{Hash, Hasher};
+use core::ops::Deref;
+#[cfg(not(no_global_oom_handling))]
+use core::ops::{Add, AddAssign};
+
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::borrow::{Borrow, BorrowMut};
+
+use crate::fmt;
+#[cfg(not(no_global_oom_handling))]
+use crate::string::String;
+
+use Cow::*;
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<'a, B: ?Sized> Borrow<B> for Cow<'a, B>
+where
+    B: ToOwned,
+    <B as ToOwned>::Owned: 'a,
+{
+    fn borrow(&self) -> &B {
+        &**self
+    }
+}
+
+/// A generalization of `Clone` to borrowed data.
+///
+/// Some types make it possible to go from borrowed to owned, usually by
+/// implementing the `Clone` trait. But `Clone` works only for going from `&T`
+/// to `T`. The `ToOwned` trait generalizes `Clone` to construct owned data
+/// from any borrow of a given type.
+#[cfg_attr(not(test), rustc_diagnostic_item = "ToOwned")]
+#[stable(feature = "rust1", since = "1.0.0")]
+pub trait ToOwned {
+    /// The resulting type after obtaining ownership.
+    #[stable(feature = "rust1", since = "1.0.0")]
+    type Owned: Borrow<Self>;
+
+    /// Creates owned data from borrowed data, usually by cloning.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// let s: &str = "a";
+    /// let ss: String = s.to_owned();
+    ///
+    /// let v: &[i32] = &[1, 2];
+    /// let vv: Vec<i32> = v.to_owned();
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[must_use = "cloning is often expensive and is not expected to have side effects"]
+    fn to_owned(&self) -> Self::Owned;
+
+    /// Uses borrowed data to replace owned data, usually by cloning.
+    ///
+    /// This is borrow-generalized version of `Clone::clone_from`.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// # #![feature(toowned_clone_into)]
+    /// let mut s: String = String::new();
+    /// "hello".clone_into(&mut s);
+    ///
+    /// let mut v: Vec<i32> = Vec::new();
+    /// [1, 2][..].clone_into(&mut v);
+    /// ```
+    #[unstable(feature = "toowned_clone_into", reason = "recently added", issue = "41263")]
+    fn clone_into(&self, target: &mut Self::Owned) {
+        *target = self.to_owned();
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T> ToOwned for T
+where
+    T: Clone,
+{
+    type Owned = T;
+    fn to_owned(&self) -> T {
+        self.clone()
+    }
+
+    fn clone_into(&self, target: &mut T) {
+        target.clone_from(self);
+    }
+}
+
+/// A clone-on-write smart pointer.
+///
+/// The type `Cow` is a smart pointer providing clone-on-write functionality: it
+/// can enclose and provide immutable access to borrowed data, and clone the
+/// data lazily when mutation or ownership is required. The type is designed to
+/// work with general borrowed data via the `Borrow` trait.
+///
+/// `Cow` implements `Deref`, which means that you can call
+/// non-mutating methods directly on the data it encloses. If mutation
+/// is desired, `to_mut` will obtain a mutable reference to an owned
+/// value, cloning if necessary.
+///
+/// If you need reference-counting pointers, note that
+/// [`Rc::make_mut`][crate::rc::Rc::make_mut] and
+/// [`Arc::make_mut`][crate::sync::Arc::make_mut] can provide clone-on-write
+/// functionality as well.
+///
+/// # Examples
+///
+/// ```
+/// use std::borrow::Cow;
+///
+/// fn abs_all(input: &mut Cow<[i32]>) {
+///     for i in 0..input.len() {
+///         let v = input[i];
+///         if v < 0 {
+///             // Clones into a vector if not already owned.
+///             input.to_mut()[i] = -v;
+///         }
+///     }
+/// }
+///
+/// // No clone occurs because `input` doesn't need to be mutated.
+/// let slice = [0, 1, 2];
+/// let mut input = Cow::from(&slice[..]);
+/// abs_all(&mut input);
+///
+/// // Clone occurs because `input` needs to be mutated.
+/// let slice = [-1, 0, 1];
+/// let mut input = Cow::from(&slice[..]);
+/// abs_all(&mut input);
+///
+/// // No clone occurs because `input` is already owned.
+/// let mut input = Cow::from(vec![-1, 0, 1]);
+/// abs_all(&mut input);
+/// ```
+///
+/// Another example showing how to keep `Cow` in a struct:
+///
+/// ```
+/// use std::borrow::Cow;
+///
+/// struct Items<'a, X: 'a> where [X]: ToOwned<Owned = Vec<X>> {
+///     values: Cow<'a, [X]>,
+/// }
+///
+/// impl<'a, X: Clone + 'a> Items<'a, X> where [X]: ToOwned<Owned = Vec<X>> {
+///     fn new(v: Cow<'a, [X]>) -> Self {
+///         Items { values: v }
+///     }
+/// }
+///
+/// // Creates a container from borrowed values of a slice
+/// let readonly = [1, 2];
+/// let borrowed = Items::new((&readonly[..]).into());
+/// match borrowed {
+///     Items { values: Cow::Borrowed(b) } => println!("borrowed {b:?}"),
+///     _ => panic!("expect borrowed value"),
+/// }
+///
+/// let mut clone_on_write = borrowed;
+/// // Mutates the data from slice into owned vec and pushes a new value on top
+/// clone_on_write.values.to_mut().push(3);
+/// println!("clone_on_write = {:?}", clone_on_write.values);
+///
+/// // The data was mutated. Let's check it out.
+/// match clone_on_write {
+///     Items { values: Cow::Owned(_) } => println!("clone_on_write contains owned data"),
+///     _ => panic!("expect owned data"),
+/// }
+/// ```
+#[stable(feature = "rust1", since = "1.0.0")]
+#[cfg_attr(not(test), rustc_diagnostic_item = "Cow")]
+pub enum Cow<'a, B: ?Sized + 'a>
+where
+    B: ToOwned,
+{
+    /// Borrowed data.
+    #[stable(feature = "rust1", since = "1.0.0")]
+    Borrowed(#[stable(feature = "rust1", since = "1.0.0")] &'a B),
+
+    /// Owned data.
+    #[stable(feature = "rust1", since = "1.0.0")]
+    Owned(#[stable(feature = "rust1", since = "1.0.0")] <B as ToOwned>::Owned),
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<B: ?Sized + ToOwned> Clone for Cow<'_, B> {
+    fn clone(&self) -> Self {
+        match *self {
+            Borrowed(b) => Borrowed(b),
+            Owned(ref o) => {
+                let b: &B = o.borrow();
+                Owned(b.to_owned())
+            }
+        }
+    }
+
+    fn clone_from(&mut self, source: &Self) {
+        match (self, source) {
+            (&mut Owned(ref mut dest), &Owned(ref o)) => o.borrow().clone_into(dest),
+            (t, s) => *t = s.clone(),
+        }
+    }
+}
+
+impl<B: ?Sized + ToOwned> Cow<'_, B> {
+    /// Returns true if the data is borrowed, i.e. if `to_mut` would require additional work.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(cow_is_borrowed)]
+    /// use std::borrow::Cow;
+    ///
+    /// let cow = Cow::Borrowed("moo");
+    /// assert!(cow.is_borrowed());
+    ///
+    /// let bull: Cow<'_, str> = Cow::Owned("...moo?".to_string());
+    /// assert!(!bull.is_borrowed());
+    /// ```
+    #[unstable(feature = "cow_is_borrowed", issue = "65143")]
+    #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")]
+    pub const fn is_borrowed(&self) -> bool {
+        match *self {
+            Borrowed(_) => true,
+            Owned(_) => false,
+        }
+    }
+
+    /// Returns true if the data is owned, i.e. if `to_mut` would be a no-op.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(cow_is_borrowed)]
+    /// use std::borrow::Cow;
+    ///
+    /// let cow: Cow<'_, str> = Cow::Owned("moo".to_string());
+    /// assert!(cow.is_owned());
+    ///
+    /// let bull = Cow::Borrowed("...moo?");
+    /// assert!(!bull.is_owned());
+    /// ```
+    #[unstable(feature = "cow_is_borrowed", issue = "65143")]
+    #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")]
+    pub const fn is_owned(&self) -> bool {
+        !self.is_borrowed()
+    }
+
+    /// Acquires a mutable reference to the owned form of the data.
+    ///
+    /// Clones the data if it is not already owned.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::borrow::Cow;
+    ///
+    /// let mut cow = Cow::Borrowed("foo");
+    /// cow.to_mut().make_ascii_uppercase();
+    ///
+    /// assert_eq!(
+    ///   cow,
+    ///   Cow::Owned(String::from("FOO")) as Cow<str>
+    /// );
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn to_mut(&mut self) -> &mut <B as ToOwned>::Owned {
+        match *self {
+            Borrowed(borrowed) => {
+                *self = Owned(borrowed.to_owned());
+                match *self {
+                    Borrowed(..) => unreachable!(),
+                    Owned(ref mut owned) => owned,
+                }
+            }
+            Owned(ref mut owned) => owned,
+        }
+    }
+
+    /// Extracts the owned data.
+    ///
+    /// Clones the data if it is not already owned.
+    ///
+    /// # Examples
+    ///
+    /// Calling `into_owned` on a `Cow::Borrowed` returns a clone of the borrowed data:
+    ///
+    /// ```
+    /// use std::borrow::Cow;
+    ///
+    /// let s = "Hello world!";
+    /// let cow = Cow::Borrowed(s);
+    ///
+    /// assert_eq!(
+    ///   cow.into_owned(),
+    ///   String::from(s)
+    /// );
+    /// ```
+    ///
+    /// Calling `into_owned` on a `Cow::Owned` returns the owned data. The data is moved out of the
+    /// `Cow` without being cloned.
+    ///
+    /// ```
+    /// use std::borrow::Cow;
+    ///
+    /// let s = "Hello world!";
+    /// let cow: Cow<str> = Cow::Owned(String::from(s));
+    ///
+    /// assert_eq!(
+    ///   cow.into_owned(),
+    ///   String::from(s)
+    /// );
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn into_owned(self) -> <B as ToOwned>::Owned {
+        match self {
+            Borrowed(borrowed) => borrowed.to_owned(),
+            Owned(owned) => owned,
+        }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_const_unstable(feature = "const_deref", issue = "88955")]
+impl<B: ?Sized + ToOwned> const Deref for Cow<'_, B>
+where
+    B::Owned: ~const Borrow<B>,
+{
+    type Target = B;
+
+    fn deref(&self) -> &B {
+        match *self {
+            Borrowed(borrowed) => borrowed,
+            Owned(ref owned) => owned.borrow(),
+        }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<B: ?Sized> Eq for Cow<'_, B> where B: Eq + ToOwned {}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<B: ?Sized> Ord for Cow<'_, B>
+where
+    B: Ord + ToOwned,
+{
+    #[inline]
+    fn cmp(&self, other: &Self) -> Ordering {
+        Ord::cmp(&**self, &**other)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<'a, 'b, B: ?Sized, C: ?Sized> PartialEq<Cow<'b, C>> for Cow<'a, B>
+where
+    B: PartialEq<C> + ToOwned,
+    C: ToOwned,
+{
+    #[inline]
+    fn eq(&self, other: &Cow<'b, C>) -> bool {
+        PartialEq::eq(&**self, &**other)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<'a, B: ?Sized> PartialOrd for Cow<'a, B>
+where
+    B: PartialOrd + ToOwned,
+{
+    #[inline]
+    fn partial_cmp(&self, other: &Cow<'a, B>) -> Option<Ordering> {
+        PartialOrd::partial_cmp(&**self, &**other)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<B: ?Sized> fmt::Debug for Cow<'_, B>
+where
+    B: fmt::Debug + ToOwned<Owned: fmt::Debug>,
+{
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            Borrowed(ref b) => fmt::Debug::fmt(b, f),
+            Owned(ref o) => fmt::Debug::fmt(o, f),
+        }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<B: ?Sized> fmt::Display for Cow<'_, B>
+where
+    B: fmt::Display + ToOwned<Owned: fmt::Display>,
+{
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match *self {
+            Borrowed(ref b) => fmt::Display::fmt(b, f),
+            Owned(ref o) => fmt::Display::fmt(o, f),
+        }
+    }
+}
+
+#[stable(feature = "default", since = "1.11.0")]
+impl<B: ?Sized> Default for Cow<'_, B>
+where
+    B: ToOwned<Owned: Default>,
+{
+    /// Creates an owned Cow<'a, B> with the default value for the contained owned value.
+    fn default() -> Self {
+        Owned(<B as ToOwned>::Owned::default())
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<B: ?Sized> Hash for Cow<'_, B>
+where
+    B: Hash + ToOwned,
+{
+    #[inline]
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Hash::hash(&**self, state)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized + ToOwned> AsRef<T> for Cow<'_, T> {
+    fn as_ref(&self) -> &T {
+        self
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "cow_add", since = "1.14.0")]
+impl<'a> Add<&'a str> for Cow<'a, str> {
+    type Output = Cow<'a, str>;
+
+    #[inline]
+    fn add(mut self, rhs: &'a str) -> Self::Output {
+        self += rhs;
+        self
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "cow_add", since = "1.14.0")]
+impl<'a> Add<Cow<'a, str>> for Cow<'a, str> {
+    type Output = Cow<'a, str>;
+
+    #[inline]
+    fn add(mut self, rhs: Cow<'a, str>) -> Self::Output {
+        self += rhs;
+        self
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "cow_add", since = "1.14.0")]
+impl<'a> AddAssign<&'a str> for Cow<'a, str> {
+    fn add_assign(&mut self, rhs: &'a str) {
+        if self.is_empty() {
+            *self = Cow::Borrowed(rhs)
+        } else if !rhs.is_empty() {
+            if let Cow::Borrowed(lhs) = *self {
+                let mut s = String::with_capacity(lhs.len() + rhs.len());
+                s.push_str(lhs);
+                *self = Cow::Owned(s);
+            }
+            self.to_mut().push_str(rhs);
+        }
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "cow_add", since = "1.14.0")]
+impl<'a> AddAssign<Cow<'a, str>> for Cow<'a, str> {
+    fn add_assign(&mut self, rhs: Cow<'a, str>) {
+        if self.is_empty() {
+            *self = rhs
+        } else if !rhs.is_empty() {
+            if let Cow::Borrowed(lhs) = *self {
+                let mut s = String::with_capacity(lhs.len() + rhs.len());
+                s.push_str(lhs);
+                *self = Cow::Owned(s);
+            }
+            self.to_mut().push_str(&rhs);
+        }
+    }
+}
diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs
new file mode 100644
index 000000000000..c07536f0d0ce
--- /dev/null
+++ b/rust/alloc/boxed.rs
@@ -0,0 +1,2024 @@
+//! A pointer type for heap allocation.
+//!
+//! [`Box<T>`], casually referred to as a 'box', provides the simplest form of
+//! heap allocation in Rust. Boxes provide ownership for this allocation, and
+//! drop their contents when they go out of scope. Boxes also ensure that they
+//! never allocate more than `isize::MAX` bytes.
+//!
+//! # Examples
+//!
+//! Move a value from the stack to the heap by creating a [`Box`]:
+//!
+//! ```
+//! let val: u8 = 5;
+//! let boxed: Box<u8> = Box::new(val);
+//! ```
+//!
+//! Move a value from a [`Box`] back to the stack by [dereferencing]:
+//!
+//! ```
+//! let boxed: Box<u8> = Box::new(5);
+//! let val: u8 = *boxed;
+//! ```
+//!
+//! Creating a recursive data structure:
+//!
+//! ```
+//! #[derive(Debug)]
+//! enum List<T> {
+//!     Cons(T, Box<List<T>>),
+//!     Nil,
+//! }
+//!
+//! let list: List<i32> = List::Cons(1, Box::new(List::Cons(2, Box::new(List::Nil))));
+//! println!("{list:?}");
+//! ```
+//!
+//! This will print `Cons(1, Cons(2, Nil))`.
+//!
+//! Recursive structures must be boxed, because if the definition of `Cons`
+//! looked like this:
+//!
+//! ```compile_fail,E0072
+//! # enum List<T> {
+//! Cons(T, List<T>),
+//! # }
+//! ```
+//!
+//! It wouldn't work. This is because the size of a `List` depends on how many
+//! elements are in the list, and so we don't know how much memory to allocate
+//! for a `Cons`. By introducing a [`Box<T>`], which has a defined size, we know how
+//! big `Cons` needs to be.
+//!
+//! # Memory layout
+//!
+//! For non-zero-sized values, a [`Box`] will use the [`Global`] allocator for
+//! its allocation. It is valid to convert both ways between a [`Box`] and a
+//! raw pointer allocated with the [`Global`] allocator, given that the
+//! [`Layout`] used with the allocator is correct for the type. More precisely,
+//! a `value: *mut T` that has been allocated with the [`Global`] allocator
+//! with `Layout::for_value(&*value)` may be converted into a box using
+//! [`Box::<T>::from_raw(value)`]. Conversely, the memory backing a `value: *mut
+//! T` obtained from [`Box::<T>::into_raw`] may be deallocated using the
+//! [`Global`] allocator with [`Layout::for_value(&*value)`].
+//!
+//! For zero-sized values, the `Box` pointer still has to be [valid] for reads
+//! and writes and sufficiently aligned. In particular, casting any aligned
+//! non-zero integer literal to a raw pointer produces a valid pointer, but a
+//! pointer pointing into previously allocated memory that since got freed is
+//! not valid. The recommended way to build a Box to a ZST if `Box::new` cannot
+//! be used is to use [`ptr::NonNull::dangling`].
+//!
+//! So long as `T: Sized`, a `Box<T>` is guaranteed to be represented
+//! as a single pointer and is also ABI-compatible with C pointers
+//! (i.e. the C type `T*`). This means that if you have extern "C"
+//! Rust functions that will be called from C, you can define those
+//! Rust functions using `Box<T>` types, and use `T*` as corresponding
+//! type on the C side. As an example, consider this C header which
+//! declares functions that create and destroy some kind of `Foo`
+//! value:
+//!
+//! ```c
+//! /* C header */
+//!
+//! /* Returns ownership to the caller */
+//! struct Foo* foo_new(void);
+//!
+//! /* Takes ownership from the caller; no-op when invoked with null */
+//! void foo_delete(struct Foo*);
+//! ```
+//!
+//! These two functions might be implemented in Rust as follows. Here, the
+//! `struct Foo*` type from C is translated to `Box<Foo>`, which captures
+//! the ownership constraints. Note also that the nullable argument to
+//! `foo_delete` is represented in Rust as `Option<Box<Foo>>`, since `Box<Foo>`
+//! cannot be null.
+//!
+//! ```
+//! #[repr(C)]
+//! pub struct Foo;
+//!
+//! #[no_mangle]
+//! pub extern "C" fn foo_new() -> Box<Foo> {
+//!     Box::new(Foo)
+//! }
+//!
+//! #[no_mangle]
+//! pub extern "C" fn foo_delete(_: Option<Box<Foo>>) {}
+//! ```
+//!
+//! Even though `Box<T>` has the same representation and C ABI as a C pointer,
+//! this does not mean that you can convert an arbitrary `T*` into a `Box<T>`
+//! and expect things to work. `Box<T>` values will always be fully aligned,
+//! non-null pointers. Moreover, the destructor for `Box<T>` will attempt to
+//! free the value with the global allocator. In general, the best practice
+//! is to only use `Box<T>` for pointers that originated from the global
+//! allocator.
+//!
+//! **Important.** At least at present, you should avoid using
+//! `Box<T>` types for functions that are defined in C but invoked
+//! from Rust. In those cases, you should directly mirror the C types
+//! as closely as possible. Using types like `Box<T>` where the C
+//! definition is just using `T*` can lead to undefined behavior, as
+//! described in [rust-lang/unsafe-code-guidelines#198][ucg#198].
+//!
+//! [ucg#198]: https://github.com/rust-lang/unsafe-code-guidelines/issues/198
+//! [dereferencing]: core::ops::Deref
+//! [`Box::<T>::from_raw(value)`]: Box::from_raw
+//! [`Global`]: crate::alloc::Global
+//! [`Layout`]: crate::alloc::Layout
+//! [`Layout::for_value(&*value)`]: crate::alloc::Layout::for_value
+//! [valid]: ptr#safety
+
+#![stable(feature = "rust1", since = "1.0.0")]
+
+use core::any::Any;
+use core::async_iter::AsyncIterator;
+use core::borrow;
+use core::cmp::Ordering;
+use core::convert::{From, TryFrom};
+use core::fmt;
+use core::future::Future;
+use core::hash::{Hash, Hasher};
+#[cfg(not(no_global_oom_handling))]
+use core::iter::FromIterator;
+use core::iter::{FusedIterator, Iterator};
+use core::marker::{Destruct, Unpin, Unsize};
+use core::mem;
+use core::ops::{
+    CoerceUnsized, Deref, DerefMut, DispatchFromDyn, Generator, GeneratorState, Receiver,
+};
+use core::pin::Pin;
+use core::ptr::{self, Unique};
+use core::task::{Context, Poll};
+
+#[cfg(not(no_global_oom_handling))]
+use crate::alloc::{handle_alloc_error, WriteCloneIntoRaw};
+use crate::alloc::{AllocError, Allocator, Global, Layout};
+#[cfg(not(no_global_oom_handling))]
+use crate::borrow::Cow;
+use crate::raw_vec::RawVec;
+#[cfg(not(no_global_oom_handling))]
+use crate::str::from_boxed_utf8_unchecked;
+#[cfg(not(no_global_oom_handling))]
+use crate::vec::Vec;
+
+#[unstable(feature = "thin_box", issue = "92791")]
+pub use thin::ThinBox;
+
+mod thin;
+
+/// A pointer type for heap allocation.
+///
+/// See the [module-level documentation](../../std/boxed/index.html) for more.
+#[lang = "owned_box"]
+#[fundamental]
+#[stable(feature = "rust1", since = "1.0.0")]
+// The declaration of the `Box` struct must be kept in sync with the
+// `alloc::alloc::box_free` function or ICEs will happen. See the comment
+// on `box_free` for more details.
+pub struct Box<
+    T: ?Sized,
+    #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global,
+>(Unique<T>, A);
+
+impl<T> Box<T> {
+    /// Allocates memory on the heap and then places `x` into it.
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let five = Box::new(5);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline(always)]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[must_use]
+    pub fn new(x: T) -> Self {
+        box x
+    }
+
+    /// Constructs a new box with uninitialized contents.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let mut five = Box::<u32>::new_uninit();
+    ///
+    /// let five = unsafe {
+    ///     // Deferred initialization:
+    ///     five.as_mut_ptr().write(5);
+    ///
+    ///     five.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*five, 5)
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    #[inline]
+    pub fn new_uninit() -> Box<mem::MaybeUninit<T>> {
+        Self::new_uninit_in(Global)
+    }
+
+    /// Constructs a new `Box` with uninitialized contents, with the memory
+    /// being filled with `0` bytes.
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let zero = Box::<u32>::new_zeroed();
+    /// let zero = unsafe { zero.assume_init() };
+    ///
+    /// assert_eq!(*zero, 0)
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    pub fn new_zeroed() -> Box<mem::MaybeUninit<T>> {
+        Self::new_zeroed_in(Global)
+    }
+
+    /// Constructs a new `Pin<Box<T>>`. If `T` does not implement `Unpin`, then
+    /// `x` will be pinned in memory and unable to be moved.
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "pin", since = "1.33.0")]
+    #[must_use]
+    #[inline(always)]
+    pub fn pin(x: T) -> Pin<Box<T>> {
+        (box x).into()
+    }
+
+    /// Allocates memory on the heap then places `x` into it,
+    /// returning an error if the allocation fails
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// let five = Box::try_new(5)?;
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn try_new(x: T) -> Result<Self, AllocError> {
+        Self::try_new_in(x, Global)
+    }
+
+    /// Constructs a new box with uninitialized contents on the heap,
+    /// returning an error if the allocation fails
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// let mut five = Box::<u32>::try_new_uninit()?;
+    ///
+    /// let five = unsafe {
+    ///     // Deferred initialization:
+    ///     five.as_mut_ptr().write(5);
+    ///
+    ///     five.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*five, 5);
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[inline]
+    pub fn try_new_uninit() -> Result<Box<mem::MaybeUninit<T>>, AllocError> {
+        Box::try_new_uninit_in(Global)
+    }
+
+    /// Constructs a new `Box` with uninitialized contents, with the memory
+    /// being filled with `0` bytes on the heap
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// let zero = Box::<u32>::try_new_zeroed()?;
+    /// let zero = unsafe { zero.assume_init() };
+    ///
+    /// assert_eq!(*zero, 0);
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[inline]
+    pub fn try_new_zeroed() -> Result<Box<mem::MaybeUninit<T>>, AllocError> {
+        Box::try_new_zeroed_in(Global)
+    }
+}
+
+impl<T, A: Allocator> Box<T, A> {
+    /// Allocates memory in the given allocator then places `x` into it.
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let five = Box::new_in(5, System);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[must_use]
+    #[inline]
+    pub const fn new_in(x: T, alloc: A) -> Self
+    where
+        A: ~const Allocator + ~const Destruct,
+    {
+        let mut boxed = Self::new_uninit_in(alloc);
+        unsafe {
+            boxed.as_mut_ptr().write(x);
+            boxed.assume_init()
+        }
+    }
+
+    /// Allocates memory in the given allocator then places `x` into it,
+    /// returning an error if the allocation fails
+    ///
+    /// This doesn't actually allocate if `T` is zero-sized.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let five = Box::try_new_in(5, System)?;
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const fn try_new_in(x: T, alloc: A) -> Result<Self, AllocError>
+    where
+        T: ~const Destruct,
+        A: ~const Allocator + ~const Destruct,
+    {
+        let mut boxed = Self::try_new_uninit_in(alloc)?;
+        unsafe {
+            boxed.as_mut_ptr().write(x);
+            Ok(boxed.assume_init())
+        }
+    }
+
+    /// Constructs a new box with uninitialized contents in the provided allocator.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut five = Box::<u32, _>::new_uninit_in(System);
+    ///
+    /// let five = unsafe {
+    ///     // Deferred initialization:
+    ///     five.as_mut_ptr().write(5);
+    ///
+    ///     five.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*five, 5)
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[cfg(not(no_global_oom_handling))]
+    #[must_use]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    pub const fn new_uninit_in(alloc: A) -> Box<mem::MaybeUninit<T>, A>
+    where
+        A: ~const Allocator + ~const Destruct,
+    {
+        let layout = Layout::new::<mem::MaybeUninit<T>>();
+        // NOTE: Prefer match over unwrap_or_else since closure sometimes not inlineable.
+        // That would make code size bigger.
+        match Box::try_new_uninit_in(alloc) {
+            Ok(m) => m,
+            Err(_) => handle_alloc_error(layout),
+        }
+    }
+
+    /// Constructs a new box with uninitialized contents in the provided allocator,
+    /// returning an error if the allocation fails
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut five = Box::<u32, _>::try_new_uninit_in(System)?;
+    ///
+    /// let five = unsafe {
+    ///     // Deferred initialization:
+    ///     five.as_mut_ptr().write(5);
+    ///
+    ///     five.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*five, 5);
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    pub const fn try_new_uninit_in(alloc: A) -> Result<Box<mem::MaybeUninit<T>, A>, AllocError>
+    where
+        A: ~const Allocator + ~const Destruct,
+    {
+        let layout = Layout::new::<mem::MaybeUninit<T>>();
+        let ptr = alloc.allocate(layout)?.cast();
+        unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) }
+    }
+
+    /// Constructs a new `Box` with uninitialized contents, with the memory
+    /// being filled with `0` bytes in the provided allocator.
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let zero = Box::<u32, _>::new_zeroed_in(System);
+    /// let zero = unsafe { zero.assume_init() };
+    ///
+    /// assert_eq!(*zero, 0)
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[cfg(not(no_global_oom_handling))]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    pub const fn new_zeroed_in(alloc: A) -> Box<mem::MaybeUninit<T>, A>
+    where
+        A: ~const Allocator + ~const Destruct,
+    {
+        let layout = Layout::new::<mem::MaybeUninit<T>>();
+        // NOTE: Prefer match over unwrap_or_else since closure sometimes not inlineable.
+        // That would make code size bigger.
+        match Box::try_new_zeroed_in(alloc) {
+            Ok(m) => m,
+            Err(_) => handle_alloc_error(layout),
+        }
+    }
+
+    /// Constructs a new `Box` with uninitialized contents, with the memory
+    /// being filled with `0` bytes in the provided allocator,
+    /// returning an error if the allocation fails,
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let zero = Box::<u32, _>::try_new_zeroed_in(System)?;
+    /// let zero = unsafe { zero.assume_init() };
+    ///
+    /// assert_eq!(*zero, 0);
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    pub const fn try_new_zeroed_in(alloc: A) -> Result<Box<mem::MaybeUninit<T>, A>, AllocError>
+    where
+        A: ~const Allocator + ~const Destruct,
+    {
+        let layout = Layout::new::<mem::MaybeUninit<T>>();
+        let ptr = alloc.allocate_zeroed(layout)?.cast();
+        unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) }
+    }
+
+    /// Constructs a new `Pin<Box<T, A>>`. If `T` does not implement `Unpin`, then
+    /// `x` will be pinned in memory and unable to be moved.
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[must_use]
+    #[inline(always)]
+    pub const fn pin_in(x: T, alloc: A) -> Pin<Self>
+    where
+        A: 'static + ~const Allocator + ~const Destruct,
+    {
+        Self::into_pin(Self::new_in(x, alloc))
+    }
+
+    /// Converts a `Box<T>` into a `Box<[T]>`
+    ///
+    /// This conversion does not allocate on the heap and happens in place.
+    #[unstable(feature = "box_into_boxed_slice", issue = "71582")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    pub const fn into_boxed_slice(boxed: Self) -> Box<[T], A> {
+        let (raw, alloc) = Box::into_raw_with_allocator(boxed);
+        unsafe { Box::from_raw_in(raw as *mut [T; 1], alloc) }
+    }
+
+    /// Consumes the `Box`, returning the wrapped value.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(box_into_inner)]
+    ///
+    /// let c = Box::new(5);
+    ///
+    /// assert_eq!(Box::into_inner(c), 5);
+    /// ```
+    #[unstable(feature = "box_into_inner", issue = "80437")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const fn into_inner(boxed: Self) -> T
+    where
+        Self: ~const Destruct,
+    {
+        *boxed
+    }
+}
+
+impl<T> Box<[T]> {
+    /// Constructs a new boxed slice with uninitialized contents.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let mut values = Box::<[u32]>::new_uninit_slice(3);
+    ///
+    /// let values = unsafe {
+    ///     // Deferred initialization:
+    ///     values[0].as_mut_ptr().write(1);
+    ///     values[1].as_mut_ptr().write(2);
+    ///     values[2].as_mut_ptr().write(3);
+    ///
+    ///     values.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*values, [1, 2, 3])
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    pub fn new_uninit_slice(len: usize) -> Box<[mem::MaybeUninit<T>]> {
+        unsafe { RawVec::with_capacity(len).into_box(len) }
+    }
+
+    /// Constructs a new boxed slice with uninitialized contents, with the memory
+    /// being filled with `0` bytes.
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let values = Box::<[u32]>::new_zeroed_slice(3);
+    /// let values = unsafe { values.assume_init() };
+    ///
+    /// assert_eq!(*values, [0, 0, 0])
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    pub fn new_zeroed_slice(len: usize) -> Box<[mem::MaybeUninit<T>]> {
+        unsafe { RawVec::with_capacity_zeroed(len).into_box(len) }
+    }
+
+    /// Constructs a new boxed slice with uninitialized contents. Returns an error if
+    /// the allocation fails
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// let mut values = Box::<[u32]>::try_new_uninit_slice(3)?;
+    /// let values = unsafe {
+    ///     // Deferred initialization:
+    ///     values[0].as_mut_ptr().write(1);
+    ///     values[1].as_mut_ptr().write(2);
+    ///     values[2].as_mut_ptr().write(3);
+    ///     values.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*values, [1, 2, 3]);
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn try_new_uninit_slice(len: usize) -> Result<Box<[mem::MaybeUninit<T>]>, AllocError> {
+        unsafe {
+            let layout = match Layout::array::<mem::MaybeUninit<T>>(len) {
+                Ok(l) => l,
+                Err(_) => return Err(AllocError),
+            };
+            let ptr = Global.allocate(layout)?;
+            Ok(RawVec::from_raw_parts_in(ptr.as_mut_ptr() as *mut _, len, Global).into_box(len))
+        }
+    }
+
+    /// Constructs a new boxed slice with uninitialized contents, with the memory
+    /// being filled with `0` bytes. Returns an error if the allocation fails
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// let values = Box::<[u32]>::try_new_zeroed_slice(3)?;
+    /// let values = unsafe { values.assume_init() };
+    ///
+    /// assert_eq!(*values, [0, 0, 0]);
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn try_new_zeroed_slice(len: usize) -> Result<Box<[mem::MaybeUninit<T>]>, AllocError> {
+        unsafe {
+            let layout = match Layout::array::<mem::MaybeUninit<T>>(len) {
+                Ok(l) => l,
+                Err(_) => return Err(AllocError),
+            };
+            let ptr = Global.allocate_zeroed(layout)?;
+            Ok(RawVec::from_raw_parts_in(ptr.as_mut_ptr() as *mut _, len, Global).into_box(len))
+        }
+    }
+}
+
+impl<T, A: Allocator> Box<[T], A> {
+    /// Constructs a new boxed slice with uninitialized contents in the provided allocator.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut values = Box::<[u32], _>::new_uninit_slice_in(3, System);
+    ///
+    /// let values = unsafe {
+    ///     // Deferred initialization:
+    ///     values[0].as_mut_ptr().write(1);
+    ///     values[1].as_mut_ptr().write(2);
+    ///     values[2].as_mut_ptr().write(3);
+    ///
+    ///     values.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*values, [1, 2, 3])
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    pub fn new_uninit_slice_in(len: usize, alloc: A) -> Box<[mem::MaybeUninit<T>], A> {
+        unsafe { RawVec::with_capacity_in(len, alloc).into_box(len) }
+    }
+
+    /// Constructs a new boxed slice with uninitialized contents in the provided allocator,
+    /// with the memory being filled with `0` bytes.
+    ///
+    /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage
+    /// of this method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, new_uninit)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let values = Box::<[u32], _>::new_zeroed_slice_in(3, System);
+    /// let values = unsafe { values.assume_init() };
+    ///
+    /// assert_eq!(*values, [0, 0, 0])
+    /// ```
+    ///
+    /// [zeroed]: mem::MaybeUninit::zeroed
+    #[cfg(not(no_global_oom_handling))]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "new_uninit", issue = "63291")]
+    #[must_use]
+    pub fn new_zeroed_slice_in(len: usize, alloc: A) -> Box<[mem::MaybeUninit<T>], A> {
+        unsafe { RawVec::with_capacity_zeroed_in(len, alloc).into_box(len) }
+    }
+}
+
+impl<T, A: Allocator> Box<mem::MaybeUninit<T>, A> {
+    /// Converts to `Box<T, A>`.
+    ///
+    /// # Safety
+    ///
+    /// As with [`MaybeUninit::assume_init`],
+    /// it is up to the caller to guarantee that the value
+    /// really is in an initialized state.
+    /// Calling this when the content is not yet fully initialized
+    /// causes immediate undefined behavior.
+    ///
+    /// [`MaybeUninit::assume_init`]: mem::MaybeUninit::assume_init
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let mut five = Box::<u32>::new_uninit();
+    ///
+    /// let five: Box<u32> = unsafe {
+    ///     // Deferred initialization:
+    ///     five.as_mut_ptr().write(5);
+    ///
+    ///     five.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*five, 5)
+    /// ```
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const unsafe fn assume_init(self) -> Box<T, A> {
+        let (raw, alloc) = Box::into_raw_with_allocator(self);
+        unsafe { Box::from_raw_in(raw as *mut T, alloc) }
+    }
+
+    /// Writes the value and converts to `Box<T, A>`.
+    ///
+    /// This method converts the box similarly to [`Box::assume_init`] but
+    /// writes `value` into it before conversion thus guaranteeing safety.
+    /// In some scenarios use of this method may improve performance because
+    /// the compiler may be able to optimize copying from stack.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let big_box = Box::<[usize; 1024]>::new_uninit();
+    ///
+    /// let mut array = [0; 1024];
+    /// for (i, place) in array.iter_mut().enumerate() {
+    ///     *place = i;
+    /// }
+    ///
+    /// // The optimizer may be able to elide this copy, so previous code writes
+    /// // to heap directly.
+    /// let big_box = Box::write(big_box, array);
+    ///
+    /// for (i, x) in big_box.iter().enumerate() {
+    ///     assert_eq!(*x, i);
+    /// }
+    /// ```
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const fn write(mut boxed: Self, value: T) -> Box<T, A> {
+        unsafe {
+            (*boxed).write(value);
+            boxed.assume_init()
+        }
+    }
+}
+
+impl<T, A: Allocator> Box<[mem::MaybeUninit<T>], A> {
+    /// Converts to `Box<[T], A>`.
+    ///
+    /// # Safety
+    ///
+    /// As with [`MaybeUninit::assume_init`],
+    /// it is up to the caller to guarantee that the values
+    /// really are in an initialized state.
+    /// Calling this when the content is not yet fully initialized
+    /// causes immediate undefined behavior.
+    ///
+    /// [`MaybeUninit::assume_init`]: mem::MaybeUninit::assume_init
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(new_uninit)]
+    ///
+    /// let mut values = Box::<[u32]>::new_uninit_slice(3);
+    ///
+    /// let values = unsafe {
+    ///     // Deferred initialization:
+    ///     values[0].as_mut_ptr().write(1);
+    ///     values[1].as_mut_ptr().write(2);
+    ///     values[2].as_mut_ptr().write(3);
+    ///
+    ///     values.assume_init()
+    /// };
+    ///
+    /// assert_eq!(*values, [1, 2, 3])
+    /// ```
+    #[unstable(feature = "new_uninit", issue = "63291")]
+    #[inline]
+    pub unsafe fn assume_init(self) -> Box<[T], A> {
+        let (raw, alloc) = Box::into_raw_with_allocator(self);
+        unsafe { Box::from_raw_in(raw as *mut [T], alloc) }
+    }
+}
+
+impl<T: ?Sized> Box<T> {
+    /// Constructs a box from a raw pointer.
+    ///
+    /// After calling this function, the raw pointer is owned by the
+    /// resulting `Box`. Specifically, the `Box` destructor will call
+    /// the destructor of `T` and free the allocated memory. For this
+    /// to be safe, the memory must have been allocated in accordance
+    /// with the [memory layout] used by `Box` .
+    ///
+    /// # Safety
+    ///
+    /// This function is unsafe because improper use may lead to
+    /// memory problems. For example, a double-free may occur if the
+    /// function is called twice on the same raw pointer.
+    ///
+    /// The safety conditions are described in the [memory layout] section.
+    ///
+    /// # Examples
+    ///
+    /// Recreate a `Box` which was previously converted to a raw pointer
+    /// using [`Box::into_raw`]:
+    /// ```
+    /// let x = Box::new(5);
+    /// let ptr = Box::into_raw(x);
+    /// let x = unsafe { Box::from_raw(ptr) };
+    /// ```
+    /// Manually create a `Box` from scratch by using the global allocator:
+    /// ```
+    /// use std::alloc::{alloc, Layout};
+    ///
+    /// unsafe {
+    ///     let ptr = alloc(Layout::new::<i32>()) as *mut i32;
+    ///     // In general .write is required to avoid attempting to destruct
+    ///     // the (uninitialized) previous contents of `ptr`, though for this
+    ///     // simple example `*ptr = 5` would have worked as well.
+    ///     ptr.write(5);
+    ///     let x = Box::from_raw(ptr);
+    /// }
+    /// ```
+    ///
+    /// [memory layout]: self#memory-layout
+    /// [`Layout`]: crate::Layout
+    #[stable(feature = "box_raw", since = "1.4.0")]
+    #[inline]
+    pub unsafe fn from_raw(raw: *mut T) -> Self {
+        unsafe { Self::from_raw_in(raw, Global) }
+    }
+}
+
+impl<T: ?Sized, A: Allocator> Box<T, A> {
+    /// Constructs a box from a raw pointer in the given allocator.
+    ///
+    /// After calling this function, the raw pointer is owned by the
+    /// resulting `Box`. Specifically, the `Box` destructor will call
+    /// the destructor of `T` and free the allocated memory. For this
+    /// to be safe, the memory must have been allocated in accordance
+    /// with the [memory layout] used by `Box` .
+    ///
+    /// # Safety
+    ///
+    /// This function is unsafe because improper use may lead to
+    /// memory problems. For example, a double-free may occur if the
+    /// function is called twice on the same raw pointer.
+    ///
+    ///
+    /// # Examples
+    ///
+    /// Recreate a `Box` which was previously converted to a raw pointer
+    /// using [`Box::into_raw_with_allocator`]:
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let x = Box::new_in(5, System);
+    /// let (ptr, alloc) = Box::into_raw_with_allocator(x);
+    /// let x = unsafe { Box::from_raw_in(ptr, alloc) };
+    /// ```
+    /// Manually create a `Box` from scratch by using the system allocator:
+    /// ```
+    /// #![feature(allocator_api, slice_ptr_get)]
+    ///
+    /// use std::alloc::{Allocator, Layout, System};
+    ///
+    /// unsafe {
+    ///     let ptr = System.allocate(Layout::new::<i32>())?.as_mut_ptr() as *mut i32;
+    ///     // In general .write is required to avoid attempting to destruct
+    ///     // the (uninitialized) previous contents of `ptr`, though for this
+    ///     // simple example `*ptr = 5` would have worked as well.
+    ///     ptr.write(5);
+    ///     let x = Box::from_raw_in(ptr, System);
+    /// }
+    /// # Ok::<(), std::alloc::AllocError>(())
+    /// ```
+    ///
+    /// [memory layout]: self#memory-layout
+    /// [`Layout`]: crate::Layout
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const unsafe fn from_raw_in(raw: *mut T, alloc: A) -> Self {
+        Box(unsafe { Unique::new_unchecked(raw) }, alloc)
+    }
+
+    /// Consumes the `Box`, returning a wrapped raw pointer.
+    ///
+    /// The pointer will be properly aligned and non-null.
+    ///
+    /// After calling this function, the caller is responsible for the
+    /// memory previously managed by the `Box`. In particular, the
+    /// caller should properly destroy `T` and release the memory, taking
+    /// into account the [memory layout] used by `Box`. The easiest way to
+    /// do this is to convert the raw pointer back into a `Box` with the
+    /// [`Box::from_raw`] function, allowing the `Box` destructor to perform
+    /// the cleanup.
+    ///
+    /// Note: this is an associated function, which means that you have
+    /// to call it as `Box::into_raw(b)` instead of `b.into_raw()`. This
+    /// is so that there is no conflict with a method on the inner type.
+    ///
+    /// # Examples
+    /// Converting the raw pointer back into a `Box` with [`Box::from_raw`]
+    /// for automatic cleanup:
+    /// ```
+    /// let x = Box::new(String::from("Hello"));
+    /// let ptr = Box::into_raw(x);
+    /// let x = unsafe { Box::from_raw(ptr) };
+    /// ```
+    /// Manual cleanup by explicitly running the destructor and deallocating
+    /// the memory:
+    /// ```
+    /// use std::alloc::{dealloc, Layout};
+    /// use std::ptr;
+    ///
+    /// let x = Box::new(String::from("Hello"));
+    /// let p = Box::into_raw(x);
+    /// unsafe {
+    ///     ptr::drop_in_place(p);
+    ///     dealloc(p as *mut u8, Layout::new::<String>());
+    /// }
+    /// ```
+    ///
+    /// [memory layout]: self#memory-layout
+    #[stable(feature = "box_raw", since = "1.4.0")]
+    #[inline]
+    pub fn into_raw(b: Self) -> *mut T {
+        Self::into_raw_with_allocator(b).0
+    }
+
+    /// Consumes the `Box`, returning a wrapped raw pointer and the allocator.
+    ///
+    /// The pointer will be properly aligned and non-null.
+    ///
+    /// After calling this function, the caller is responsible for the
+    /// memory previously managed by the `Box`. In particular, the
+    /// caller should properly destroy `T` and release the memory, taking
+    /// into account the [memory layout] used by `Box`. The easiest way to
+    /// do this is to convert the raw pointer back into a `Box` with the
+    /// [`Box::from_raw_in`] function, allowing the `Box` destructor to perform
+    /// the cleanup.
+    ///
+    /// Note: this is an associated function, which means that you have
+    /// to call it as `Box::into_raw_with_allocator(b)` instead of `b.into_raw_with_allocator()`. This
+    /// is so that there is no conflict with a method on the inner type.
+    ///
+    /// # Examples
+    /// Converting the raw pointer back into a `Box` with [`Box::from_raw_in`]
+    /// for automatic cleanup:
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let x = Box::new_in(String::from("Hello"), System);
+    /// let (ptr, alloc) = Box::into_raw_with_allocator(x);
+    /// let x = unsafe { Box::from_raw_in(ptr, alloc) };
+    /// ```
+    /// Manual cleanup by explicitly running the destructor and deallocating
+    /// the memory:
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::{Allocator, Layout, System};
+    /// use std::ptr::{self, NonNull};
+    ///
+    /// let x = Box::new_in(String::from("Hello"), System);
+    /// let (ptr, alloc) = Box::into_raw_with_allocator(x);
+    /// unsafe {
+    ///     ptr::drop_in_place(ptr);
+    ///     let non_null = NonNull::new_unchecked(ptr);
+    ///     alloc.deallocate(non_null.cast(), Layout::new::<String>());
+    /// }
+    /// ```
+    ///
+    /// [memory layout]: self#memory-layout
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const fn into_raw_with_allocator(b: Self) -> (*mut T, A) {
+        let (leaked, alloc) = Box::into_unique(b);
+        (leaked.as_ptr(), alloc)
+    }
+
+    #[unstable(
+        feature = "ptr_internals",
+        issue = "none",
+        reason = "use `Box::leak(b).into()` or `Unique::from(Box::leak(b))` instead"
+    )]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    #[doc(hidden)]
+    pub const fn into_unique(b: Self) -> (Unique<T>, A) {
+        // Box is recognized as a "unique pointer" by Stacked Borrows, but internally it is a
+        // raw pointer for the type system. Turning it directly into a raw pointer would not be
+        // recognized as "releasing" the unique pointer to permit aliased raw accesses,
+        // so all raw pointer methods have to go through `Box::leak`. Turning *that* to a raw pointer
+        // behaves correctly.
+        let alloc = unsafe { ptr::read(&b.1) };
+        (Unique::from(Box::leak(b)), alloc)
+    }
+
+    /// Returns a reference to the underlying allocator.
+    ///
+    /// Note: this is an associated function, which means that you have
+    /// to call it as `Box::allocator(&b)` instead of `b.allocator()`. This
+    /// is so that there is no conflict with a method on the inner type.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const fn allocator(b: &Self) -> &A {
+        &b.1
+    }
+
+    /// Consumes and leaks the `Box`, returning a mutable reference,
+    /// `&'a mut T`. Note that the type `T` must outlive the chosen lifetime
+    /// `'a`. If the type has only static references, or none at all, then this
+    /// may be chosen to be `'static`.
+    ///
+    /// This function is mainly useful for data that lives for the remainder of
+    /// the program's life. Dropping the returned reference will cause a memory
+    /// leak. If this is not acceptable, the reference should first be wrapped
+    /// with the [`Box::from_raw`] function producing a `Box`. This `Box` can
+    /// then be dropped which will properly destroy `T` and release the
+    /// allocated memory.
+    ///
+    /// Note: this is an associated function, which means that you have
+    /// to call it as `Box::leak(b)` instead of `b.leak()`. This
+    /// is so that there is no conflict with a method on the inner type.
+    ///
+    /// # Examples
+    ///
+    /// Simple usage:
+    ///
+    /// ```
+    /// let x = Box::new(41);
+    /// let static_ref: &'static mut usize = Box::leak(x);
+    /// *static_ref += 1;
+    /// assert_eq!(*static_ref, 42);
+    /// ```
+    ///
+    /// Unsized data:
+    ///
+    /// ```
+    /// let x = vec![1, 2, 3].into_boxed_slice();
+    /// let static_ref = Box::leak(x);
+    /// static_ref[0] = 4;
+    /// assert_eq!(*static_ref, [4, 2, 3]);
+    /// ```
+    #[stable(feature = "box_leak", since = "1.26.0")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    #[inline]
+    pub const fn leak<'a>(b: Self) -> &'a mut T
+    where
+        A: 'a,
+    {
+        unsafe { &mut *mem::ManuallyDrop::new(b).0.as_ptr() }
+    }
+
+    /// Converts a `Box<T>` into a `Pin<Box<T>>`
+    ///
+    /// This conversion does not allocate on the heap and happens in place.
+    ///
+    /// This is also available via [`From`].
+    #[unstable(feature = "box_into_pin", issue = "62370")]
+    #[rustc_const_unstable(feature = "const_box", issue = "92521")]
+    pub const fn into_pin(boxed: Self) -> Pin<Self>
+    where
+        A: 'static,
+    {
+        // It's not possible to move or replace the insides of a `Pin<Box<T>>`
+        // when `T: !Unpin`,  so it's safe to pin it directly without any
+        // additional requirements.
+        unsafe { Pin::new_unchecked(boxed) }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+unsafe impl<#[may_dangle] T: ?Sized, A: Allocator> Drop for Box<T, A> {
+    fn drop(&mut self) {
+        // FIXME: Do nothing, drop is currently performed by compiler.
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Default> Default for Box<T> {
+    /// Creates a `Box<T>`, with the `Default` value for T.
+    fn default() -> Self {
+        box T::default()
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_const_unstable(feature = "const_default_impls", issue = "87864")]
+impl<T> const Default for Box<[T]> {
+    fn default() -> Self {
+        let ptr: Unique<[T]> = Unique::<[T; 0]>::dangling();
+        Box(ptr, Global)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "default_box_extra", since = "1.17.0")]
+#[rustc_const_unstable(feature = "const_default_impls", issue = "87864")]
+impl const Default for Box<str> {
+    fn default() -> Self {
+        // SAFETY: This is the same as `Unique::cast<U>` but with an unsized `U = str`.
+        let ptr: Unique<str> = unsafe {
+            let bytes: Unique<[u8]> = Unique::<[u8; 0]>::dangling();
+            Unique::new_unchecked(bytes.as_ptr() as *mut str)
+        };
+        Box(ptr, Global)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Clone, A: Allocator + Clone> Clone for Box<T, A> {
+    /// Returns a new box with a `clone()` of this box's contents.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let x = Box::new(5);
+    /// let y = x.clone();
+    ///
+    /// // The value is the same
+    /// assert_eq!(x, y);
+    ///
+    /// // But they are unique objects
+    /// assert_ne!(&*x as *const i32, &*y as *const i32);
+    /// ```
+    #[inline]
+    fn clone(&self) -> Self {
+        // Pre-allocate memory to allow writing the cloned value directly.
+        let mut boxed = Self::new_uninit_in(self.1.clone());
+        unsafe {
+            (**self).write_clone_into_raw(boxed.as_mut_ptr());
+            boxed.assume_init()
+        }
+    }
+
+    /// Copies `source`'s contents into `self` without creating a new allocation.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let x = Box::new(5);
+    /// let mut y = Box::new(10);
+    /// let yp: *const i32 = &*y;
+    ///
+    /// y.clone_from(&x);
+    ///
+    /// // The value is the same
+    /// assert_eq!(x, y);
+    ///
+    /// // And no allocation occurred
+    /// assert_eq!(yp, &*y);
+    /// ```
+    #[inline]
+    fn clone_from(&mut self, source: &Self) {
+        (**self).clone_from(&(**source));
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_slice_clone", since = "1.3.0")]
+impl Clone for Box<str> {
+    fn clone(&self) -> Self {
+        // this makes a copy of the data
+        let buf: Box<[u8]> = self.as_bytes().into();
+        unsafe { from_boxed_utf8_unchecked(buf) }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized + PartialEq, A: Allocator> PartialEq for Box<T, A> {
+    #[inline]
+    fn eq(&self, other: &Self) -> bool {
+        PartialEq::eq(&**self, &**other)
+    }
+    #[inline]
+    fn ne(&self, other: &Self) -> bool {
+        PartialEq::ne(&**self, &**other)
+    }
+}
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized + PartialOrd, A: Allocator> PartialOrd for Box<T, A> {
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        PartialOrd::partial_cmp(&**self, &**other)
+    }
+    #[inline]
+    fn lt(&self, other: &Self) -> bool {
+        PartialOrd::lt(&**self, &**other)
+    }
+    #[inline]
+    fn le(&self, other: &Self) -> bool {
+        PartialOrd::le(&**self, &**other)
+    }
+    #[inline]
+    fn ge(&self, other: &Self) -> bool {
+        PartialOrd::ge(&**self, &**other)
+    }
+    #[inline]
+    fn gt(&self, other: &Self) -> bool {
+        PartialOrd::gt(&**self, &**other)
+    }
+}
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized + Ord, A: Allocator> Ord for Box<T, A> {
+    #[inline]
+    fn cmp(&self, other: &Self) -> Ordering {
+        Ord::cmp(&**self, &**other)
+    }
+}
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized + Eq, A: Allocator> Eq for Box<T, A> {}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized + Hash, A: Allocator> Hash for Box<T, A> {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        (**self).hash(state);
+    }
+}
+
+#[stable(feature = "indirect_hasher_impl", since = "1.22.0")]
+impl<T: ?Sized + Hasher, A: Allocator> Hasher for Box<T, A> {
+    fn finish(&self) -> u64 {
+        (**self).finish()
+    }
+    fn write(&mut self, bytes: &[u8]) {
+        (**self).write(bytes)
+    }
+    fn write_u8(&mut self, i: u8) {
+        (**self).write_u8(i)
+    }
+    fn write_u16(&mut self, i: u16) {
+        (**self).write_u16(i)
+    }
+    fn write_u32(&mut self, i: u32) {
+        (**self).write_u32(i)
+    }
+    fn write_u64(&mut self, i: u64) {
+        (**self).write_u64(i)
+    }
+    fn write_u128(&mut self, i: u128) {
+        (**self).write_u128(i)
+    }
+    fn write_usize(&mut self, i: usize) {
+        (**self).write_usize(i)
+    }
+    fn write_i8(&mut self, i: i8) {
+        (**self).write_i8(i)
+    }
+    fn write_i16(&mut self, i: i16) {
+        (**self).write_i16(i)
+    }
+    fn write_i32(&mut self, i: i32) {
+        (**self).write_i32(i)
+    }
+    fn write_i64(&mut self, i: i64) {
+        (**self).write_i64(i)
+    }
+    fn write_i128(&mut self, i: i128) {
+        (**self).write_i128(i)
+    }
+    fn write_isize(&mut self, i: isize) {
+        (**self).write_isize(i)
+    }
+    fn write_length_prefix(&mut self, len: usize) {
+        (**self).write_length_prefix(len)
+    }
+    fn write_str(&mut self, s: &str) {
+        (**self).write_str(s)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "from_for_ptrs", since = "1.6.0")]
+impl<T> From<T> for Box<T> {
+    /// Converts a `T` into a `Box<T>`
+    ///
+    /// The conversion allocates on the heap and moves `t`
+    /// from the stack into it.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let x = 5;
+    /// let boxed = Box::new(5);
+    ///
+    /// assert_eq!(Box::from(x), boxed);
+    /// ```
+    fn from(t: T) -> Self {
+        Box::new(t)
+    }
+}
+
+#[stable(feature = "pin", since = "1.33.0")]
+#[rustc_const_unstable(feature = "const_box", issue = "92521")]
+impl<T: ?Sized, A: Allocator> const From<Box<T, A>> for Pin<Box<T, A>>
+where
+    A: 'static,
+{
+    /// Converts a `Box<T>` into a `Pin<Box<T>>`
+    ///
+    /// This conversion does not allocate on the heap and happens in place.
+    fn from(boxed: Box<T, A>) -> Self {
+        Box::into_pin(boxed)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_from_slice", since = "1.17.0")]
+impl<T: Copy> From<&[T]> for Box<[T]> {
+    /// Converts a `&[T]` into a `Box<[T]>`
+    ///
+    /// This conversion allocates on the heap
+    /// and performs a copy of `slice`.
+    ///
+    /// # Examples
+    /// ```rust
+    /// // create a &[u8] which will be used to create a Box<[u8]>
+    /// let slice: &[u8] = &[104, 101, 108, 108, 111];
+    /// let boxed_slice: Box<[u8]> = Box::from(slice);
+    ///
+    /// println!("{boxed_slice:?}");
+    /// ```
+    fn from(slice: &[T]) -> Box<[T]> {
+        let len = slice.len();
+        let buf = RawVec::with_capacity(len);
+        unsafe {
+            ptr::copy_nonoverlapping(slice.as_ptr(), buf.ptr(), len);
+            buf.into_box(slice.len()).assume_init()
+        }
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_from_cow", since = "1.45.0")]
+impl<T: Copy> From<Cow<'_, [T]>> for Box<[T]> {
+    /// Converts a `Cow<'_, [T]>` into a `Box<[T]>`
+    ///
+    /// When `cow` is the `Cow::Borrowed` variant, this
+    /// conversion allocates on the heap and copies the
+    /// underlying slice. Otherwise, it will try to reuse the owned
+    /// `Vec`'s allocation.
+    #[inline]
+    fn from(cow: Cow<'_, [T]>) -> Box<[T]> {
+        match cow {
+            Cow::Borrowed(slice) => Box::from(slice),
+            Cow::Owned(slice) => Box::from(slice),
+        }
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_from_slice", since = "1.17.0")]
+impl From<&str> for Box<str> {
+    /// Converts a `&str` into a `Box<str>`
+    ///
+    /// This conversion allocates on the heap
+    /// and performs a copy of `s`.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let boxed: Box<str> = Box::from("hello");
+    /// println!("{boxed}");
+    /// ```
+    #[inline]
+    fn from(s: &str) -> Box<str> {
+        unsafe { from_boxed_utf8_unchecked(Box::from(s.as_bytes())) }
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_from_cow", since = "1.45.0")]
+impl From<Cow<'_, str>> for Box<str> {
+    /// Converts a `Cow<'_, str>` into a `Box<str>`
+    ///
+    /// When `cow` is the `Cow::Borrowed` variant, this
+    /// conversion allocates on the heap and copies the
+    /// underlying `str`. Otherwise, it will try to reuse the owned
+    /// `String`'s allocation.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use std::borrow::Cow;
+    ///
+    /// let unboxed = Cow::Borrowed("hello");
+    /// let boxed: Box<str> = Box::from(unboxed);
+    /// println!("{boxed}");
+    /// ```
+    ///
+    /// ```rust
+    /// # use std::borrow::Cow;
+    /// let unboxed = Cow::Owned("hello".to_string());
+    /// let boxed: Box<str> = Box::from(unboxed);
+    /// println!("{boxed}");
+    /// ```
+    #[inline]
+    fn from(cow: Cow<'_, str>) -> Box<str> {
+        match cow {
+            Cow::Borrowed(s) => Box::from(s),
+            Cow::Owned(s) => Box::from(s),
+        }
+    }
+}
+
+#[stable(feature = "boxed_str_conv", since = "1.19.0")]
+impl<A: Allocator> From<Box<str, A>> for Box<[u8], A> {
+    /// Converts a `Box<str>` into a `Box<[u8]>`
+    ///
+    /// This conversion does not allocate on the heap and happens in place.
+    ///
+    /// # Examples
+    /// ```rust
+    /// // create a Box<str> which will be used to create a Box<[u8]>
+    /// let boxed: Box<str> = Box::from("hello");
+    /// let boxed_str: Box<[u8]> = Box::from(boxed);
+    ///
+    /// // create a &[u8] which will be used to create a Box<[u8]>
+    /// let slice: &[u8] = &[104, 101, 108, 108, 111];
+    /// let boxed_slice = Box::from(slice);
+    ///
+    /// assert_eq!(boxed_slice, boxed_str);
+    /// ```
+    #[inline]
+    fn from(s: Box<str, A>) -> Self {
+        let (raw, alloc) = Box::into_raw_with_allocator(s);
+        unsafe { Box::from_raw_in(raw as *mut [u8], alloc) }
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_from_array", since = "1.45.0")]
+impl<T, const N: usize> From<[T; N]> for Box<[T]> {
+    /// Converts a `[T; N]` into a `Box<[T]>`
+    ///
+    /// This conversion moves the array to newly heap-allocated memory.
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let boxed: Box<[u8]> = Box::from([4, 2]);
+    /// println!("{boxed:?}");
+    /// ```
+    fn from(array: [T; N]) -> Box<[T]> {
+        box array
+    }
+}
+
+#[stable(feature = "boxed_slice_try_from", since = "1.43.0")]
+impl<T, const N: usize> TryFrom<Box<[T]>> for Box<[T; N]> {
+    type Error = Box<[T]>;
+
+    /// Attempts to convert a `Box<[T]>` into a `Box<[T; N]>`.
+    ///
+    /// The conversion occurs in-place and does not require a
+    /// new memory allocation.
+    ///
+    /// # Errors
+    ///
+    /// Returns the old `Box<[T]>` in the `Err` variant if
+    /// `boxed_slice.len()` does not equal `N`.
+    fn try_from(boxed_slice: Box<[T]>) -> Result<Self, Self::Error> {
+        if boxed_slice.len() == N {
+            Ok(unsafe { Box::from_raw(Box::into_raw(boxed_slice) as *mut [T; N]) })
+        } else {
+            Err(boxed_slice)
+        }
+    }
+}
+
+impl<A: Allocator> Box<dyn Any, A> {
+    /// Attempt to downcast the box to a concrete type.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::any::Any;
+    ///
+    /// fn print_if_string(value: Box<dyn Any>) {
+    ///     if let Ok(string) = value.downcast::<String>() {
+    ///         println!("String ({}): {}", string.len(), string);
+    ///     }
+    /// }
+    ///
+    /// let my_string = "Hello World".to_string();
+    /// print_if_string(Box::new(my_string));
+    /// print_if_string(Box::new(0i8));
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn downcast<T: Any>(self) -> Result<Box<T, A>, Self> {
+        if self.is::<T>() { unsafe { Ok(self.downcast_unchecked::<T>()) } } else { Err(self) }
+    }
+
+    /// Downcasts the box to a concrete type.
+    ///
+    /// For a safe alternative see [`downcast`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(downcast_unchecked)]
+    ///
+    /// use std::any::Any;
+    ///
+    /// let x: Box<dyn Any> = Box::new(1_usize);
+    ///
+    /// unsafe {
+    ///     assert_eq!(*x.downcast_unchecked::<usize>(), 1);
+    /// }
+    /// ```
+    ///
+    /// # Safety
+    ///
+    /// The contained value must be of type `T`. Calling this method
+    /// with the incorrect type is *undefined behavior*.
+    ///
+    /// [`downcast`]: Self::downcast
+    #[inline]
+    #[unstable(feature = "downcast_unchecked", issue = "90850")]
+    pub unsafe fn downcast_unchecked<T: Any>(self) -> Box<T, A> {
+        debug_assert!(self.is::<T>());
+        unsafe {
+            let (raw, alloc): (*mut dyn Any, _) = Box::into_raw_with_allocator(self);
+            Box::from_raw_in(raw as *mut T, alloc)
+        }
+    }
+}
+
+impl<A: Allocator> Box<dyn Any + Send, A> {
+    /// Attempt to downcast the box to a concrete type.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::any::Any;
+    ///
+    /// fn print_if_string(value: Box<dyn Any + Send>) {
+    ///     if let Ok(string) = value.downcast::<String>() {
+    ///         println!("String ({}): {}", string.len(), string);
+    ///     }
+    /// }
+    ///
+    /// let my_string = "Hello World".to_string();
+    /// print_if_string(Box::new(my_string));
+    /// print_if_string(Box::new(0i8));
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn downcast<T: Any>(self) -> Result<Box<T, A>, Self> {
+        if self.is::<T>() { unsafe { Ok(self.downcast_unchecked::<T>()) } } else { Err(self) }
+    }
+
+    /// Downcasts the box to a concrete type.
+    ///
+    /// For a safe alternative see [`downcast`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(downcast_unchecked)]
+    ///
+    /// use std::any::Any;
+    ///
+    /// let x: Box<dyn Any + Send> = Box::new(1_usize);
+    ///
+    /// unsafe {
+    ///     assert_eq!(*x.downcast_unchecked::<usize>(), 1);
+    /// }
+    /// ```
+    ///
+    /// # Safety
+    ///
+    /// The contained value must be of type `T`. Calling this method
+    /// with the incorrect type is *undefined behavior*.
+    ///
+    /// [`downcast`]: Self::downcast
+    #[inline]
+    #[unstable(feature = "downcast_unchecked", issue = "90850")]
+    pub unsafe fn downcast_unchecked<T: Any>(self) -> Box<T, A> {
+        debug_assert!(self.is::<T>());
+        unsafe {
+            let (raw, alloc): (*mut (dyn Any + Send), _) = Box::into_raw_with_allocator(self);
+            Box::from_raw_in(raw as *mut T, alloc)
+        }
+    }
+}
+
+impl<A: Allocator> Box<dyn Any + Send + Sync, A> {
+    /// Attempt to downcast the box to a concrete type.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::any::Any;
+    ///
+    /// fn print_if_string(value: Box<dyn Any + Send + Sync>) {
+    ///     if let Ok(string) = value.downcast::<String>() {
+    ///         println!("String ({}): {}", string.len(), string);
+    ///     }
+    /// }
+    ///
+    /// let my_string = "Hello World".to_string();
+    /// print_if_string(Box::new(my_string));
+    /// print_if_string(Box::new(0i8));
+    /// ```
+    #[inline]
+    #[stable(feature = "box_send_sync_any_downcast", since = "1.51.0")]
+    pub fn downcast<T: Any>(self) -> Result<Box<T, A>, Self> {
+        if self.is::<T>() { unsafe { Ok(self.downcast_unchecked::<T>()) } } else { Err(self) }
+    }
+
+    /// Downcasts the box to a concrete type.
+    ///
+    /// For a safe alternative see [`downcast`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(downcast_unchecked)]
+    ///
+    /// use std::any::Any;
+    ///
+    /// let x: Box<dyn Any + Send + Sync> = Box::new(1_usize);
+    ///
+    /// unsafe {
+    ///     assert_eq!(*x.downcast_unchecked::<usize>(), 1);
+    /// }
+    /// ```
+    ///
+    /// # Safety
+    ///
+    /// The contained value must be of type `T`. Calling this method
+    /// with the incorrect type is *undefined behavior*.
+    ///
+    /// [`downcast`]: Self::downcast
+    #[inline]
+    #[unstable(feature = "downcast_unchecked", issue = "90850")]
+    pub unsafe fn downcast_unchecked<T: Any>(self) -> Box<T, A> {
+        debug_assert!(self.is::<T>());
+        unsafe {
+            let (raw, alloc): (*mut (dyn Any + Send + Sync), _) =
+                Box::into_raw_with_allocator(self);
+            Box::from_raw_in(raw as *mut T, alloc)
+        }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: fmt::Display + ?Sized, A: Allocator> fmt::Display for Box<T, A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(&**self, f)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: fmt::Debug + ?Sized, A: Allocator> fmt::Debug for Box<T, A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&**self, f)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: ?Sized, A: Allocator> fmt::Pointer for Box<T, A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // It's not possible to extract the inner Uniq directly from the Box,
+        // instead we cast it to a *const which aliases the Unique
+        let ptr: *const T = &**self;
+        fmt::Pointer::fmt(&ptr, f)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_const_unstable(feature = "const_box", issue = "92521")]
+impl<T: ?Sized, A: Allocator> const Deref for Box<T, A> {
+    type Target = T;
+
+    fn deref(&self) -> &T {
+        &**self
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_const_unstable(feature = "const_box", issue = "92521")]
+impl<T: ?Sized, A: Allocator> const DerefMut for Box<T, A> {
+    fn deref_mut(&mut self) -> &mut T {
+        &mut **self
+    }
+}
+
+#[unstable(feature = "receiver_trait", issue = "none")]
+impl<T: ?Sized, A: Allocator> Receiver for Box<T, A> {}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<I: Iterator + ?Sized, A: Allocator> Iterator for Box<I, A> {
+    type Item = I::Item;
+    fn next(&mut self) -> Option<I::Item> {
+        (**self).next()
+    }
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (**self).size_hint()
+    }
+    fn nth(&mut self, n: usize) -> Option<I::Item> {
+        (**self).nth(n)
+    }
+    fn last(self) -> Option<I::Item> {
+        BoxIter::last(self)
+    }
+}
+
+trait BoxIter {
+    type Item;
+    fn last(self) -> Option<Self::Item>;
+}
+
+impl<I: Iterator + ?Sized, A: Allocator> BoxIter for Box<I, A> {
+    type Item = I::Item;
+    default fn last(self) -> Option<I::Item> {
+        #[inline]
+        fn some<T>(_: Option<T>, x: T) -> Option<T> {
+            Some(x)
+        }
+
+        self.fold(None, some)
+    }
+}
+
+/// Specialization for sized `I`s that uses `I`s implementation of `last()`
+/// instead of the default.
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<I: Iterator, A: Allocator> BoxIter for Box<I, A> {
+    fn last(self) -> Option<I::Item> {
+        (*self).last()
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<I: DoubleEndedIterator + ?Sized, A: Allocator> DoubleEndedIterator for Box<I, A> {
+    fn next_back(&mut self) -> Option<I::Item> {
+        (**self).next_back()
+    }
+    fn nth_back(&mut self, n: usize) -> Option<I::Item> {
+        (**self).nth_back(n)
+    }
+}
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<I: ExactSizeIterator + ?Sized, A: Allocator> ExactSizeIterator for Box<I, A> {
+    fn len(&self) -> usize {
+        (**self).len()
+    }
+    fn is_empty(&self) -> bool {
+        (**self).is_empty()
+    }
+}
+
+#[stable(feature = "fused", since = "1.26.0")]
+impl<I: FusedIterator + ?Sized, A: Allocator> FusedIterator for Box<I, A> {}
+
+#[stable(feature = "boxed_closure_impls", since = "1.35.0")]
+impl<Args, F: FnOnce<Args> + ?Sized, A: Allocator> FnOnce<Args> for Box<F, A> {
+    type Output = <F as FnOnce<Args>>::Output;
+
+    extern "rust-call" fn call_once(self, args: Args) -> Self::Output {
+        <F as FnOnce<Args>>::call_once(*self, args)
+    }
+}
+
+#[stable(feature = "boxed_closure_impls", since = "1.35.0")]
+impl<Args, F: FnMut<Args> + ?Sized, A: Allocator> FnMut<Args> for Box<F, A> {
+    extern "rust-call" fn call_mut(&mut self, args: Args) -> Self::Output {
+        <F as FnMut<Args>>::call_mut(self, args)
+    }
+}
+
+#[stable(feature = "boxed_closure_impls", since = "1.35.0")]
+impl<Args, F: Fn<Args> + ?Sized, A: Allocator> Fn<Args> for Box<F, A> {
+    extern "rust-call" fn call(&self, args: Args) -> Self::Output {
+        <F as Fn<Args>>::call(self, args)
+    }
+}
+
+#[unstable(feature = "coerce_unsized", issue = "27732")]
+impl<T: ?Sized + Unsize<U>, U: ?Sized, A: Allocator> CoerceUnsized<Box<U, A>> for Box<T, A> {}
+
+#[unstable(feature = "dispatch_from_dyn", issue = "none")]
+impl<T: ?Sized + Unsize<U>, U: ?Sized> DispatchFromDyn<Box<U>> for Box<T, Global> {}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "boxed_slice_from_iter", since = "1.32.0")]
+impl<I> FromIterator<I> for Box<[I]> {
+    fn from_iter<T: IntoIterator<Item = I>>(iter: T) -> Self {
+        iter.into_iter().collect::<Vec<_>>().into_boxed_slice()
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "box_slice_clone", since = "1.3.0")]
+impl<T: Clone, A: Allocator + Clone> Clone for Box<[T], A> {
+    fn clone(&self) -> Self {
+        let alloc = Box::allocator(self).clone();
+        self.to_vec_in(alloc).into_boxed_slice()
+    }
+
+    fn clone_from(&mut self, other: &Self) {
+        if self.len() == other.len() {
+            self.clone_from_slice(&other);
+        } else {
+            *self = other.clone();
+        }
+    }
+}
+
+#[stable(feature = "box_borrow", since = "1.1.0")]
+impl<T: ?Sized, A: Allocator> borrow::Borrow<T> for Box<T, A> {
+    fn borrow(&self) -> &T {
+        &**self
+    }
+}
+
+#[stable(feature = "box_borrow", since = "1.1.0")]
+impl<T: ?Sized, A: Allocator> borrow::BorrowMut<T> for Box<T, A> {
+    fn borrow_mut(&mut self) -> &mut T {
+        &mut **self
+    }
+}
+
+#[stable(since = "1.5.0", feature = "smart_ptr_as_ref")]
+impl<T: ?Sized, A: Allocator> AsRef<T> for Box<T, A> {
+    fn as_ref(&self) -> &T {
+        &**self
+    }
+}
+
+#[stable(since = "1.5.0", feature = "smart_ptr_as_ref")]
+impl<T: ?Sized, A: Allocator> AsMut<T> for Box<T, A> {
+    fn as_mut(&mut self) -> &mut T {
+        &mut **self
+    }
+}
+
+/* Nota bene
+ *
+ *  We could have chosen not to add this impl, and instead have written a
+ *  function of Pin<Box<T>> to Pin<T>. Such a function would not be sound,
+ *  because Box<T> implements Unpin even when T does not, as a result of
+ *  this impl.
+ *
+ *  We chose this API instead of the alternative for a few reasons:
+ *      - Logically, it is helpful to understand pinning in regard to the
+ *        memory region being pointed to. For this reason none of the
+ *        standard library pointer types support projecting through a pin
+ *        (Box<T> is the only pointer type in std for which this would be
+ *        safe.)
+ *      - It is in practice very useful to have Box<T> be unconditionally
+ *        Unpin because of trait objects, for which the structural auto
+ *        trait functionality does not apply (e.g., Box<dyn Foo> would
+ *        otherwise not be Unpin).
+ *
+ *  Another type with the same semantics as Box but only a conditional
+ *  implementation of `Unpin` (where `T: Unpin`) would be valid/safe, and
+ *  could have a method to project a Pin<T> from it.
+ */
+#[stable(feature = "pin", since = "1.33.0")]
+#[rustc_const_unstable(feature = "const_box", issue = "92521")]
+impl<T: ?Sized, A: Allocator> const Unpin for Box<T, A> where A: 'static {}
+
+#[unstable(feature = "generator_trait", issue = "43122")]
+impl<G: ?Sized + Generator<R> + Unpin, R, A: Allocator> Generator<R> for Box<G, A>
+where
+    A: 'static,
+{
+    type Yield = G::Yield;
+    type Return = G::Return;
+
+    fn resume(mut self: Pin<&mut Self>, arg: R) -> GeneratorState<Self::Yield, Self::Return> {
+        G::resume(Pin::new(&mut *self), arg)
+    }
+}
+
+#[unstable(feature = "generator_trait", issue = "43122")]
+impl<G: ?Sized + Generator<R>, R, A: Allocator> Generator<R> for Pin<Box<G, A>>
+where
+    A: 'static,
+{
+    type Yield = G::Yield;
+    type Return = G::Return;
+
+    fn resume(mut self: Pin<&mut Self>, arg: R) -> GeneratorState<Self::Yield, Self::Return> {
+        G::resume((*self).as_mut(), arg)
+    }
+}
+
+#[stable(feature = "futures_api", since = "1.36.0")]
+impl<F: ?Sized + Future + Unpin, A: Allocator> Future for Box<F, A>
+where
+    A: 'static,
+{
+    type Output = F::Output;
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        F::poll(Pin::new(&mut *self), cx)
+    }
+}
+
+#[unstable(feature = "async_iterator", issue = "79024")]
+impl<S: ?Sized + AsyncIterator + Unpin> AsyncIterator for Box<S> {
+    type Item = S::Item;
+
+    fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        Pin::new(&mut **self).poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (**self).size_hint()
+    }
+}
diff --git a/rust/alloc/collections/mod.rs b/rust/alloc/collections/mod.rs
new file mode 100644
index 000000000000..628a5b155673
--- /dev/null
+++ b/rust/alloc/collections/mod.rs
@@ -0,0 +1,154 @@
+//! Collection types.
+
+#![stable(feature = "rust1", since = "1.0.0")]
+
+#[cfg(not(no_global_oom_handling))]
+pub mod binary_heap;
+#[cfg(not(no_global_oom_handling))]
+mod btree;
+#[cfg(not(no_global_oom_handling))]
+pub mod linked_list;
+#[cfg(not(no_global_oom_handling))]
+pub mod vec_deque;
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+pub mod btree_map {
+    //! An ordered map based on a B-Tree.
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub use super::btree::map::*;
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+pub mod btree_set {
+    //! An ordered set based on a B-Tree.
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub use super::btree::set::*;
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+#[doc(no_inline)]
+pub use binary_heap::BinaryHeap;
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+#[doc(no_inline)]
+pub use btree_map::BTreeMap;
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+#[doc(no_inline)]
+pub use btree_set::BTreeSet;
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+#[doc(no_inline)]
+pub use linked_list::LinkedList;
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+#[doc(no_inline)]
+pub use vec_deque::VecDeque;
+
+use crate::alloc::{Layout, LayoutError};
+use core::fmt::Display;
+
+/// The error type for `try_reserve` methods.
+#[derive(Clone, PartialEq, Eq, Debug)]
+#[stable(feature = "try_reserve", since = "1.57.0")]
+pub struct TryReserveError {
+    kind: TryReserveErrorKind,
+}
+
+impl TryReserveError {
+    /// Details about the allocation that caused the error
+    #[inline]
+    #[must_use]
+    #[unstable(
+        feature = "try_reserve_kind",
+        reason = "Uncertain how much info should be exposed",
+        issue = "48043"
+    )]
+    pub fn kind(&self) -> TryReserveErrorKind {
+        self.kind.clone()
+    }
+}
+
+/// Details of the allocation that caused a `TryReserveError`
+#[derive(Clone, PartialEq, Eq, Debug)]
+#[unstable(
+    feature = "try_reserve_kind",
+    reason = "Uncertain how much info should be exposed",
+    issue = "48043"
+)]
+pub enum TryReserveErrorKind {
+    /// Error due to the computed capacity exceeding the collection's maximum
+    /// (usually `isize::MAX` bytes).
+    CapacityOverflow,
+
+    /// The memory allocator returned an error
+    AllocError {
+        /// The layout of allocation request that failed
+        layout: Layout,
+
+        #[doc(hidden)]
+        #[unstable(
+            feature = "container_error_extra",
+            issue = "none",
+            reason = "\
+            Enable exposing the allocator’s custom error value \
+            if an associated type is added in the future: \
+            https://github.com/rust-lang/wg-allocators/issues/23"
+        )]
+        non_exhaustive: (),
+    },
+}
+
+#[unstable(
+    feature = "try_reserve_kind",
+    reason = "Uncertain how much info should be exposed",
+    issue = "48043"
+)]
+impl From<TryReserveErrorKind> for TryReserveError {
+    #[inline]
+    fn from(kind: TryReserveErrorKind) -> Self {
+        Self { kind }
+    }
+}
+
+#[unstable(feature = "try_reserve_kind", reason = "new API", issue = "48043")]
+impl From<LayoutError> for TryReserveErrorKind {
+    /// Always evaluates to [`TryReserveErrorKind::CapacityOverflow`].
+    #[inline]
+    fn from(_: LayoutError) -> Self {
+        TryReserveErrorKind::CapacityOverflow
+    }
+}
+
+#[stable(feature = "try_reserve", since = "1.57.0")]
+impl Display for TryReserveError {
+    fn fmt(
+        &self,
+        fmt: &mut core::fmt::Formatter<'_>,
+    ) -> core::result::Result<(), core::fmt::Error> {
+        fmt.write_str("memory allocation failed")?;
+        let reason = match self.kind {
+            TryReserveErrorKind::CapacityOverflow => {
+                " because the computed capacity exceeded the collection's maximum"
+            }
+            TryReserveErrorKind::AllocError { .. } => {
+                " because the memory allocator returned a error"
+            }
+        };
+        fmt.write_str(reason)
+    }
+}
+
+/// An intermediate trait for specialization of `Extend`.
+#[doc(hidden)]
+trait SpecExtend<I: IntoIterator> {
+    /// Extends `self` with the contents of the given iterator.
+    fn spec_extend(&mut self, iter: I);
+}
diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs
new file mode 100644
index 000000000000..fd21b3671182
--- /dev/null
+++ b/rust/alloc/lib.rs
@@ -0,0 +1,236 @@
+//! # The Rust core allocation and collections library
+//!
+//! This library provides smart pointers and collections for managing
+//! heap-allocated values.
+//!
+//! This library, like libcore, normally doesn’t need to be used directly
+//! since its contents are re-exported in the [`std` crate](../std/index.html).
+//! Crates that use the `#![no_std]` attribute however will typically
+//! not depend on `std`, so they’d use this crate instead.
+//!
+//! ## Boxed values
+//!
+//! The [`Box`] type is a smart pointer type. There can only be one owner of a
+//! [`Box`], and the owner can decide to mutate the contents, which live on the
+//! heap.
+//!
+//! This type can be sent among threads efficiently as the size of a `Box` value
+//! is the same as that of a pointer. Tree-like data structures are often built
+//! with boxes because each node often has only one owner, the parent.
+//!
+//! ## Reference counted pointers
+//!
+//! The [`Rc`] type is a non-threadsafe reference-counted pointer type intended
+//! for sharing memory within a thread. An [`Rc`] pointer wraps a type, `T`, and
+//! only allows access to `&T`, a shared reference.
+//!
+//! This type is useful when inherited mutability (such as using [`Box`]) is too
+//! constraining for an application, and is often paired with the [`Cell`] or
+//! [`RefCell`] types in order to allow mutation.
+//!
+//! ## Atomically reference counted pointers
+//!
+//! The [`Arc`] type is the threadsafe equivalent of the [`Rc`] type. It
+//! provides all the same functionality of [`Rc`], except it requires that the
+//! contained type `T` is shareable. Additionally, [`Arc<T>`][`Arc`] is itself
+//! sendable while [`Rc<T>`][`Rc`] is not.
+//!
+//! This type allows for shared access to the contained data, and is often
+//! paired with synchronization primitives such as mutexes to allow mutation of
+//! shared resources.
+//!
+//! ## Collections
+//!
+//! Implementations of the most common general purpose data structures are
+//! defined in this library. They are re-exported through the
+//! [standard collections library](../std/collections/index.html).
+//!
+//! ## Heap interfaces
+//!
+//! The [`alloc`](alloc/index.html) module defines the low-level interface to the
+//! default global allocator. It is not compatible with the libc allocator API.
+//!
+//! [`Arc`]: sync
+//! [`Box`]: boxed
+//! [`Cell`]: core::cell
+//! [`Rc`]: rc
+//! [`RefCell`]: core::cell
+
+// To run liballoc tests without x.py without ending up with two copies of liballoc, Miri needs to be
+// able to "empty" this crate. See <https://github.com/rust-lang/miri-test-libstd/issues/4>.
+// rustc itself never sets the feature, so this line has no affect there.
+#![cfg(any(not(feature = "miri-test-libstd"), test, doctest))]
+#![allow(unused_attributes)]
+#![stable(feature = "alloc", since = "1.36.0")]
+#![doc(
+    html_playground_url = "https://play.rust-lang.org/",
+    issue_tracker_base_url = "https://github.com/rust-lang/rust/issues/",
+    test(no_crate_inject, attr(allow(unused_variables), deny(warnings)))
+)]
+#![doc(cfg_hide(
+    not(test),
+    not(any(test, bootstrap)),
+    any(not(feature = "miri-test-libstd"), test, doctest),
+    no_global_oom_handling,
+    not(no_global_oom_handling),
+    target_has_atomic = "ptr"
+))]
+#![no_std]
+#![needs_allocator]
+//
+// Lints:
+#![deny(unsafe_op_in_unsafe_fn)]
+#![warn(deprecated_in_future)]
+#![warn(missing_debug_implementations)]
+#![warn(missing_docs)]
+#![allow(explicit_outlives_requirements)]
+//
+// Library features:
+#![cfg_attr(not(no_global_oom_handling), feature(alloc_c_string))]
+#![feature(alloc_layout_extra)]
+#![feature(allocator_api)]
+#![feature(array_chunks)]
+#![feature(array_methods)]
+#![feature(array_windows)]
+#![feature(assert_matches)]
+#![feature(async_iterator)]
+#![feature(coerce_unsized)]
+#![cfg_attr(not(no_global_oom_handling), feature(const_alloc_error))]
+#![feature(const_box)]
+#![cfg_attr(not(no_global_oom_handling), feature(const_btree_new))]
+#![feature(const_cow_is_borrowed)]
+#![feature(const_convert)]
+#![feature(const_size_of_val)]
+#![feature(const_align_of_val)]
+#![feature(const_ptr_read)]
+#![feature(const_maybe_uninit_write)]
+#![feature(const_maybe_uninit_as_mut_ptr)]
+#![feature(const_refs_to_cell)]
+#![feature(core_c_str)]
+#![feature(core_intrinsics)]
+#![feature(core_ffi_c)]
+#![feature(const_eval_select)]
+#![feature(const_pin)]
+#![feature(cstr_from_bytes_until_nul)]
+#![feature(dispatch_from_dyn)]
+#![feature(exact_size_is_empty)]
+#![feature(extend_one)]
+#![feature(fmt_internals)]
+#![feature(fn_traits)]
+#![feature(hasher_prefixfree_extras)]
+#![feature(inplace_iteration)]
+#![feature(iter_advance_by)]
+#![feature(layout_for_ptr)]
+#![feature(maybe_uninit_slice)]
+#![cfg_attr(test, feature(new_uninit))]
+#![feature(nonnull_slice_from_raw_parts)]
+#![feature(pattern)]
+#![feature(ptr_internals)]
+#![feature(ptr_metadata)]
+#![feature(ptr_sub_ptr)]
+#![feature(receiver_trait)]
+#![feature(set_ptr_value)]
+#![feature(slice_group_by)]
+#![feature(slice_ptr_get)]
+#![feature(slice_ptr_len)]
+#![feature(slice_range)]
+#![feature(str_internals)]
+#![feature(strict_provenance)]
+#![feature(trusted_len)]
+#![feature(trusted_random_access)]
+#![feature(try_trait_v2)]
+#![feature(unchecked_math)]
+#![feature(unicode_internals)]
+#![feature(unsize)]
+//
+// Language features:
+#![feature(allocator_internals)]
+#![feature(allow_internal_unstable)]
+#![feature(associated_type_bounds)]
+#![feature(box_syntax)]
+#![feature(cfg_sanitize)]
+#![feature(const_deref)]
+#![feature(const_mut_refs)]
+#![feature(const_ptr_write)]
+#![feature(const_precise_live_drops)]
+#![feature(const_trait_impl)]
+#![feature(const_try)]
+#![feature(dropck_eyepatch)]
+#![feature(exclusive_range_pattern)]
+#![feature(fundamental)]
+#![cfg_attr(not(test), feature(generator_trait))]
+#![feature(hashmap_internals)]
+#![feature(lang_items)]
+#![feature(let_else)]
+#![feature(min_specialization)]
+#![feature(negative_impls)]
+#![feature(never_type)]
+#![feature(nll)] // Not necessary, but here to test the `nll` feature.
+#![feature(rustc_allow_const_fn_unstable)]
+#![feature(rustc_attrs)]
+#![feature(slice_internals)]
+#![feature(staged_api)]
+#![cfg_attr(test, feature(test))]
+#![feature(unboxed_closures)]
+#![feature(unsized_fn_params)]
+#![feature(c_unwind)]
+//
+// Rustdoc features:
+#![feature(doc_cfg)]
+#![feature(doc_cfg_hide)]
+// Technically, this is a bug in rustdoc: rustdoc sees the documentation on `#[lang = slice_alloc]`
+// blocks is for `&[T]`, which also has documentation using this feature in `core`, and gets mad
+// that the feature-gate isn't enabled. Ideally, it wouldn't check for the feature gate for docs
+// from other crates, but since this can only appear for lang items, it doesn't seem worth fixing.
+#![feature(intra_doc_pointers)]
+
+// Allow testing this library
+#[cfg(test)]
+#[macro_use]
+extern crate std;
+#[cfg(test)]
+extern crate test;
+
+// Module with internal macros used by other modules (needs to be included before other modules).
+#[macro_use]
+mod macros;
+
+mod raw_vec;
+
+// Heaps provided for low-level allocation strategies
+
+pub mod alloc;
+
+// Primitive types using the heaps above
+
+// Need to conditionally define the mod from `boxed.rs` to avoid
+// duplicating the lang-items when building in test cfg; but also need
+// to allow code to have `use boxed::Box;` declarations.
+#[cfg(not(test))]
+pub mod boxed;
+#[cfg(test)]
+mod boxed {
+    pub use std::boxed::Box;
+}
+pub mod borrow;
+pub mod collections;
+#[cfg(not(no_global_oom_handling))]
+pub mod ffi;
+pub mod fmt;
+pub mod rc;
+pub mod slice;
+pub mod str;
+pub mod string;
+#[cfg(target_has_atomic = "ptr")]
+pub mod sync;
+#[cfg(all(not(no_global_oom_handling), target_has_atomic = "ptr"))]
+pub mod task;
+#[cfg(test)]
+mod tests;
+pub mod vec;
+
+#[doc(hidden)]
+#[unstable(feature = "liballoc_internals", issue = "none", reason = "implementation detail")]
+pub mod __export {
+    pub use core::format_args;
+}
diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs
new file mode 100644
index 000000000000..4be5f6cf9ca5
--- /dev/null
+++ b/rust/alloc/raw_vec.rs
@@ -0,0 +1,518 @@
+#![unstable(feature = "raw_vec_internals", reason = "unstable const warnings", issue = "none")]
+
+use core::alloc::LayoutError;
+use core::cmp;
+use core::intrinsics;
+use core::mem::{self, ManuallyDrop, MaybeUninit};
+use core::ops::Drop;
+use core::ptr::{self, NonNull, Unique};
+use core::slice;
+
+#[cfg(not(no_global_oom_handling))]
+use crate::alloc::handle_alloc_error;
+use crate::alloc::{Allocator, Global, Layout};
+use crate::boxed::Box;
+use crate::collections::TryReserveError;
+use crate::collections::TryReserveErrorKind::*;
+
+#[cfg(test)]
+mod tests;
+
+#[cfg(not(no_global_oom_handling))]
+enum AllocInit {
+    /// The contents of the new memory are uninitialized.
+    Uninitialized,
+    /// The new memory is guaranteed to be zeroed.
+    Zeroed,
+}
+
+/// A low-level utility for more ergonomically allocating, reallocating, and deallocating
+/// a buffer of memory on the heap without having to worry about all the corner cases
+/// involved. This type is excellent for building your own data structures like Vec and VecDeque.
+/// In particular:
+///
+/// * Produces `Unique::dangling()` on zero-sized types.
+/// * Produces `Unique::dangling()` on zero-length allocations.
+/// * Avoids freeing `Unique::dangling()`.
+/// * Catches all overflows in capacity computations (promotes them to "capacity overflow" panics).
+/// * Guards against 32-bit systems allocating more than isize::MAX bytes.
+/// * Guards against overflowing your length.
+/// * Calls `handle_alloc_error` for fallible allocations.
+/// * Contains a `ptr::Unique` and thus endows the user with all related benefits.
+/// * Uses the excess returned from the allocator to use the largest available capacity.
+///
+/// This type does not in anyway inspect the memory that it manages. When dropped it *will*
+/// free its memory, but it *won't* try to drop its contents. It is up to the user of `RawVec`
+/// to handle the actual things *stored* inside of a `RawVec`.
+///
+/// Note that the excess of a zero-sized types is always infinite, so `capacity()` always returns
+/// `usize::MAX`. This means that you need to be careful when round-tripping this type with a
+/// `Box<[T]>`, since `capacity()` won't yield the length.
+#[allow(missing_debug_implementations)]
+pub(crate) struct RawVec<T, A: Allocator = Global> {
+    ptr: Unique<T>,
+    cap: usize,
+    alloc: A,
+}
+
+impl<T> RawVec<T, Global> {
+    /// HACK(Centril): This exists because stable `const fn` can only call stable `const fn`, so
+    /// they cannot call `Self::new()`.
+    ///
+    /// If you change `RawVec<T>::new` or dependencies, please take care to not introduce anything
+    /// that would truly const-call something unstable.
+    pub const NEW: Self = Self::new();
+
+    /// Creates the biggest possible `RawVec` (on the system heap)
+    /// without allocating. If `T` has positive size, then this makes a
+    /// `RawVec` with capacity `0`. If `T` is zero-sized, then it makes a
+    /// `RawVec` with capacity `usize::MAX`. Useful for implementing
+    /// delayed allocation.
+    #[must_use]
+    pub const fn new() -> Self {
+        Self::new_in(Global)
+    }
+
+    /// Creates a `RawVec` (on the system heap) with exactly the
+    /// capacity and alignment requirements for a `[T; capacity]`. This is
+    /// equivalent to calling `RawVec::new` when `capacity` is `0` or `T` is
+    /// zero-sized. Note that if `T` is zero-sized this means you will
+    /// *not* get a `RawVec` with the requested capacity.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the requested capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Aborts
+    ///
+    /// Aborts on OOM.
+    #[cfg(not(any(no_global_oom_handling, test)))]
+    #[must_use]
+    #[inline]
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity_in(capacity, Global)
+    }
+
+    /// Like `with_capacity`, but guarantees the buffer is zeroed.
+    #[cfg(not(any(no_global_oom_handling, test)))]
+    #[must_use]
+    #[inline]
+    pub fn with_capacity_zeroed(capacity: usize) -> Self {
+        Self::with_capacity_zeroed_in(capacity, Global)
+    }
+}
+
+impl<T, A: Allocator> RawVec<T, A> {
+    // Tiny Vecs are dumb. Skip to:
+    // - 8 if the element size is 1, because any heap allocators is likely
+    //   to round up a request of less than 8 bytes to at least 8 bytes.
+    // - 4 if elements are moderate-sized (<= 1 KiB).
+    // - 1 otherwise, to avoid wasting too much space for very short Vecs.
+    pub(crate) const MIN_NON_ZERO_CAP: usize = if mem::size_of::<T>() == 1 {
+        8
+    } else if mem::size_of::<T>() <= 1024 {
+        4
+    } else {
+        1
+    };
+
+    /// Like `new`, but parameterized over the choice of allocator for
+    /// the returned `RawVec`.
+    pub const fn new_in(alloc: A) -> Self {
+        // `cap: 0` means "unallocated". zero-sized types are ignored.
+        Self { ptr: Unique::dangling(), cap: 0, alloc }
+    }
+
+    /// Like `with_capacity`, but parameterized over the choice of
+    /// allocator for the returned `RawVec`.
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    pub fn with_capacity_in(capacity: usize, alloc: A) -> Self {
+        Self::allocate_in(capacity, AllocInit::Uninitialized, alloc)
+    }
+
+    /// Like `with_capacity_zeroed`, but parameterized over the choice
+    /// of allocator for the returned `RawVec`.
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    pub fn with_capacity_zeroed_in(capacity: usize, alloc: A) -> Self {
+        Self::allocate_in(capacity, AllocInit::Zeroed, alloc)
+    }
+
+    /// Converts the entire buffer into `Box<[MaybeUninit<T>]>` with the specified `len`.
+    ///
+    /// Note that this will correctly reconstitute any `cap` changes
+    /// that may have been performed. (See description of type for details.)
+    ///
+    /// # Safety
+    ///
+    /// * `len` must be greater than or equal to the most recently requested capacity, and
+    /// * `len` must be less than or equal to `self.capacity()`.
+    ///
+    /// Note, that the requested capacity and `self.capacity()` could differ, as
+    /// an allocator could overallocate and return a greater memory block than requested.
+    pub unsafe fn into_box(self, len: usize) -> Box<[MaybeUninit<T>], A> {
+        // Sanity-check one half of the safety requirement (we cannot check the other half).
+        debug_assert!(
+            len <= self.capacity(),
+            "`len` must be smaller than or equal to `self.capacity()`"
+        );
+
+        let me = ManuallyDrop::new(self);
+        unsafe {
+            let slice = slice::from_raw_parts_mut(me.ptr() as *mut MaybeUninit<T>, len);
+            Box::from_raw_in(slice, ptr::read(&me.alloc))
+        }
+    }
+
+    #[cfg(not(no_global_oom_handling))]
+    fn allocate_in(capacity: usize, init: AllocInit, alloc: A) -> Self {
+        // Don't allocate here because `Drop` will not deallocate when `capacity` is 0.
+        if mem::size_of::<T>() == 0 || capacity == 0 {
+            Self::new_in(alloc)
+        } else {
+            // We avoid `unwrap_or_else` here because it bloats the amount of
+            // LLVM IR generated.
+            let layout = match Layout::array::<T>(capacity) {
+                Ok(layout) => layout,
+                Err(_) => capacity_overflow(),
+            };
+            match alloc_guard(layout.size()) {
+                Ok(_) => {}
+                Err(_) => capacity_overflow(),
+            }
+            let result = match init {
+                AllocInit::Uninitialized => alloc.allocate(layout),
+                AllocInit::Zeroed => alloc.allocate_zeroed(layout),
+            };
+            let ptr = match result {
+                Ok(ptr) => ptr,
+                Err(_) => handle_alloc_error(layout),
+            };
+
+            // Allocators currently return a `NonNull<[u8]>` whose length
+            // matches the size requested. If that ever changes, the capacity
+            // here should change to `ptr.len() / mem::size_of::<T>()`.
+            Self {
+                ptr: unsafe { Unique::new_unchecked(ptr.cast().as_ptr()) },
+                cap: capacity,
+                alloc,
+            }
+        }
+    }
+
+    /// Reconstitutes a `RawVec` from a pointer, capacity, and allocator.
+    ///
+    /// # Safety
+    ///
+    /// The `ptr` must be allocated (via the given allocator `alloc`), and with the given
+    /// `capacity`.
+    /// The `capacity` cannot exceed `isize::MAX` for sized types. (only a concern on 32-bit
+    /// systems). ZST vectors may have a capacity up to `usize::MAX`.
+    /// If the `ptr` and `capacity` come from a `RawVec` created via `alloc`, then this is
+    /// guaranteed.
+    #[inline]
+    pub unsafe fn from_raw_parts_in(ptr: *mut T, capacity: usize, alloc: A) -> Self {
+        Self { ptr: unsafe { Unique::new_unchecked(ptr) }, cap: capacity, alloc }
+    }
+
+    /// Gets a raw pointer to the start of the allocation. Note that this is
+    /// `Unique::dangling()` if `capacity == 0` or `T` is zero-sized. In the former case, you must
+    /// be careful.
+    #[inline]
+    pub fn ptr(&self) -> *mut T {
+        self.ptr.as_ptr()
+    }
+
+    /// Gets the capacity of the allocation.
+    ///
+    /// This will always be `usize::MAX` if `T` is zero-sized.
+    #[inline(always)]
+    pub fn capacity(&self) -> usize {
+        if mem::size_of::<T>() == 0 { usize::MAX } else { self.cap }
+    }
+
+    /// Returns a shared reference to the allocator backing this `RawVec`.
+    pub fn allocator(&self) -> &A {
+        &self.alloc
+    }
+
+    fn current_memory(&self) -> Option<(NonNull<u8>, Layout)> {
+        if mem::size_of::<T>() == 0 || self.cap == 0 {
+            None
+        } else {
+            // We have an allocated chunk of memory, so we can bypass runtime
+            // checks to get our current layout.
+            unsafe {
+                let layout = Layout::array::<T>(self.cap).unwrap_unchecked();
+                Some((self.ptr.cast().into(), layout))
+            }
+        }
+    }
+
+    /// Ensures that the buffer contains at least enough space to hold `len +
+    /// additional` elements. If it doesn't already have enough capacity, will
+    /// reallocate enough space plus comfortable slack space to get amortized
+    /// *O*(1) behavior. Will limit this behavior if it would needlessly cause
+    /// itself to panic.
+    ///
+    /// If `len` exceeds `self.capacity()`, this may fail to actually allocate
+    /// the requested space. This is not really unsafe, but the unsafe
+    /// code *you* write that relies on the behavior of this function may break.
+    ///
+    /// This is ideal for implementing a bulk-push operation like `extend`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Aborts
+    ///
+    /// Aborts on OOM.
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    pub fn reserve(&mut self, len: usize, additional: usize) {
+        // Callers expect this function to be very cheap when there is already sufficient capacity.
+        // Therefore, we move all the resizing and error-handling logic from grow_amortized and
+        // handle_reserve behind a call, while making sure that this function is likely to be
+        // inlined as just a comparison and a call if the comparison fails.
+        #[cold]
+        fn do_reserve_and_handle<T, A: Allocator>(
+            slf: &mut RawVec<T, A>,
+            len: usize,
+            additional: usize,
+        ) {
+            handle_reserve(slf.grow_amortized(len, additional));
+        }
+
+        if self.needs_to_grow(len, additional) {
+            do_reserve_and_handle(self, len, additional);
+        }
+    }
+
+    /// A specialized version of `reserve()` used only by the hot and
+    /// oft-instantiated `Vec::push()`, which does its own capacity check.
+    #[cfg(not(no_global_oom_handling))]
+    #[inline(never)]
+    pub fn reserve_for_push(&mut self, len: usize) {
+        handle_reserve(self.grow_amortized(len, 1));
+    }
+
+    /// The same as `reserve`, but returns on errors instead of panicking or aborting.
+    pub fn try_reserve(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> {
+        if self.needs_to_grow(len, additional) {
+            self.grow_amortized(len, additional)
+        } else {
+            Ok(())
+        }
+    }
+
+    /// Ensures that the buffer contains at least enough space to hold `len +
+    /// additional` elements. If it doesn't already, will reallocate the
+    /// minimum possible amount of memory necessary. Generally this will be
+    /// exactly the amount of memory necessary, but in principle the allocator
+    /// is free to give back more than we asked for.
+    ///
+    /// If `len` exceeds `self.capacity()`, this may fail to actually allocate
+    /// the requested space. This is not really unsafe, but the unsafe code
+    /// *you* write that relies on the behavior of this function may break.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Aborts
+    ///
+    /// Aborts on OOM.
+    #[cfg(not(no_global_oom_handling))]
+    pub fn reserve_exact(&mut self, len: usize, additional: usize) {
+        handle_reserve(self.try_reserve_exact(len, additional));
+    }
+
+    /// The same as `reserve_exact`, but returns on errors instead of panicking or aborting.
+    pub fn try_reserve_exact(
+        &mut self,
+        len: usize,
+        additional: usize,
+    ) -> Result<(), TryReserveError> {
+        if self.needs_to_grow(len, additional) { self.grow_exact(len, additional) } else { Ok(()) }
+    }
+
+    /// Shrinks the buffer down to the specified capacity. If the given amount
+    /// is 0, actually completely deallocates.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the given amount is *larger* than the current capacity.
+    ///
+    /// # Aborts
+    ///
+    /// Aborts on OOM.
+    #[cfg(not(no_global_oom_handling))]
+    pub fn shrink_to_fit(&mut self, cap: usize) {
+        handle_reserve(self.shrink(cap));
+    }
+}
+
+impl<T, A: Allocator> RawVec<T, A> {
+    /// Returns if the buffer needs to grow to fulfill the needed extra capacity.
+    /// Mainly used to make inlining reserve-calls possible without inlining `grow`.
+    fn needs_to_grow(&self, len: usize, additional: usize) -> bool {
+        additional > self.capacity().wrapping_sub(len)
+    }
+
+    fn set_ptr_and_cap(&mut self, ptr: NonNull<[u8]>, cap: usize) {
+        // Allocators currently return a `NonNull<[u8]>` whose length matches
+        // the size requested. If that ever changes, the capacity here should
+        // change to `ptr.len() / mem::size_of::<T>()`.
+        self.ptr = unsafe { Unique::new_unchecked(ptr.cast().as_ptr()) };
+        self.cap = cap;
+    }
+
+    // This method is usually instantiated many times. So we want it to be as
+    // small as possible, to improve compile times. But we also want as much of
+    // its contents to be statically computable as possible, to make the
+    // generated code run faster. Therefore, this method is carefully written
+    // so that all of the code that depends on `T` is within it, while as much
+    // of the code that doesn't depend on `T` as possible is in functions that
+    // are non-generic over `T`.
+    fn grow_amortized(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> {
+        // This is ensured by the calling contexts.
+        debug_assert!(additional > 0);
+
+        if mem::size_of::<T>() == 0 {
+            // Since we return a capacity of `usize::MAX` when `elem_size` is
+            // 0, getting to here necessarily means the `RawVec` is overfull.
+            return Err(CapacityOverflow.into());
+        }
+
+        // Nothing we can really do about these checks, sadly.
+        let required_cap = len.checked_add(additional).ok_or(CapacityOverflow)?;
+
+        // This guarantees exponential growth. The doubling cannot overflow
+        // because `cap <= isize::MAX` and the type of `cap` is `usize`.
+        let cap = cmp::max(self.cap * 2, required_cap);
+        let cap = cmp::max(Self::MIN_NON_ZERO_CAP, cap);
+
+        let new_layout = Layout::array::<T>(cap);
+
+        // `finish_grow` is non-generic over `T`.
+        let ptr = finish_grow(new_layout, self.current_memory(), &mut self.alloc)?;
+        self.set_ptr_and_cap(ptr, cap);
+        Ok(())
+    }
+
+    // The constraints on this method are much the same as those on
+    // `grow_amortized`, but this method is usually instantiated less often so
+    // it's less critical.
+    fn grow_exact(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> {
+        if mem::size_of::<T>() == 0 {
+            // Since we return a capacity of `usize::MAX` when the type size is
+            // 0, getting to here necessarily means the `RawVec` is overfull.
+            return Err(CapacityOverflow.into());
+        }
+
+        let cap = len.checked_add(additional).ok_or(CapacityOverflow)?;
+        let new_layout = Layout::array::<T>(cap);
+
+        // `finish_grow` is non-generic over `T`.
+        let ptr = finish_grow(new_layout, self.current_memory(), &mut self.alloc)?;
+        self.set_ptr_and_cap(ptr, cap);
+        Ok(())
+    }
+
+    fn shrink(&mut self, cap: usize) -> Result<(), TryReserveError> {
+        assert!(cap <= self.capacity(), "Tried to shrink to a larger capacity");
+
+        let (ptr, layout) = if let Some(mem) = self.current_memory() { mem } else { return Ok(()) };
+
+        let ptr = unsafe {
+            // `Layout::array` cannot overflow here because it would have
+            // overflowed earlier when capacity was larger.
+            let new_layout = Layout::array::<T>(cap).unwrap_unchecked();
+            self.alloc
+                .shrink(ptr, layout, new_layout)
+                .map_err(|_| AllocError { layout: new_layout, non_exhaustive: () })?
+        };
+        self.set_ptr_and_cap(ptr, cap);
+        Ok(())
+    }
+}
+
+// This function is outside `RawVec` to minimize compile times. See the comment
+// above `RawVec::grow_amortized` for details. (The `A` parameter isn't
+// significant, because the number of different `A` types seen in practice is
+// much smaller than the number of `T` types.)
+#[inline(never)]
+fn finish_grow<A>(
+    new_layout: Result<Layout, LayoutError>,
+    current_memory: Option<(NonNull<u8>, Layout)>,
+    alloc: &mut A,
+) -> Result<NonNull<[u8]>, TryReserveError>
+where
+    A: Allocator,
+{
+    // Check for the error here to minimize the size of `RawVec::grow_*`.
+    let new_layout = new_layout.map_err(|_| CapacityOverflow)?;
+
+    alloc_guard(new_layout.size())?;
+
+    let memory = if let Some((ptr, old_layout)) = current_memory {
+        debug_assert_eq!(old_layout.align(), new_layout.align());
+        unsafe {
+            // The allocator checks for alignment equality
+            intrinsics::assume(old_layout.align() == new_layout.align());
+            alloc.grow(ptr, old_layout, new_layout)
+        }
+    } else {
+        alloc.allocate(new_layout)
+    };
+
+    memory.map_err(|_| AllocError { layout: new_layout, non_exhaustive: () }.into())
+}
+
+unsafe impl<#[may_dangle] T, A: Allocator> Drop for RawVec<T, A> {
+    /// Frees the memory owned by the `RawVec` *without* trying to drop its contents.
+    fn drop(&mut self) {
+        if let Some((ptr, layout)) = self.current_memory() {
+            unsafe { self.alloc.deallocate(ptr, layout) }
+        }
+    }
+}
+
+// Central function for reserve error handling.
+#[cfg(not(no_global_oom_handling))]
+#[inline]
+fn handle_reserve(result: Result<(), TryReserveError>) {
+    match result.map_err(|e| e.kind()) {
+        Err(CapacityOverflow) => capacity_overflow(),
+        Err(AllocError { layout, .. }) => handle_alloc_error(layout),
+        Ok(()) => { /* yay */ }
+    }
+}
+
+// We need to guarantee the following:
+// * We don't ever allocate `> isize::MAX` byte-size objects.
+// * We don't overflow `usize::MAX` and actually allocate too little.
+//
+// On 64-bit we just need to check for overflow since trying to allocate
+// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add
+// an extra guard for this in case we're running on a platform which can use
+// all 4GB in user-space, e.g., PAE or x32.
+
+#[inline]
+fn alloc_guard(alloc_size: usize) -> Result<(), TryReserveError> {
+    if usize::BITS < 64 && alloc_size > isize::MAX as usize {
+        Err(CapacityOverflow.into())
+    } else {
+        Ok(())
+    }
+}
+
+// One central function responsible for reporting capacity overflows. This'll
+// ensure that the code generation related to these panics is minimal as there's
+// only one location which panics rather than a bunch throughout the module.
+#[cfg(not(no_global_oom_handling))]
+fn capacity_overflow() -> ! {
+    panic!("capacity overflow");
+}
diff --git a/rust/alloc/slice.rs b/rust/alloc/slice.rs
new file mode 100644
index 000000000000..199b3c9d0290
--- /dev/null
+++ b/rust/alloc/slice.rs
@@ -0,0 +1,1202 @@
+//! A dynamically-sized view into a contiguous sequence, `[T]`.
+//!
+//! *[See also the slice primitive type](slice).*
+//!
+//! Slices are a view into a block of memory represented as a pointer and a
+//! length.
+//!
+//! ```
+//! // slicing a Vec
+//! let vec = vec![1, 2, 3];
+//! let int_slice = &vec[..];
+//! // coercing an array to a slice
+//! let str_slice: &[&str] = &["one", "two", "three"];
+//! ```
+//!
+//! Slices are either mutable or shared. The shared slice type is `&[T]`,
+//! while the mutable slice type is `&mut [T]`, where `T` represents the element
+//! type. For example, you can mutate the block of memory that a mutable slice
+//! points to:
+//!
+//! ```
+//! let x = &mut [1, 2, 3];
+//! x[1] = 7;
+//! assert_eq!(x, &[1, 7, 3]);
+//! ```
+//!
+//! Here are some of the things this module contains:
+//!
+//! ## Structs
+//!
+//! There are several structs that are useful for slices, such as [`Iter`], which
+//! represents iteration over a slice.
+//!
+//! ## Trait Implementations
+//!
+//! There are several implementations of common traits for slices. Some examples
+//! include:
+//!
+//! * [`Clone`]
+//! * [`Eq`], [`Ord`] - for slices whose element type are [`Eq`] or [`Ord`].
+//! * [`Hash`] - for slices whose element type is [`Hash`].
+//!
+//! ## Iteration
+//!
+//! The slices implement `IntoIterator`. The iterator yields references to the
+//! slice elements.
+//!
+//! ```
+//! let numbers = &[0, 1, 2];
+//! for n in numbers {
+//!     println!("{n} is a number!");
+//! }
+//! ```
+//!
+//! The mutable slice yields mutable references to the elements:
+//!
+//! ```
+//! let mut scores = [7, 8, 9];
+//! for score in &mut scores[..] {
+//!     *score += 1;
+//! }
+//! ```
+//!
+//! This iterator yields mutable references to the slice's elements, so while
+//! the element type of the slice is `i32`, the element type of the iterator is
+//! `&mut i32`.
+//!
+//! * [`.iter`] and [`.iter_mut`] are the explicit methods to return the default
+//!   iterators.
+//! * Further methods that return iterators are [`.split`], [`.splitn`],
+//!   [`.chunks`], [`.windows`] and more.
+//!
+//! [`Hash`]: core::hash::Hash
+//! [`.iter`]: slice::iter
+//! [`.iter_mut`]: slice::iter_mut
+//! [`.split`]: slice::split
+//! [`.splitn`]: slice::splitn
+//! [`.chunks`]: slice::chunks
+//! [`.windows`]: slice::windows
+#![stable(feature = "rust1", since = "1.0.0")]
+// Many of the usings in this module are only used in the test configuration.
+// It's cleaner to just turn off the unused_imports warning than to fix them.
+#![cfg_attr(test, allow(unused_imports, dead_code))]
+
+use core::borrow::{Borrow, BorrowMut};
+#[cfg(not(no_global_oom_handling))]
+use core::cmp::Ordering::{self, Less};
+#[cfg(not(no_global_oom_handling))]
+use core::mem;
+#[cfg(not(no_global_oom_handling))]
+use core::mem::size_of;
+#[cfg(not(no_global_oom_handling))]
+use core::ptr;
+
+use crate::alloc::Allocator;
+#[cfg(not(no_global_oom_handling))]
+use crate::alloc::Global;
+#[cfg(not(no_global_oom_handling))]
+use crate::borrow::ToOwned;
+use crate::boxed::Box;
+use crate::vec::Vec;
+
+#[unstable(feature = "slice_range", issue = "76393")]
+pub use core::slice::range;
+#[unstable(feature = "array_chunks", issue = "74985")]
+pub use core::slice::ArrayChunks;
+#[unstable(feature = "array_chunks", issue = "74985")]
+pub use core::slice::ArrayChunksMut;
+#[unstable(feature = "array_windows", issue = "75027")]
+pub use core::slice::ArrayWindows;
+#[stable(feature = "inherent_ascii_escape", since = "1.60.0")]
+pub use core::slice::EscapeAscii;
+#[stable(feature = "slice_get_slice", since = "1.28.0")]
+pub use core::slice::SliceIndex;
+#[stable(feature = "from_ref", since = "1.28.0")]
+pub use core::slice::{from_mut, from_ref};
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::slice::{from_raw_parts, from_raw_parts_mut};
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::slice::{Chunks, Windows};
+#[stable(feature = "chunks_exact", since = "1.31.0")]
+pub use core::slice::{ChunksExact, ChunksExactMut};
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::slice::{ChunksMut, Split, SplitMut};
+#[unstable(feature = "slice_group_by", issue = "80552")]
+pub use core::slice::{GroupBy, GroupByMut};
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::slice::{Iter, IterMut};
+#[stable(feature = "rchunks", since = "1.31.0")]
+pub use core::slice::{RChunks, RChunksExact, RChunksExactMut, RChunksMut};
+#[stable(feature = "slice_rsplit", since = "1.27.0")]
+pub use core::slice::{RSplit, RSplitMut};
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::slice::{RSplitN, RSplitNMut, SplitN, SplitNMut};
+#[stable(feature = "split_inclusive", since = "1.51.0")]
+pub use core::slice::{SplitInclusive, SplitInclusiveMut};
+
+////////////////////////////////////////////////////////////////////////////////
+// Basic slice extension methods
+////////////////////////////////////////////////////////////////////////////////
+
+// HACK(japaric) needed for the implementation of `vec!` macro during testing
+// N.B., see the `hack` module in this file for more details.
+#[cfg(test)]
+pub use hack::into_vec;
+
+// HACK(japaric) needed for the implementation of `Vec::clone` during testing
+// N.B., see the `hack` module in this file for more details.
+#[cfg(test)]
+pub use hack::to_vec;
+
+// HACK(japaric): With cfg(test) `impl [T]` is not available, these three
+// functions are actually methods that are in `impl [T]` but not in
+// `core::slice::SliceExt` - we need to supply these functions for the
+// `test_permutations` test
+pub(crate) mod hack {
+    use core::alloc::Allocator;
+
+    use crate::boxed::Box;
+    use crate::vec::Vec;
+
+    // We shouldn't add inline attribute to this since this is used in
+    // `vec!` macro mostly and causes perf regression. See #71204 for
+    // discussion and perf results.
+    pub fn into_vec<T, A: Allocator>(b: Box<[T], A>) -> Vec<T, A> {
+        unsafe {
+            let len = b.len();
+            let (b, alloc) = Box::into_raw_with_allocator(b);
+            Vec::from_raw_parts_in(b as *mut T, len, len, alloc)
+        }
+    }
+
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    pub fn to_vec<T: ConvertVec, A: Allocator>(s: &[T], alloc: A) -> Vec<T, A> {
+        T::to_vec(s, alloc)
+    }
+
+    #[cfg(not(no_global_oom_handling))]
+    pub trait ConvertVec {
+        fn to_vec<A: Allocator>(s: &[Self], alloc: A) -> Vec<Self, A>
+        where
+            Self: Sized;
+    }
+
+    #[cfg(not(no_global_oom_handling))]
+    impl<T: Clone> ConvertVec for T {
+        #[inline]
+        default fn to_vec<A: Allocator>(s: &[Self], alloc: A) -> Vec<Self, A> {
+            struct DropGuard<'a, T, A: Allocator> {
+                vec: &'a mut Vec<T, A>,
+                num_init: usize,
+            }
+            impl<'a, T, A: Allocator> Drop for DropGuard<'a, T, A> {
+                #[inline]
+                fn drop(&mut self) {
+                    // SAFETY:
+                    // items were marked initialized in the loop below
+                    unsafe {
+                        self.vec.set_len(self.num_init);
+                    }
+                }
+            }
+            let mut vec = Vec::with_capacity_in(s.len(), alloc);
+            let mut guard = DropGuard { vec: &mut vec, num_init: 0 };
+            let slots = guard.vec.spare_capacity_mut();
+            // .take(slots.len()) is necessary for LLVM to remove bounds checks
+            // and has better codegen than zip.
+            for (i, b) in s.iter().enumerate().take(slots.len()) {
+                guard.num_init = i;
+                slots[i].write(b.clone());
+            }
+            core::mem::forget(guard);
+            // SAFETY:
+            // the vec was allocated and initialized above to at least this length.
+            unsafe {
+                vec.set_len(s.len());
+            }
+            vec
+        }
+    }
+
+    #[cfg(not(no_global_oom_handling))]
+    impl<T: Copy> ConvertVec for T {
+        #[inline]
+        fn to_vec<A: Allocator>(s: &[Self], alloc: A) -> Vec<Self, A> {
+            let mut v = Vec::with_capacity_in(s.len(), alloc);
+            // SAFETY:
+            // allocated above with the capacity of `s`, and initialize to `s.len()` in
+            // ptr::copy_to_non_overlapping below.
+            unsafe {
+                s.as_ptr().copy_to_nonoverlapping(v.as_mut_ptr(), s.len());
+                v.set_len(s.len());
+            }
+            v
+        }
+    }
+}
+
+#[cfg(not(test))]
+impl<T> [T] {
+    /// Sorts the slice.
+    ///
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+    ///
+    /// When applicable, unstable sorting is preferred because it is generally faster than stable
+    /// sorting and it doesn't allocate auxiliary memory.
+    /// See [`sort_unstable`](slice::sort_unstable).
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is an adaptive, iterative merge sort inspired by
+    /// [timsort](https://en.wikipedia.org/wiki/Timsort).
+    /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
+    /// two or more sorted sequences concatenated one after another.
+    ///
+    /// Also, it allocates temporary storage half the size of `self`, but for short slices a
+    /// non-allocating insertion sort is used instead.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = [-5, 4, 1, -3, 2];
+    ///
+    /// v.sort();
+    /// assert!(v == [-5, -3, 1, 2, 4]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[inline]
+    pub fn sort(&mut self)
+    where
+        T: Ord,
+    {
+        merge_sort(self, |a, b| a.lt(b));
+    }
+
+    /// Sorts the slice with a comparator function.
+    ///
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+    ///
+    /// The comparator function must define a total ordering for the elements in the slice. If
+    /// the ordering is not total, the order of the elements is unspecified. An order is a
+    /// total order if it is (for all `a`, `b` and `c`):
+    ///
+    /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
+    /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
+    ///
+    /// For example, while [`f64`] doesn't implement [`Ord`] because `NaN != NaN`, we can use
+    /// `partial_cmp` as our sort function when we know the slice doesn't contain a `NaN`.
+    ///
+    /// ```
+    /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
+    /// floats.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
+    /// ```
+    ///
+    /// When applicable, unstable sorting is preferred because it is generally faster than stable
+    /// sorting and it doesn't allocate auxiliary memory.
+    /// See [`sort_unstable_by`](slice::sort_unstable_by).
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is an adaptive, iterative merge sort inspired by
+    /// [timsort](https://en.wikipedia.org/wiki/Timsort).
+    /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
+    /// two or more sorted sequences concatenated one after another.
+    ///
+    /// Also, it allocates temporary storage half the size of `self`, but for short slices a
+    /// non-allocating insertion sort is used instead.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = [5, 4, 1, 3, 2];
+    /// v.sort_by(|a, b| a.cmp(b));
+    /// assert!(v == [1, 2, 3, 4, 5]);
+    ///
+    /// // reverse sorting
+    /// v.sort_by(|a, b| b.cmp(a));
+    /// assert!(v == [5, 4, 3, 2, 1]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[inline]
+    pub fn sort_by<F>(&mut self, mut compare: F)
+    where
+        F: FnMut(&T, &T) -> Ordering,
+    {
+        merge_sort(self, |a, b| compare(a, b) == Less);
+    }
+
+    /// Sorts the slice with a key extraction function.
+    ///
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*))
+    /// worst-case, where the key function is *O*(*m*).
+    ///
+    /// For expensive key functions (e.g. functions that are not simple property accesses or
+    /// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be
+    /// significantly faster, as it does not recompute element keys.
+    ///
+    /// When applicable, unstable sorting is preferred because it is generally faster than stable
+    /// sorting and it doesn't allocate auxiliary memory.
+    /// See [`sort_unstable_by_key`](slice::sort_unstable_by_key).
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is an adaptive, iterative merge sort inspired by
+    /// [timsort](https://en.wikipedia.org/wiki/Timsort).
+    /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of
+    /// two or more sorted sequences concatenated one after another.
+    ///
+    /// Also, it allocates temporary storage half the size of `self`, but for short slices a
+    /// non-allocating insertion sort is used instead.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = [-5i32, 4, 1, -3, 2];
+    ///
+    /// v.sort_by_key(|k| k.abs());
+    /// assert!(v == [1, 2, -3, 4, -5]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "slice_sort_by_key", since = "1.7.0")]
+    #[inline]
+    pub fn sort_by_key<K, F>(&mut self, mut f: F)
+    where
+        F: FnMut(&T) -> K,
+        K: Ord,
+    {
+        merge_sort(self, |a, b| f(a).lt(&f(b)));
+    }
+
+    /// Sorts the slice with a key extraction function.
+    ///
+    /// During sorting, the key function is called at most once per element, by using
+    /// temporary storage to remember the results of key evaluation.
+    /// The order of calls to the key function is unspecified and may change in future versions
+    /// of the standard library.
+    ///
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*))
+    /// worst-case, where the key function is *O*(*m*).
+    ///
+    /// For simple key functions (e.g., functions that are property accesses or
+    /// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be
+    /// faster.
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
+    ///
+    /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
+    /// length of the slice.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = [-5i32, 4, 32, -3, 2];
+    ///
+    /// v.sort_by_cached_key(|k| k.to_string());
+    /// assert!(v == [-3, -5, 2, 32, 4]);
+    /// ```
+    ///
+    /// [pdqsort]: https://github.com/orlp/pdqsort
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")]
+    #[inline]
+    pub fn sort_by_cached_key<K, F>(&mut self, f: F)
+    where
+        F: FnMut(&T) -> K,
+        K: Ord,
+    {
+        // Helper macro for indexing our vector by the smallest possible type, to reduce allocation.
+        macro_rules! sort_by_key {
+            ($t:ty, $slice:ident, $f:ident) => {{
+                let mut indices: Vec<_> =
+                    $slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
+                // The elements of `indices` are unique, as they are indexed, so any sort will be
+                // stable with respect to the original slice. We use `sort_unstable` here because
+                // it requires less memory allocation.
+                indices.sort_unstable();
+                for i in 0..$slice.len() {
+                    let mut index = indices[i].1;
+                    while (index as usize) < i {
+                        index = indices[index as usize].1;
+                    }
+                    indices[i].1 = index;
+                    $slice.swap(i, index as usize);
+                }
+            }};
+        }
+
+        let sz_u8 = mem::size_of::<(K, u8)>();
+        let sz_u16 = mem::size_of::<(K, u16)>();
+        let sz_u32 = mem::size_of::<(K, u32)>();
+        let sz_usize = mem::size_of::<(K, usize)>();
+
+        let len = self.len();
+        if len < 2 {
+            return;
+        }
+        if sz_u8 < sz_u16 && len <= (u8::MAX as usize) {
+            return sort_by_key!(u8, self, f);
+        }
+        if sz_u16 < sz_u32 && len <= (u16::MAX as usize) {
+            return sort_by_key!(u16, self, f);
+        }
+        if sz_u32 < sz_usize && len <= (u32::MAX as usize) {
+            return sort_by_key!(u32, self, f);
+        }
+        sort_by_key!(usize, self, f)
+    }
+
+    /// Copies `self` into a new `Vec`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let s = [10, 40, 30];
+    /// let x = s.to_vec();
+    /// // Here, `s` and `x` can be modified independently.
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[rustc_conversion_suggestion]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[inline]
+    pub fn to_vec(&self) -> Vec<T>
+    where
+        T: Clone,
+    {
+        self.to_vec_in(Global)
+    }
+
+    /// Copies `self` into a new `Vec` with an allocator.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let s = [10, 40, 30];
+    /// let x = s.to_vec_in(System);
+    /// // Here, `s` and `x` can be modified independently.
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[inline]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    pub fn to_vec_in<A: Allocator>(&self, alloc: A) -> Vec<T, A>
+    where
+        T: Clone,
+    {
+        // N.B., see the `hack` module in this file for more details.
+        hack::to_vec(self, alloc)
+    }
+
+    /// Converts `self` into a vector without clones or allocation.
+    ///
+    /// The resulting vector can be converted back into a box via
+    /// `Vec<T>`'s `into_boxed_slice` method.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let s: Box<[i32]> = Box::new([10, 40, 30]);
+    /// let x = s.into_vec();
+    /// // `s` cannot be used anymore because it has been converted into `x`.
+    ///
+    /// assert_eq!(x, vec![10, 40, 30]);
+    /// ```
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[inline]
+    pub fn into_vec<A: Allocator>(self: Box<Self, A>) -> Vec<T, A> {
+        // N.B., see the `hack` module in this file for more details.
+        hack::into_vec(self)
+    }
+
+    /// Creates a vector by repeating a slice `n` times.
+    ///
+    /// # Panics
+    ///
+    /// This function will panic if the capacity would overflow.
+    ///
+    /// # Examples
+    ///
+    /// Basic usage:
+    ///
+    /// ```
+    /// assert_eq!([1, 2].repeat(3), vec![1, 2, 1, 2, 1, 2]);
+    /// ```
+    ///
+    /// A panic upon overflow:
+    ///
+    /// ```should_panic
+    /// // this will panic at runtime
+    /// b"0123456789abcdef".repeat(usize::MAX);
+    /// ```
+    #[rustc_allow_incoherent_impl]
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "repeat_generic_slice", since = "1.40.0")]
+    pub fn repeat(&self, n: usize) -> Vec<T>
+    where
+        T: Copy,
+    {
+        if n == 0 {
+            return Vec::new();
+        }
+
+        // If `n` is larger than zero, it can be split as
+        // `n = 2^expn + rem (2^expn > rem, expn >= 0, rem >= 0)`.
+        // `2^expn` is the number represented by the leftmost '1' bit of `n`,
+        // and `rem` is the remaining part of `n`.
+
+        // Using `Vec` to access `set_len()`.
+        let capacity = self.len().checked_mul(n).expect("capacity overflow");
+        let mut buf = Vec::with_capacity(capacity);
+
+        // `2^expn` repetition is done by doubling `buf` `expn`-times.
+        buf.extend(self);
+        {
+            let mut m = n >> 1;
+            // If `m > 0`, there are remaining bits up to the leftmost '1'.
+            while m > 0 {
+                // `buf.extend(buf)`:
+                unsafe {
+                    ptr::copy_nonoverlapping(
+                        buf.as_ptr(),
+                        (buf.as_mut_ptr() as *mut T).add(buf.len()),
+                        buf.len(),
+                    );
+                    // `buf` has capacity of `self.len() * n`.
+                    let buf_len = buf.len();
+                    buf.set_len(buf_len * 2);
+                }
+
+                m >>= 1;
+            }
+        }
+
+        // `rem` (`= n - 2^expn`) repetition is done by copying
+        // first `rem` repetitions from `buf` itself.
+        let rem_len = capacity - buf.len(); // `self.len() * rem`
+        if rem_len > 0 {
+            // `buf.extend(buf[0 .. rem_len])`:
+            unsafe {
+                // This is non-overlapping since `2^expn > rem`.
+                ptr::copy_nonoverlapping(
+                    buf.as_ptr(),
+                    (buf.as_mut_ptr() as *mut T).add(buf.len()),
+                    rem_len,
+                );
+                // `buf.len() + rem_len` equals to `buf.capacity()` (`= self.len() * n`).
+                buf.set_len(capacity);
+            }
+        }
+        buf
+    }
+
+    /// Flattens a slice of `T` into a single value `Self::Output`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(["hello", "world"].concat(), "helloworld");
+    /// assert_eq!([[1, 2], [3, 4]].concat(), [1, 2, 3, 4]);
+    /// ```
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn concat<Item: ?Sized>(&self) -> <Self as Concat<Item>>::Output
+    where
+        Self: Concat<Item>,
+    {
+        Concat::concat(self)
+    }
+
+    /// Flattens a slice of `T` into a single value `Self::Output`, placing a
+    /// given separator between each.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(["hello", "world"].join(" "), "hello world");
+    /// assert_eq!([[1, 2], [3, 4]].join(&0), [1, 2, 0, 3, 4]);
+    /// assert_eq!([[1, 2], [3, 4]].join(&[0, 0][..]), [1, 2, 0, 0, 3, 4]);
+    /// ```
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "rename_connect_to_join", since = "1.3.0")]
+    pub fn join<Separator>(&self, sep: Separator) -> <Self as Join<Separator>>::Output
+    where
+        Self: Join<Separator>,
+    {
+        Join::join(self, sep)
+    }
+
+    /// Flattens a slice of `T` into a single value `Self::Output`, placing a
+    /// given separator between each.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![allow(deprecated)]
+    /// assert_eq!(["hello", "world"].connect(" "), "hello world");
+    /// assert_eq!([[1, 2], [3, 4]].connect(&0), [1, 2, 0, 3, 4]);
+    /// ```
+    #[rustc_allow_incoherent_impl]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[deprecated(since = "1.3.0", note = "renamed to join")]
+    pub fn connect<Separator>(&self, sep: Separator) -> <Self as Join<Separator>>::Output
+    where
+        Self: Join<Separator>,
+    {
+        Join::join(self, sep)
+    }
+}
+
+#[cfg(not(test))]
+impl [u8] {
+    /// Returns a vector containing a copy of this slice where each byte
+    /// is mapped to its ASCII upper case equivalent.
+    ///
+    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
+    /// but non-ASCII letters are unchanged.
+    ///
+    /// To uppercase the value in-place, use [`make_ascii_uppercase`].
+    ///
+    /// [`make_ascii_uppercase`]: slice::make_ascii_uppercase
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[must_use = "this returns the uppercase bytes as a new Vec, \
+                  without modifying the original"]
+    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
+    #[inline]
+    pub fn to_ascii_uppercase(&self) -> Vec<u8> {
+        let mut me = self.to_vec();
+        me.make_ascii_uppercase();
+        me
+    }
+
+    /// Returns a vector containing a copy of this slice where each byte
+    /// is mapped to its ASCII lower case equivalent.
+    ///
+    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
+    /// but non-ASCII letters are unchanged.
+    ///
+    /// To lowercase the value in-place, use [`make_ascii_lowercase`].
+    ///
+    /// [`make_ascii_lowercase`]: slice::make_ascii_lowercase
+    #[cfg(not(no_global_oom_handling))]
+    #[rustc_allow_incoherent_impl]
+    #[must_use = "this returns the lowercase bytes as a new Vec, \
+                  without modifying the original"]
+    #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")]
+    #[inline]
+    pub fn to_ascii_lowercase(&self) -> Vec<u8> {
+        let mut me = self.to_vec();
+        me.make_ascii_lowercase();
+        me
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Extension traits for slices over specific kinds of data
+////////////////////////////////////////////////////////////////////////////////
+
+/// Helper trait for [`[T]::concat`](slice::concat).
+///
+/// Note: the `Item` type parameter is not used in this trait,
+/// but it allows impls to be more generic.
+/// Without it, we get this error:
+///
+/// ```error
+/// error[E0207]: the type parameter `T` is not constrained by the impl trait, self type, or predica
+///    --> src/liballoc/slice.rs:608:6
+///     |
+/// 608 | impl<T: Clone, V: Borrow<[T]>> Concat for [V] {
+///     |      ^ unconstrained type parameter
+/// ```
+///
+/// This is because there could exist `V` types with multiple `Borrow<[_]>` impls,
+/// such that multiple `T` types would apply:
+///
+/// ```
+/// # #[allow(dead_code)]
+/// pub struct Foo(Vec<u32>, Vec<String>);
+///
+/// impl std::borrow::Borrow<[u32]> for Foo {
+///     fn borrow(&self) -> &[u32] { &self.0 }
+/// }
+///
+/// impl std::borrow::Borrow<[String]> for Foo {
+///     fn borrow(&self) -> &[String] { &self.1 }
+/// }
+/// ```
+#[unstable(feature = "slice_concat_trait", issue = "27747")]
+pub trait Concat<Item: ?Sized> {
+    #[unstable(feature = "slice_concat_trait", issue = "27747")]
+    /// The resulting type after concatenation
+    type Output;
+
+    /// Implementation of [`[T]::concat`](slice::concat)
+    #[unstable(feature = "slice_concat_trait", issue = "27747")]
+    fn concat(slice: &Self) -> Self::Output;
+}
+
+/// Helper trait for [`[T]::join`](slice::join)
+#[unstable(feature = "slice_concat_trait", issue = "27747")]
+pub trait Join<Separator> {
+    #[unstable(feature = "slice_concat_trait", issue = "27747")]
+    /// The resulting type after concatenation
+    type Output;
+
+    /// Implementation of [`[T]::join`](slice::join)
+    #[unstable(feature = "slice_concat_trait", issue = "27747")]
+    fn join(slice: &Self, sep: Separator) -> Self::Output;
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[unstable(feature = "slice_concat_ext", issue = "27747")]
+impl<T: Clone, V: Borrow<[T]>> Concat<T> for [V] {
+    type Output = Vec<T>;
+
+    fn concat(slice: &Self) -> Vec<T> {
+        let size = slice.iter().map(|slice| slice.borrow().len()).sum();
+        let mut result = Vec::with_capacity(size);
+        for v in slice {
+            result.extend_from_slice(v.borrow())
+        }
+        result
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[unstable(feature = "slice_concat_ext", issue = "27747")]
+impl<T: Clone, V: Borrow<[T]>> Join<&T> for [V] {
+    type Output = Vec<T>;
+
+    fn join(slice: &Self, sep: &T) -> Vec<T> {
+        let mut iter = slice.iter();
+        let first = match iter.next() {
+            Some(first) => first,
+            None => return vec![],
+        };
+        let size = slice.iter().map(|v| v.borrow().len()).sum::<usize>() + slice.len() - 1;
+        let mut result = Vec::with_capacity(size);
+        result.extend_from_slice(first.borrow());
+
+        for v in iter {
+            result.push(sep.clone());
+            result.extend_from_slice(v.borrow())
+        }
+        result
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[unstable(feature = "slice_concat_ext", issue = "27747")]
+impl<T: Clone, V: Borrow<[T]>> Join<&[T]> for [V] {
+    type Output = Vec<T>;
+
+    fn join(slice: &Self, sep: &[T]) -> Vec<T> {
+        let mut iter = slice.iter();
+        let first = match iter.next() {
+            Some(first) => first,
+            None => return vec![],
+        };
+        let size =
+            slice.iter().map(|v| v.borrow().len()).sum::<usize>() + sep.len() * (slice.len() - 1);
+        let mut result = Vec::with_capacity(size);
+        result.extend_from_slice(first.borrow());
+
+        for v in iter {
+            result.extend_from_slice(sep);
+            result.extend_from_slice(v.borrow())
+        }
+        result
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Standard trait implementations for slices
+////////////////////////////////////////////////////////////////////////////////
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T> Borrow<[T]> for Vec<T> {
+    fn borrow(&self) -> &[T] {
+        &self[..]
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T> BorrowMut<[T]> for Vec<T> {
+    fn borrow_mut(&mut self) -> &mut [T] {
+        &mut self[..]
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Clone> ToOwned for [T] {
+    type Owned = Vec<T>;
+    #[cfg(not(test))]
+    fn to_owned(&self) -> Vec<T> {
+        self.to_vec()
+    }
+
+    #[cfg(test)]
+    fn to_owned(&self) -> Vec<T> {
+        hack::to_vec(self, Global)
+    }
+
+    fn clone_into(&self, target: &mut Vec<T>) {
+        // drop anything in target that will not be overwritten
+        target.truncate(self.len());
+
+        // target.len <= self.len due to the truncate above, so the
+        // slices here are always in-bounds.
+        let (init, tail) = self.split_at(target.len());
+
+        // reuse the contained values' allocations/resources.
+        target.clone_from_slice(init);
+        target.extend_from_slice(tail);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Sorting
+////////////////////////////////////////////////////////////////////////////////
+
+/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
+///
+/// This is the integral subroutine of insertion sort.
+#[cfg(not(no_global_oom_handling))]
+fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    if v.len() >= 2 && is_less(&v[1], &v[0]) {
+        unsafe {
+            // There are three ways to implement insertion here:
+            //
+            // 1. Swap adjacent elements until the first one gets to its final destination.
+            //    However, this way we copy data around more than is necessary. If elements are big
+            //    structures (costly to copy), this method will be slow.
+            //
+            // 2. Iterate until the right place for the first element is found. Then shift the
+            //    elements succeeding it to make room for it and finally place it into the
+            //    remaining hole. This is a good method.
+            //
+            // 3. Copy the first element into a temporary variable. Iterate until the right place
+            //    for it is found. As we go along, copy every traversed element into the slot
+            //    preceding it. Finally, copy data from the temporary variable into the remaining
+            //    hole. This method is very good. Benchmarks demonstrated slightly better
+            //    performance than with the 2nd method.
+            //
+            // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
+            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
+
+            // Intermediate state of the insertion process is always tracked by `hole`, which
+            // serves two purposes:
+            // 1. Protects integrity of `v` from panics in `is_less`.
+            // 2. Fills the remaining hole in `v` in the end.
+            //
+            // Panic safety:
+            //
+            // If `is_less` panics at any point during the process, `hole` will get dropped and
+            // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
+            // initially held exactly once.
+            let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] };
+            ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
+
+            for i in 2..v.len() {
+                if !is_less(&v[i], &*tmp) {
+                    break;
+                }
+                ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
+                hole.dest = &mut v[i];
+            }
+            // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
+        }
+    }
+
+    // When dropped, copies from `src` into `dest`.
+    struct InsertionHole<T> {
+        src: *const T,
+        dest: *mut T,
+    }
+
+    impl<T> Drop for InsertionHole<T> {
+        fn drop(&mut self) {
+            unsafe {
+                ptr::copy_nonoverlapping(self.src, self.dest, 1);
+            }
+        }
+    }
+}
+
+/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and
+/// stores the result into `v[..]`.
+///
+/// # Safety
+///
+/// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
+/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
+#[cfg(not(no_global_oom_handling))]
+unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    let len = v.len();
+    let v = v.as_mut_ptr();
+    let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) };
+
+    // The merge process first copies the shorter run into `buf`. Then it traces the newly copied
+    // run and the longer run forwards (or backwards), comparing their next unconsumed elements and
+    // copying the lesser (or greater) one into `v`.
+    //
+    // As soon as the shorter run is fully consumed, the process is done. If the longer run gets
+    // consumed first, then we must copy whatever is left of the shorter run into the remaining
+    // hole in `v`.
+    //
+    // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
+    // 1. Protects integrity of `v` from panics in `is_less`.
+    // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
+    //
+    // Panic safety:
+    //
+    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
+    // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
+    // object it initially held exactly once.
+    let mut hole;
+
+    if mid <= len - mid {
+        // The left run is shorter.
+        unsafe {
+            ptr::copy_nonoverlapping(v, buf, mid);
+            hole = MergeHole { start: buf, end: buf.add(mid), dest: v };
+        }
+
+        // Initially, these pointers point to the beginnings of their arrays.
+        let left = &mut hole.start;
+        let mut right = v_mid;
+        let out = &mut hole.dest;
+
+        while *left < hole.end && right < v_end {
+            // Consume the lesser side.
+            // If equal, prefer the left run to maintain stability.
+            unsafe {
+                let to_copy = if is_less(&*right, &**left) {
+                    get_and_increment(&mut right)
+                } else {
+                    get_and_increment(left)
+                };
+                ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1);
+            }
+        }
+    } else {
+        // The right run is shorter.
+        unsafe {
+            ptr::copy_nonoverlapping(v_mid, buf, len - mid);
+            hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid };
+        }
+
+        // Initially, these pointers point past the ends of their arrays.
+        let left = &mut hole.dest;
+        let right = &mut hole.end;
+        let mut out = v_end;
+
+        while v < *left && buf < *right {
+            // Consume the greater side.
+            // If equal, prefer the right run to maintain stability.
+            unsafe {
+                let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
+                    decrement_and_get(left)
+                } else {
+                    decrement_and_get(right)
+                };
+                ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1);
+            }
+        }
+    }
+    // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of
+    // it will now be copied into the hole in `v`.
+
+    unsafe fn get_and_increment<T>(ptr: &mut *mut T) -> *mut T {
+        let old = *ptr;
+        *ptr = unsafe { ptr.offset(1) };
+        old
+    }
+
+    unsafe fn decrement_and_get<T>(ptr: &mut *mut T) -> *mut T {
+        *ptr = unsafe { ptr.offset(-1) };
+        *ptr
+    }
+
+    // When dropped, copies the range `start..end` into `dest..`.
+    struct MergeHole<T> {
+        start: *mut T,
+        end: *mut T,
+        dest: *mut T,
+    }
+
+    impl<T> Drop for MergeHole<T> {
+        fn drop(&mut self) {
+            // `T` is not a zero-sized type, and these are pointers into a slice's elements.
+            unsafe {
+                let len = self.end.sub_ptr(self.start);
+                ptr::copy_nonoverlapping(self.start, self.dest, len);
+            }
+        }
+    }
+}
+
+/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
+/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
+///
+/// The algorithm identifies strictly descending and non-descending subsequences, which are called
+/// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
+/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are
+/// satisfied:
+///
+/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
+/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
+///
+/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
+#[cfg(not(no_global_oom_handling))]
+fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
+where
+    F: FnMut(&T, &T) -> bool,
+{
+    // Slices of up to this length get sorted using insertion sort.
+    const MAX_INSERTION: usize = 20;
+    // Very short runs are extended using insertion sort to span at least this many elements.
+    const MIN_RUN: usize = 10;
+
+    // Sorting has no meaningful behavior on zero-sized types.
+    if size_of::<T>() == 0 {
+        return;
+    }
+
+    let len = v.len();
+
+    // Short arrays get sorted in-place via insertion sort to avoid allocations.
+    if len <= MAX_INSERTION {
+        if len >= 2 {
+            for i in (0..len - 1).rev() {
+                insert_head(&mut v[i..], &mut is_less);
+            }
+        }
+        return;
+    }
+
+    // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
+    // shallow copies of the contents of `v` without risking the dtors running on copies if
+    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
+    // which will always have length at most `len / 2`.
+    let mut buf = Vec::with_capacity(len / 2);
+
+    // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a
+    // strange decision, but consider the fact that merges more often go in the opposite direction
+    // (forwards). According to benchmarks, merging forwards is slightly faster than merging
+    // backwards. To conclude, identifying runs by traversing backwards improves performance.
+    let mut runs = vec![];
+    let mut end = len;
+    while end > 0 {
+        // Find the next natural run, and reverse it if it's strictly descending.
+        let mut start = end - 1;
+        if start > 0 {
+            start -= 1;
+            unsafe {
+                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
+                    while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
+                    v[start..end].reverse();
+                } else {
+                    while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1))
+                    {
+                        start -= 1;
+                    }
+                }
+            }
+        }
+
+        // Insert some more elements into the run if it's too short. Insertion sort is faster than
+        // merge sort on short sequences, so this significantly improves performance.
+        while start > 0 && end - start < MIN_RUN {
+            start -= 1;
+            insert_head(&mut v[start..end], &mut is_less);
+        }
+
+        // Push this run onto the stack.
+        runs.push(Run { start, len: end - start });
+        end = start;
+
+        // Merge some pairs of adjacent runs to satisfy the invariants.
+        while let Some(r) = collapse(&runs) {
+            let left = runs[r + 1];
+            let right = runs[r];
+            unsafe {
+                merge(
+                    &mut v[left.start..right.start + right.len],
+                    left.len,
+                    buf.as_mut_ptr(),
+                    &mut is_less,
+                );
+            }
+            runs[r] = Run { start: left.start, len: left.len + right.len };
+            runs.remove(r + 1);
+        }
+    }
+
+    // Finally, exactly one run must remain in the stack.
+    debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len);
+
+    // Examines the stack of runs and identifies the next pair of runs to merge. More specifically,
+    // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the
+    // algorithm should continue building a new run instead, `None` is returned.
+    //
+    // TimSort is infamous for its buggy implementations, as described here:
+    // http://envisage-project.eu/timsort-specification-and-verification/
+    //
+    // The gist of the story is: we must enforce the invariants on the top four runs on the stack.
+    // Enforcing them on just top three is not sufficient to ensure that the invariants will still
+    // hold for *all* runs in the stack.
+    //
+    // This function correctly checks invariants for the top four runs. Additionally, if the top
+    // run starts at index 0, it will always demand a merge operation until the stack is fully
+    // collapsed, in order to complete the sort.
+    #[inline]
+    fn collapse(runs: &[Run]) -> Option<usize> {
+        let n = runs.len();
+        if n >= 2
+            && (runs[n - 1].start == 0
+                || runs[n - 2].len <= runs[n - 1].len
+                || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len)
+                || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len))
+        {
+            if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) }
+        } else {
+            None
+        }
+    }
+
+    #[derive(Clone, Copy)]
+    struct Run {
+        start: usize,
+        len: usize,
+    }
+}
diff --git a/rust/alloc/vec/drain.rs b/rust/alloc/vec/drain.rs
new file mode 100644
index 000000000000..5cdee0bd4da4
--- /dev/null
+++ b/rust/alloc/vec/drain.rs
@@ -0,0 +1,184 @@
+use crate::alloc::{Allocator, Global};
+use core::fmt;
+use core::iter::{FusedIterator, TrustedLen};
+use core::mem;
+use core::ptr::{self, NonNull};
+use core::slice::{self};
+
+use super::Vec;
+
+/// A draining iterator for `Vec<T>`.
+///
+/// This `struct` is created by [`Vec::drain`].
+/// See its documentation for more.
+///
+/// # Example
+///
+/// ```
+/// let mut v = vec![0, 1, 2];
+/// let iter: std::vec::Drain<_> = v.drain(..);
+/// ```
+#[stable(feature = "drain", since = "1.6.0")]
+pub struct Drain<
+    'a,
+    T: 'a,
+    #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator + 'a = Global,
+> {
+    /// Index of tail to preserve
+    pub(super) tail_start: usize,
+    /// Length of tail
+    pub(super) tail_len: usize,
+    /// Current remaining range to remove
+    pub(super) iter: slice::Iter<'a, T>,
+    pub(super) vec: NonNull<Vec<T, A>>,
+}
+
+#[stable(feature = "collection_debug", since = "1.17.0")]
+impl<T: fmt::Debug, A: Allocator> fmt::Debug for Drain<'_, T, A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_tuple("Drain").field(&self.iter.as_slice()).finish()
+    }
+}
+
+impl<'a, T, A: Allocator> Drain<'a, T, A> {
+    /// Returns the remaining items of this iterator as a slice.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec!['a', 'b', 'c'];
+    /// let mut drain = vec.drain(..);
+    /// assert_eq!(drain.as_slice(), &['a', 'b', 'c']);
+    /// let _ = drain.next().unwrap();
+    /// assert_eq!(drain.as_slice(), &['b', 'c']);
+    /// ```
+    #[must_use]
+    #[stable(feature = "vec_drain_as_slice", since = "1.46.0")]
+    pub fn as_slice(&self) -> &[T] {
+        self.iter.as_slice()
+    }
+
+    /// Returns a reference to the underlying allocator.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[must_use]
+    #[inline]
+    pub fn allocator(&self) -> &A {
+        unsafe { self.vec.as_ref().allocator() }
+    }
+}
+
+#[stable(feature = "vec_drain_as_slice", since = "1.46.0")]
+impl<'a, T, A: Allocator> AsRef<[T]> for Drain<'a, T, A> {
+    fn as_ref(&self) -> &[T] {
+        self.as_slice()
+    }
+}
+
+#[stable(feature = "drain", since = "1.6.0")]
+unsafe impl<T: Sync, A: Sync + Allocator> Sync for Drain<'_, T, A> {}
+#[stable(feature = "drain", since = "1.6.0")]
+unsafe impl<T: Send, A: Send + Allocator> Send for Drain<'_, T, A> {}
+
+#[stable(feature = "drain", since = "1.6.0")]
+impl<T, A: Allocator> Iterator for Drain<'_, T, A> {
+    type Item = T;
+
+    #[inline]
+    fn next(&mut self) -> Option<T> {
+        self.iter.next().map(|elt| unsafe { ptr::read(elt as *const _) })
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.iter.size_hint()
+    }
+}
+
+#[stable(feature = "drain", since = "1.6.0")]
+impl<T, A: Allocator> DoubleEndedIterator for Drain<'_, T, A> {
+    #[inline]
+    fn next_back(&mut self) -> Option<T> {
+        self.iter.next_back().map(|elt| unsafe { ptr::read(elt as *const _) })
+    }
+}
+
+#[stable(feature = "drain", since = "1.6.0")]
+impl<T, A: Allocator> Drop for Drain<'_, T, A> {
+    fn drop(&mut self) {
+        /// Moves back the un-`Drain`ed elements to restore the original `Vec`.
+        struct DropGuard<'r, 'a, T, A: Allocator>(&'r mut Drain<'a, T, A>);
+
+        impl<'r, 'a, T, A: Allocator> Drop for DropGuard<'r, 'a, T, A> {
+            fn drop(&mut self) {
+                if self.0.tail_len > 0 {
+                    unsafe {
+                        let source_vec = self.0.vec.as_mut();
+                        // memmove back untouched tail, update to new length
+                        let start = source_vec.len();
+                        let tail = self.0.tail_start;
+                        if tail != start {
+                            let src = source_vec.as_ptr().add(tail);
+                            let dst = source_vec.as_mut_ptr().add(start);
+                            ptr::copy(src, dst, self.0.tail_len);
+                        }
+                        source_vec.set_len(start + self.0.tail_len);
+                    }
+                }
+            }
+        }
+
+        let iter = mem::replace(&mut self.iter, (&mut []).iter());
+        let drop_len = iter.len();
+
+        let mut vec = self.vec;
+
+        if mem::size_of::<T>() == 0 {
+            // ZSTs have no identity, so we don't need to move them around, we only need to drop the correct amount.
+            // this can be achieved by manipulating the Vec length instead of moving values out from `iter`.
+            unsafe {
+                let vec = vec.as_mut();
+                let old_len = vec.len();
+                vec.set_len(old_len + drop_len + self.tail_len);
+                vec.truncate(old_len + self.tail_len);
+            }
+
+            return;
+        }
+
+        // ensure elements are moved back into their appropriate places, even when drop_in_place panics
+        let _guard = DropGuard(self);
+
+        if drop_len == 0 {
+            return;
+        }
+
+        // as_slice() must only be called when iter.len() is > 0 because
+        // vec::Splice modifies vec::Drain fields and may grow the vec which would invalidate
+        // the iterator's internal pointers. Creating a reference to deallocated memory
+        // is invalid even when it is zero-length
+        let drop_ptr = iter.as_slice().as_ptr();
+
+        unsafe {
+            // drop_ptr comes from a slice::Iter which only gives us a &[T] but for drop_in_place
+            // a pointer with mutable provenance is necessary. Therefore we must reconstruct
+            // it from the original vec but also avoid creating a &mut to the front since that could
+            // invalidate raw pointers to it which some unsafe code might rely on.
+            let vec_ptr = vec.as_mut().as_mut_ptr();
+            let drop_offset = drop_ptr.sub_ptr(vec_ptr);
+            let to_drop = ptr::slice_from_raw_parts_mut(vec_ptr.add(drop_offset), drop_len);
+            ptr::drop_in_place(to_drop);
+        }
+    }
+}
+
+#[stable(feature = "drain", since = "1.6.0")]
+impl<T, A: Allocator> ExactSizeIterator for Drain<'_, T, A> {
+    fn is_empty(&self) -> bool {
+        self.iter.is_empty()
+    }
+}
+
+#[unstable(feature = "trusted_len", issue = "37572")]
+unsafe impl<T, A: Allocator> TrustedLen for Drain<'_, T, A> {}
+
+#[stable(feature = "fused", since = "1.26.0")]
+impl<T, A: Allocator> FusedIterator for Drain<'_, T, A> {}
diff --git a/rust/alloc/vec/drain_filter.rs b/rust/alloc/vec/drain_filter.rs
new file mode 100644
index 000000000000..3c37c92ae44b
--- /dev/null
+++ b/rust/alloc/vec/drain_filter.rs
@@ -0,0 +1,143 @@
+use crate::alloc::{Allocator, Global};
+use core::ptr::{self};
+use core::slice::{self};
+
+use super::Vec;
+
+/// An iterator which uses a closure to determine if an element should be removed.
+///
+/// This struct is created by [`Vec::drain_filter`].
+/// See its documentation for more.
+///
+/// # Example
+///
+/// ```
+/// #![feature(drain_filter)]
+///
+/// let mut v = vec![0, 1, 2];
+/// let iter: std::vec::DrainFilter<_, _> = v.drain_filter(|x| *x % 2 == 0);
+/// ```
+#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
+#[derive(Debug)]
+pub struct DrainFilter<
+    'a,
+    T,
+    F,
+    #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global,
+> where
+    F: FnMut(&mut T) -> bool,
+{
+    pub(super) vec: &'a mut Vec<T, A>,
+    /// The index of the item that will be inspected by the next call to `next`.
+    pub(super) idx: usize,
+    /// The number of items that have been drained (removed) thus far.
+    pub(super) del: usize,
+    /// The original length of `vec` prior to draining.
+    pub(super) old_len: usize,
+    /// The filter test predicate.
+    pub(super) pred: F,
+    /// A flag that indicates a panic has occurred in the filter test predicate.
+    /// This is used as a hint in the drop implementation to prevent consumption
+    /// of the remainder of the `DrainFilter`. Any unprocessed items will be
+    /// backshifted in the `vec`, but no further items will be dropped or
+    /// tested by the filter predicate.
+    pub(super) panic_flag: bool,
+}
+
+impl<T, F, A: Allocator> DrainFilter<'_, T, F, A>
+where
+    F: FnMut(&mut T) -> bool,
+{
+    /// Returns a reference to the underlying allocator.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn allocator(&self) -> &A {
+        self.vec.allocator()
+    }
+}
+
+#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
+impl<T, F, A: Allocator> Iterator for DrainFilter<'_, T, F, A>
+where
+    F: FnMut(&mut T) -> bool,
+{
+    type Item = T;
+
+    fn next(&mut self) -> Option<T> {
+        unsafe {
+            while self.idx < self.old_len {
+                let i = self.idx;
+                let v = slice::from_raw_parts_mut(self.vec.as_mut_ptr(), self.old_len);
+                self.panic_flag = true;
+                let drained = (self.pred)(&mut v[i]);
+                self.panic_flag = false;
+                // Update the index *after* the predicate is called. If the index
+                // is updated prior and the predicate panics, the element at this
+                // index would be leaked.
+                self.idx += 1;
+                if drained {
+                    self.del += 1;
+                    return Some(ptr::read(&v[i]));
+                } else if self.del > 0 {
+                    let del = self.del;
+                    let src: *const T = &v[i];
+                    let dst: *mut T = &mut v[i - del];
+                    ptr::copy_nonoverlapping(src, dst, 1);
+                }
+            }
+            None
+        }
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        (0, Some(self.old_len - self.idx))
+    }
+}
+
+#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
+impl<T, F, A: Allocator> Drop for DrainFilter<'_, T, F, A>
+where
+    F: FnMut(&mut T) -> bool,
+{
+    fn drop(&mut self) {
+        struct BackshiftOnDrop<'a, 'b, T, F, A: Allocator>
+        where
+            F: FnMut(&mut T) -> bool,
+        {
+            drain: &'b mut DrainFilter<'a, T, F, A>,
+        }
+
+        impl<'a, 'b, T, F, A: Allocator> Drop for BackshiftOnDrop<'a, 'b, T, F, A>
+        where
+            F: FnMut(&mut T) -> bool,
+        {
+            fn drop(&mut self) {
+                unsafe {
+                    if self.drain.idx < self.drain.old_len && self.drain.del > 0 {
+                        // This is a pretty messed up state, and there isn't really an
+                        // obviously right thing to do. We don't want to keep trying
+                        // to execute `pred`, so we just backshift all the unprocessed
+                        // elements and tell the vec that they still exist. The backshift
+                        // is required to prevent a double-drop of the last successfully
+                        // drained item prior to a panic in the predicate.
+                        let ptr = self.drain.vec.as_mut_ptr();
+                        let src = ptr.add(self.drain.idx);
+                        let dst = src.sub(self.drain.del);
+                        let tail_len = self.drain.old_len - self.drain.idx;
+                        src.copy_to(dst, tail_len);
+                    }
+                    self.drain.vec.set_len(self.drain.old_len - self.drain.del);
+                }
+            }
+        }
+
+        let backshift = BackshiftOnDrop { drain: self };
+
+        // Attempt to consume any remaining elements if the filter predicate
+        // has not yet panicked. We'll backshift any remaining elements
+        // whether we've already panicked or if the consumption here panics.
+        if !backshift.drain.panic_flag {
+            backshift.drain.for_each(drop);
+        }
+    }
+}
diff --git a/rust/alloc/vec/into_iter.rs b/rust/alloc/vec/into_iter.rs
new file mode 100644
index 000000000000..9b84a1d9b4b6
--- /dev/null
+++ b/rust/alloc/vec/into_iter.rs
@@ -0,0 +1,362 @@
+#[cfg(not(no_global_oom_handling))]
+use super::AsVecIntoIter;
+use crate::alloc::{Allocator, Global};
+use crate::raw_vec::RawVec;
+use core::fmt;
+use core::intrinsics::arith_offset;
+use core::iter::{
+    FusedIterator, InPlaceIterable, SourceIter, TrustedLen, TrustedRandomAccessNoCoerce,
+};
+use core::marker::PhantomData;
+use core::mem::{self, ManuallyDrop};
+use core::ops::Deref;
+use core::ptr::{self, NonNull};
+use core::slice::{self};
+
+/// An iterator that moves out of a vector.
+///
+/// This `struct` is created by the `into_iter` method on [`Vec`](super::Vec)
+/// (provided by the [`IntoIterator`] trait).
+///
+/// # Example
+///
+/// ```
+/// let v = vec![0, 1, 2];
+/// let iter: std::vec::IntoIter<_> = v.into_iter();
+/// ```
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_insignificant_dtor]
+pub struct IntoIter<
+    T,
+    #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global,
+> {
+    pub(super) buf: NonNull<T>,
+    pub(super) phantom: PhantomData<T>,
+    pub(super) cap: usize,
+    // the drop impl reconstructs a RawVec from buf, cap and alloc
+    // to avoid dropping the allocator twice we need to wrap it into ManuallyDrop
+    pub(super) alloc: ManuallyDrop<A>,
+    pub(super) ptr: *const T,
+    pub(super) end: *const T,
+}
+
+#[stable(feature = "vec_intoiter_debug", since = "1.13.0")]
+impl<T: fmt::Debug, A: Allocator> fmt::Debug for IntoIter<T, A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_tuple("IntoIter").field(&self.as_slice()).finish()
+    }
+}
+
+impl<T, A: Allocator> IntoIter<T, A> {
+    /// Returns the remaining items of this iterator as a slice.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let vec = vec!['a', 'b', 'c'];
+    /// let mut into_iter = vec.into_iter();
+    /// assert_eq!(into_iter.as_slice(), &['a', 'b', 'c']);
+    /// let _ = into_iter.next().unwrap();
+    /// assert_eq!(into_iter.as_slice(), &['b', 'c']);
+    /// ```
+    #[stable(feature = "vec_into_iter_as_slice", since = "1.15.0")]
+    pub fn as_slice(&self) -> &[T] {
+        unsafe { slice::from_raw_parts(self.ptr, self.len()) }
+    }
+
+    /// Returns the remaining items of this iterator as a mutable slice.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let vec = vec!['a', 'b', 'c'];
+    /// let mut into_iter = vec.into_iter();
+    /// assert_eq!(into_iter.as_slice(), &['a', 'b', 'c']);
+    /// into_iter.as_mut_slice()[2] = 'z';
+    /// assert_eq!(into_iter.next().unwrap(), 'a');
+    /// assert_eq!(into_iter.next().unwrap(), 'b');
+    /// assert_eq!(into_iter.next().unwrap(), 'z');
+    /// ```
+    #[stable(feature = "vec_into_iter_as_slice", since = "1.15.0")]
+    pub fn as_mut_slice(&mut self) -> &mut [T] {
+        unsafe { &mut *self.as_raw_mut_slice() }
+    }
+
+    /// Returns a reference to the underlying allocator.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn allocator(&self) -> &A {
+        &self.alloc
+    }
+
+    fn as_raw_mut_slice(&mut self) -> *mut [T] {
+        ptr::slice_from_raw_parts_mut(self.ptr as *mut T, self.len())
+    }
+
+    /// Drops remaining elements and relinquishes the backing allocation.
+    ///
+    /// This is roughly equivalent to the following, but more efficient
+    ///
+    /// ```
+    /// # let mut into_iter = Vec::<u8>::with_capacity(10).into_iter();
+    /// (&mut into_iter).for_each(core::mem::drop);
+    /// unsafe { core::ptr::write(&mut into_iter, Vec::new().into_iter()); }
+    /// ```
+    ///
+    /// This method is used by in-place iteration, refer to the vec::in_place_collect
+    /// documentation for an overview.
+    #[cfg(not(no_global_oom_handling))]
+    pub(super) fn forget_allocation_drop_remaining(&mut self) {
+        let remaining = self.as_raw_mut_slice();
+
+        // overwrite the individual fields instead of creating a new
+        // struct and then overwriting &mut self.
+        // this creates less assembly
+        self.cap = 0;
+        self.buf = unsafe { NonNull::new_unchecked(RawVec::NEW.ptr()) };
+        self.ptr = self.buf.as_ptr();
+        self.end = self.buf.as_ptr();
+
+        unsafe {
+            ptr::drop_in_place(remaining);
+        }
+    }
+
+    /// Forgets to Drop the remaining elements while still allowing the backing allocation to be freed.
+    pub(crate) fn forget_remaining_elements(&mut self) {
+        self.ptr = self.end;
+    }
+}
+
+#[stable(feature = "vec_intoiter_as_ref", since = "1.46.0")]
+impl<T, A: Allocator> AsRef<[T]> for IntoIter<T, A> {
+    fn as_ref(&self) -> &[T] {
+        self.as_slice()
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+unsafe impl<T: Send, A: Allocator + Send> Send for IntoIter<T, A> {}
+#[stable(feature = "rust1", since = "1.0.0")]
+unsafe impl<T: Sync, A: Allocator + Sync> Sync for IntoIter<T, A> {}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> Iterator for IntoIter<T, A> {
+    type Item = T;
+
+    #[inline]
+    fn next(&mut self) -> Option<T> {
+        if self.ptr as *const _ == self.end {
+            None
+        } else if mem::size_of::<T>() == 0 {
+            // purposefully don't use 'ptr.offset' because for
+            // vectors with 0-size elements this would return the
+            // same pointer.
+            self.ptr = unsafe { arith_offset(self.ptr as *const i8, 1) as *mut T };
+
+            // Make up a value of this ZST.
+            Some(unsafe { mem::zeroed() })
+        } else {
+            let old = self.ptr;
+            self.ptr = unsafe { self.ptr.offset(1) };
+
+            Some(unsafe { ptr::read(old) })
+        }
+    }
+
+    #[inline]
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let exact = if mem::size_of::<T>() == 0 {
+            self.end.addr().wrapping_sub(self.ptr.addr())
+        } else {
+            unsafe { self.end.sub_ptr(self.ptr) }
+        };
+        (exact, Some(exact))
+    }
+
+    #[inline]
+    fn advance_by(&mut self, n: usize) -> Result<(), usize> {
+        let step_size = self.len().min(n);
+        let to_drop = ptr::slice_from_raw_parts_mut(self.ptr as *mut T, step_size);
+        if mem::size_of::<T>() == 0 {
+            // SAFETY: due to unchecked casts of unsigned amounts to signed offsets the wraparound
+            // effectively results in unsigned pointers representing positions 0..usize::MAX,
+            // which is valid for ZSTs.
+            self.ptr = unsafe { arith_offset(self.ptr as *const i8, step_size as isize) as *mut T }
+        } else {
+            // SAFETY: the min() above ensures that step_size is in bounds
+            self.ptr = unsafe { self.ptr.add(step_size) };
+        }
+        // SAFETY: the min() above ensures that step_size is in bounds
+        unsafe {
+            ptr::drop_in_place(to_drop);
+        }
+        if step_size < n {
+            return Err(step_size);
+        }
+        Ok(())
+    }
+
+    #[inline]
+    fn count(self) -> usize {
+        self.len()
+    }
+
+    unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item
+    where
+        Self: TrustedRandomAccessNoCoerce,
+    {
+        // SAFETY: the caller must guarantee that `i` is in bounds of the
+        // `Vec<T>`, so `i` cannot overflow an `isize`, and the `self.ptr.add(i)`
+        // is guaranteed to pointer to an element of the `Vec<T>` and
+        // thus guaranteed to be valid to dereference.
+        //
+        // Also note the implementation of `Self: TrustedRandomAccess` requires
+        // that `T: Copy` so reading elements from the buffer doesn't invalidate
+        // them for `Drop`.
+        unsafe {
+            if mem::size_of::<T>() == 0 { mem::zeroed() } else { ptr::read(self.ptr.add(i)) }
+        }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> DoubleEndedIterator for IntoIter<T, A> {
+    #[inline]
+    fn next_back(&mut self) -> Option<T> {
+        if self.end == self.ptr {
+            None
+        } else if mem::size_of::<T>() == 0 {
+            // See above for why 'ptr.offset' isn't used
+            self.end = unsafe { arith_offset(self.end as *const i8, -1) as *mut T };
+
+            // Make up a value of this ZST.
+            Some(unsafe { mem::zeroed() })
+        } else {
+            self.end = unsafe { self.end.offset(-1) };
+
+            Some(unsafe { ptr::read(self.end) })
+        }
+    }
+
+    #[inline]
+    fn advance_back_by(&mut self, n: usize) -> Result<(), usize> {
+        let step_size = self.len().min(n);
+        if mem::size_of::<T>() == 0 {
+            // SAFETY: same as for advance_by()
+            self.end = unsafe {
+                arith_offset(self.end as *const i8, step_size.wrapping_neg() as isize) as *mut T
+            }
+        } else {
+            // SAFETY: same as for advance_by()
+            self.end = unsafe { self.end.offset(step_size.wrapping_neg() as isize) };
+        }
+        let to_drop = ptr::slice_from_raw_parts_mut(self.end as *mut T, step_size);
+        // SAFETY: same as for advance_by()
+        unsafe {
+            ptr::drop_in_place(to_drop);
+        }
+        if step_size < n {
+            return Err(step_size);
+        }
+        Ok(())
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> ExactSizeIterator for IntoIter<T, A> {
+    fn is_empty(&self) -> bool {
+        self.ptr == self.end
+    }
+}
+
+#[stable(feature = "fused", since = "1.26.0")]
+impl<T, A: Allocator> FusedIterator for IntoIter<T, A> {}
+
+#[unstable(feature = "trusted_len", issue = "37572")]
+unsafe impl<T, A: Allocator> TrustedLen for IntoIter<T, A> {}
+
+#[doc(hidden)]
+#[unstable(issue = "none", feature = "std_internals")]
+#[rustc_unsafe_specialization_marker]
+pub trait NonDrop {}
+
+// T: Copy as approximation for !Drop since get_unchecked does not advance self.ptr
+// and thus we can't implement drop-handling
+#[unstable(issue = "none", feature = "std_internals")]
+impl<T: Copy> NonDrop for T {}
+
+#[doc(hidden)]
+#[unstable(issue = "none", feature = "std_internals")]
+// TrustedRandomAccess (without NoCoerce) must not be implemented because
+// subtypes/supertypes of `T` might not be `NonDrop`
+unsafe impl<T, A: Allocator> TrustedRandomAccessNoCoerce for IntoIter<T, A>
+where
+    T: NonDrop,
+{
+    const MAY_HAVE_SIDE_EFFECT: bool = false;
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "vec_into_iter_clone", since = "1.8.0")]
+impl<T: Clone, A: Allocator + Clone> Clone for IntoIter<T, A> {
+    #[cfg(not(test))]
+    fn clone(&self) -> Self {
+        self.as_slice().to_vec_in(self.alloc.deref().clone()).into_iter()
+    }
+    #[cfg(test)]
+    fn clone(&self) -> Self {
+        crate::slice::to_vec(self.as_slice(), self.alloc.deref().clone()).into_iter()
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+unsafe impl<#[may_dangle] T, A: Allocator> Drop for IntoIter<T, A> {
+    fn drop(&mut self) {
+        struct DropGuard<'a, T, A: Allocator>(&'a mut IntoIter<T, A>);
+
+        impl<T, A: Allocator> Drop for DropGuard<'_, T, A> {
+            fn drop(&mut self) {
+                unsafe {
+                    // `IntoIter::alloc` is not used anymore after this and will be dropped by RawVec
+                    let alloc = ManuallyDrop::take(&mut self.0.alloc);
+                    // RawVec handles deallocation
+                    let _ = RawVec::from_raw_parts_in(self.0.buf.as_ptr(), self.0.cap, alloc);
+                }
+            }
+        }
+
+        let guard = DropGuard(self);
+        // destroy the remaining elements
+        unsafe {
+            ptr::drop_in_place(guard.0.as_raw_mut_slice());
+        }
+        // now `guard` will be dropped and do the rest
+    }
+}
+
+// In addition to the SAFETY invariants of the following three unsafe traits
+// also refer to the vec::in_place_collect module documentation to get an overview
+#[unstable(issue = "none", feature = "inplace_iteration")]
+#[doc(hidden)]
+unsafe impl<T, A: Allocator> InPlaceIterable for IntoIter<T, A> {}
+
+#[unstable(issue = "none", feature = "inplace_iteration")]
+#[doc(hidden)]
+unsafe impl<T, A: Allocator> SourceIter for IntoIter<T, A> {
+    type Source = Self;
+
+    #[inline]
+    unsafe fn as_inner(&mut self) -> &mut Self::Source {
+        self
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+unsafe impl<T> AsVecIntoIter for IntoIter<T> {
+    type Item = T;
+
+    fn as_into_iter(&mut self) -> &mut IntoIter<Self::Item> {
+        self
+    }
+}
diff --git a/rust/alloc/vec/is_zero.rs b/rust/alloc/vec/is_zero.rs
new file mode 100644
index 000000000000..edf270db81d4
--- /dev/null
+++ b/rust/alloc/vec/is_zero.rs
@@ -0,0 +1,118 @@
+use crate::boxed::Box;
+
+#[rustc_specialization_trait]
+pub(super) unsafe trait IsZero {
+    /// Whether this value's representation is all zeros
+    fn is_zero(&self) -> bool;
+}
+
+macro_rules! impl_is_zero {
+    ($t:ty, $is_zero:expr) => {
+        unsafe impl IsZero for $t {
+            #[inline]
+            fn is_zero(&self) -> bool {
+                $is_zero(*self)
+            }
+        }
+    };
+}
+
+impl_is_zero!(i16, |x| x == 0);
+impl_is_zero!(i32, |x| x == 0);
+impl_is_zero!(i64, |x| x == 0);
+impl_is_zero!(i128, |x| x == 0);
+impl_is_zero!(isize, |x| x == 0);
+
+impl_is_zero!(u16, |x| x == 0);
+impl_is_zero!(u32, |x| x == 0);
+impl_is_zero!(u64, |x| x == 0);
+impl_is_zero!(u128, |x| x == 0);
+impl_is_zero!(usize, |x| x == 0);
+
+impl_is_zero!(bool, |x| x == false);
+impl_is_zero!(char, |x| x == '\0');
+
+impl_is_zero!(f32, |x: f32| x.to_bits() == 0);
+impl_is_zero!(f64, |x: f64| x.to_bits() == 0);
+
+unsafe impl<T> IsZero for *const T {
+    #[inline]
+    fn is_zero(&self) -> bool {
+        (*self).is_null()
+    }
+}
+
+unsafe impl<T> IsZero for *mut T {
+    #[inline]
+    fn is_zero(&self) -> bool {
+        (*self).is_null()
+    }
+}
+
+unsafe impl<T: IsZero, const N: usize> IsZero for [T; N] {
+    #[inline]
+    fn is_zero(&self) -> bool {
+        // Because this is generated as a runtime check, it's not obvious that
+        // it's worth doing if the array is really long.  The threshold here
+        // is largely arbitrary, but was picked because as of 2022-05-01 LLVM
+        // can const-fold the check in `vec![[0; 32]; n]` but not in
+        // `vec![[0; 64]; n]`: https://godbolt.org/z/WTzjzfs5b
+        // Feel free to tweak if you have better evidence.
+
+        N <= 32 && self.iter().all(IsZero::is_zero)
+    }
+}
+
+// `Option<&T>` and `Option<Box<T>>` are guaranteed to represent `None` as null.
+// For fat pointers, the bytes that would be the pointer metadata in the `Some`
+// variant are padding in the `None` variant, so ignoring them and
+// zero-initializing instead is ok.
+// `Option<&mut T>` never implements `Clone`, so there's no need for an impl of
+// `SpecFromElem`.
+
+unsafe impl<T: ?Sized> IsZero for Option<&T> {
+    #[inline]
+    fn is_zero(&self) -> bool {
+        self.is_none()
+    }
+}
+
+unsafe impl<T: ?Sized> IsZero for Option<Box<T>> {
+    #[inline]
+    fn is_zero(&self) -> bool {
+        self.is_none()
+    }
+}
+
+// `Option<num::NonZeroU32>` and similar have a representation guarantee that
+// they're the same size as the corresponding `u32` type, as well as a guarantee
+// that transmuting between `NonZeroU32` and `Option<num::NonZeroU32>` works.
+// While the documentation officially makes it UB to transmute from `None`,
+// we're the standard library so we can make extra inferences, and we know that
+// the only niche available to represent `None` is the one that's all zeros.
+
+macro_rules! impl_is_zero_option_of_nonzero {
+    ($($t:ident,)+) => {$(
+        unsafe impl IsZero for Option<core::num::$t> {
+            #[inline]
+            fn is_zero(&self) -> bool {
+                self.is_none()
+            }
+        }
+    )+};
+}
+
+impl_is_zero_option_of_nonzero!(
+    NonZeroU8,
+    NonZeroU16,
+    NonZeroU32,
+    NonZeroU64,
+    NonZeroU128,
+    NonZeroI8,
+    NonZeroI16,
+    NonZeroI32,
+    NonZeroI64,
+    NonZeroI128,
+    NonZeroUsize,
+    NonZeroIsize,
+);
diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs
new file mode 100644
index 000000000000..3dc8a4fbba86
--- /dev/null
+++ b/rust/alloc/vec/mod.rs
@@ -0,0 +1,3115 @@
+//! A contiguous growable array type with heap-allocated contents, written
+//! `Vec<T>`.
+//!
+//! Vectors have *O*(1) indexing, amortized *O*(1) push (to the end) and
+//! *O*(1) pop (from the end).
+//!
+//! Vectors ensure they never allocate more than `isize::MAX` bytes.
+//!
+//! # Examples
+//!
+//! You can explicitly create a [`Vec`] with [`Vec::new`]:
+//!
+//! ```
+//! let v: Vec<i32> = Vec::new();
+//! ```
+//!
+//! ...or by using the [`vec!`] macro:
+//!
+//! ```
+//! let v: Vec<i32> = vec![];
+//!
+//! let v = vec![1, 2, 3, 4, 5];
+//!
+//! let v = vec![0; 10]; // ten zeroes
+//! ```
+//!
+//! You can [`push`] values onto the end of a vector (which will grow the vector
+//! as needed):
+//!
+//! ```
+//! let mut v = vec![1, 2];
+//!
+//! v.push(3);
+//! ```
+//!
+//! Popping values works in much the same way:
+//!
+//! ```
+//! let mut v = vec![1, 2];
+//!
+//! let two = v.pop();
+//! ```
+//!
+//! Vectors also support indexing (through the [`Index`] and [`IndexMut`] traits):
+//!
+//! ```
+//! let mut v = vec![1, 2, 3];
+//! let three = v[2];
+//! v[1] = v[1] + 5;
+//! ```
+//!
+//! [`push`]: Vec::push
+
+#![stable(feature = "rust1", since = "1.0.0")]
+
+#[cfg(not(no_global_oom_handling))]
+use core::cmp;
+use core::cmp::Ordering;
+use core::convert::TryFrom;
+use core::fmt;
+use core::hash::{Hash, Hasher};
+use core::intrinsics::{arith_offset, assume};
+use core::iter;
+#[cfg(not(no_global_oom_handling))]
+use core::iter::FromIterator;
+use core::marker::PhantomData;
+use core::mem::{self, ManuallyDrop, MaybeUninit};
+use core::ops::{self, Index, IndexMut, Range, RangeBounds};
+use core::ptr::{self, NonNull};
+use core::slice::{self, SliceIndex};
+
+use crate::alloc::{Allocator, Global};
+use crate::borrow::{Cow, ToOwned};
+use crate::boxed::Box;
+use crate::collections::TryReserveError;
+use crate::raw_vec::RawVec;
+
+#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
+pub use self::drain_filter::DrainFilter;
+
+mod drain_filter;
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "vec_splice", since = "1.21.0")]
+pub use self::splice::Splice;
+
+#[cfg(not(no_global_oom_handling))]
+mod splice;
+
+#[stable(feature = "drain", since = "1.6.0")]
+pub use self::drain::Drain;
+
+mod drain;
+
+#[cfg(not(no_global_oom_handling))]
+mod cow;
+
+#[cfg(not(no_global_oom_handling))]
+pub(crate) use self::in_place_collect::AsVecIntoIter;
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use self::into_iter::IntoIter;
+
+mod into_iter;
+
+#[cfg(not(no_global_oom_handling))]
+use self::is_zero::IsZero;
+
+mod is_zero;
+
+#[cfg(not(no_global_oom_handling))]
+mod in_place_collect;
+
+mod partial_eq;
+
+#[cfg(not(no_global_oom_handling))]
+use self::spec_from_elem::SpecFromElem;
+
+#[cfg(not(no_global_oom_handling))]
+mod spec_from_elem;
+
+#[cfg(not(no_global_oom_handling))]
+use self::set_len_on_drop::SetLenOnDrop;
+
+#[cfg(not(no_global_oom_handling))]
+mod set_len_on_drop;
+
+#[cfg(not(no_global_oom_handling))]
+use self::in_place_drop::InPlaceDrop;
+
+#[cfg(not(no_global_oom_handling))]
+mod in_place_drop;
+
+#[cfg(not(no_global_oom_handling))]
+use self::spec_from_iter_nested::SpecFromIterNested;
+
+#[cfg(not(no_global_oom_handling))]
+mod spec_from_iter_nested;
+
+#[cfg(not(no_global_oom_handling))]
+use self::spec_from_iter::SpecFromIter;
+
+#[cfg(not(no_global_oom_handling))]
+mod spec_from_iter;
+
+#[cfg(not(no_global_oom_handling))]
+use self::spec_extend::SpecExtend;
+
+#[cfg(not(no_global_oom_handling))]
+mod spec_extend;
+
+/// A contiguous growable array type, written as `Vec<T>`, short for 'vector'.
+///
+/// # Examples
+///
+/// ```
+/// let mut vec = Vec::new();
+/// vec.push(1);
+/// vec.push(2);
+///
+/// assert_eq!(vec.len(), 2);
+/// assert_eq!(vec[0], 1);
+///
+/// assert_eq!(vec.pop(), Some(2));
+/// assert_eq!(vec.len(), 1);
+///
+/// vec[0] = 7;
+/// assert_eq!(vec[0], 7);
+///
+/// vec.extend([1, 2, 3].iter().copied());
+///
+/// for x in &vec {
+///     println!("{x}");
+/// }
+/// assert_eq!(vec, [7, 1, 2, 3]);
+/// ```
+///
+/// The [`vec!`] macro is provided for convenient initialization:
+///
+/// ```
+/// let mut vec1 = vec![1, 2, 3];
+/// vec1.push(4);
+/// let vec2 = Vec::from([1, 2, 3, 4]);
+/// assert_eq!(vec1, vec2);
+/// ```
+///
+/// It can also initialize each element of a `Vec<T>` with a given value.
+/// This may be more efficient than performing allocation and initialization
+/// in separate steps, especially when initializing a vector of zeros:
+///
+/// ```
+/// let vec = vec![0; 5];
+/// assert_eq!(vec, [0, 0, 0, 0, 0]);
+///
+/// // The following is equivalent, but potentially slower:
+/// let mut vec = Vec::with_capacity(5);
+/// vec.resize(5, 0);
+/// assert_eq!(vec, [0, 0, 0, 0, 0]);
+/// ```
+///
+/// For more information, see
+/// [Capacity and Reallocation](#capacity-and-reallocation).
+///
+/// Use a `Vec<T>` as an efficient stack:
+///
+/// ```
+/// let mut stack = Vec::new();
+///
+/// stack.push(1);
+/// stack.push(2);
+/// stack.push(3);
+///
+/// while let Some(top) = stack.pop() {
+///     // Prints 3, 2, 1
+///     println!("{top}");
+/// }
+/// ```
+///
+/// # Indexing
+///
+/// The `Vec` type allows to access values by index, because it implements the
+/// [`Index`] trait. An example will be more explicit:
+///
+/// ```
+/// let v = vec![0, 2, 4, 6];
+/// println!("{}", v[1]); // it will display '2'
+/// ```
+///
+/// However be careful: if you try to access an index which isn't in the `Vec`,
+/// your software will panic! You cannot do this:
+///
+/// ```should_panic
+/// let v = vec![0, 2, 4, 6];
+/// println!("{}", v[6]); // it will panic!
+/// ```
+///
+/// Use [`get`] and [`get_mut`] if you want to check whether the index is in
+/// the `Vec`.
+///
+/// # Slicing
+///
+/// A `Vec` can be mutable. On the other hand, slices are read-only objects.
+/// To get a [slice][prim@slice], use [`&`]. Example:
+///
+/// ```
+/// fn read_slice(slice: &[usize]) {
+///     // ...
+/// }
+///
+/// let v = vec![0, 1];
+/// read_slice(&v);
+///
+/// // ... and that's all!
+/// // you can also do it like this:
+/// let u: &[usize] = &v;
+/// // or like this:
+/// let u: &[_] = &v;
+/// ```
+///
+/// In Rust, it's more common to pass slices as arguments rather than vectors
+/// when you just want to provide read access. The same goes for [`String`] and
+/// [`&str`].
+///
+/// # Capacity and reallocation
+///
+/// The capacity of a vector is the amount of space allocated for any future
+/// elements that will be added onto the vector. This is not to be confused with
+/// the *length* of a vector, which specifies the number of actual elements
+/// within the vector. If a vector's length exceeds its capacity, its capacity
+/// will automatically be increased, but its elements will have to be
+/// reallocated.
+///
+/// For example, a vector with capacity 10 and length 0 would be an empty vector
+/// with space for 10 more elements. Pushing 10 or fewer elements onto the
+/// vector will not change its capacity or cause reallocation to occur. However,
+/// if the vector's length is increased to 11, it will have to reallocate, which
+/// can be slow. For this reason, it is recommended to use [`Vec::with_capacity`]
+/// whenever possible to specify how big the vector is expected to get.
+///
+/// # Guarantees
+///
+/// Due to its incredibly fundamental nature, `Vec` makes a lot of guarantees
+/// about its design. This ensures that it's as low-overhead as possible in
+/// the general case, and can be correctly manipulated in primitive ways
+/// by unsafe code. Note that these guarantees refer to an unqualified `Vec<T>`.
+/// If additional type parameters are added (e.g., to support custom allocators),
+/// overriding their defaults may change the behavior.
+///
+/// Most fundamentally, `Vec` is and always will be a (pointer, capacity, length)
+/// triplet. No more, no less. The order of these fields is completely
+/// unspecified, and you should use the appropriate methods to modify these.
+/// The pointer will never be null, so this type is null-pointer-optimized.
+///
+/// However, the pointer might not actually point to allocated memory. In particular,
+/// if you construct a `Vec` with capacity 0 via [`Vec::new`], [`vec![]`][`vec!`],
+/// [`Vec::with_capacity(0)`][`Vec::with_capacity`], or by calling [`shrink_to_fit`]
+/// on an empty Vec, it will not allocate memory. Similarly, if you store zero-sized
+/// types inside a `Vec`, it will not allocate space for them. *Note that in this case
+/// the `Vec` might not report a [`capacity`] of 0*. `Vec` will allocate if and only
+/// if <code>[mem::size_of::\<T>]\() * [capacity]\() > 0</code>. In general, `Vec`'s allocation
+/// details are very subtle --- if you intend to allocate memory using a `Vec`
+/// and use it for something else (either to pass to unsafe code, or to build your
+/// own memory-backed collection), be sure to deallocate this memory by using
+/// `from_raw_parts` to recover the `Vec` and then dropping it.
+///
+/// If a `Vec` *has* allocated memory, then the memory it points to is on the heap
+/// (as defined by the allocator Rust is configured to use by default), and its
+/// pointer points to [`len`] initialized, contiguous elements in order (what
+/// you would see if you coerced it to a slice), followed by <code>[capacity] - [len]</code>
+/// logically uninitialized, contiguous elements.
+///
+/// A vector containing the elements `'a'` and `'b'` with capacity 4 can be
+/// visualized as below. The top part is the `Vec` struct, it contains a
+/// pointer to the head of the allocation in the heap, length and capacity.
+/// The bottom part is the allocation on the heap, a contiguous memory block.
+///
+/// ```text
+///             ptr      len  capacity
+///        +--------+--------+--------+
+///        | 0x0123 |      2 |      4 |
+///        +--------+--------+--------+
+///             |
+///             v
+/// Heap   +--------+--------+--------+--------+
+///        |    'a' |    'b' | uninit | uninit |
+///        +--------+--------+--------+--------+
+/// ```
+///
+/// - **uninit** represents memory that is not initialized, see [`MaybeUninit`].
+/// - Note: the ABI is not stable and `Vec` makes no guarantees about its memory
+///   layout (including the order of fields).
+///
+/// `Vec` will never perform a "small optimization" where elements are actually
+/// stored on the stack for two reasons:
+///
+/// * It would make it more difficult for unsafe code to correctly manipulate
+///   a `Vec`. The contents of a `Vec` wouldn't have a stable address if it were
+///   only moved, and it would be more difficult to determine if a `Vec` had
+///   actually allocated memory.
+///
+/// * It would penalize the general case, incurring an additional branch
+///   on every access.
+///
+/// `Vec` will never automatically shrink itself, even if completely empty. This
+/// ensures no unnecessary allocations or deallocations occur. Emptying a `Vec`
+/// and then filling it back up to the same [`len`] should incur no calls to
+/// the allocator. If you wish to free up unused memory, use
+/// [`shrink_to_fit`] or [`shrink_to`].
+///
+/// [`push`] and [`insert`] will never (re)allocate if the reported capacity is
+/// sufficient. [`push`] and [`insert`] *will* (re)allocate if
+/// <code>[len] == [capacity]</code>. That is, the reported capacity is completely
+/// accurate, and can be relied on. It can even be used to manually free the memory
+/// allocated by a `Vec` if desired. Bulk insertion methods *may* reallocate, even
+/// when not necessary.
+///
+/// `Vec` does not guarantee any particular growth strategy when reallocating
+/// when full, nor when [`reserve`] is called. The current strategy is basic
+/// and it may prove desirable to use a non-constant growth factor. Whatever
+/// strategy is used will of course guarantee *O*(1) amortized [`push`].
+///
+/// `vec![x; n]`, `vec![a, b, c, d]`, and
+/// [`Vec::with_capacity(n)`][`Vec::with_capacity`], will all produce a `Vec`
+/// with exactly the requested capacity. If <code>[len] == [capacity]</code>,
+/// (as is the case for the [`vec!`] macro), then a `Vec<T>` can be converted to
+/// and from a [`Box<[T]>`][owned slice] without reallocating or moving the elements.
+///
+/// `Vec` will not specifically overwrite any data that is removed from it,
+/// but also won't specifically preserve it. Its uninitialized memory is
+/// scratch space that it may use however it wants. It will generally just do
+/// whatever is most efficient or otherwise easy to implement. Do not rely on
+/// removed data to be erased for security purposes. Even if you drop a `Vec`, its
+/// buffer may simply be reused by another allocation. Even if you zero a `Vec`'s memory
+/// first, that might not actually happen because the optimizer does not consider
+/// this a side-effect that must be preserved. There is one case which we will
+/// not break, however: using `unsafe` code to write to the excess capacity,
+/// and then increasing the length to match, is always valid.
+///
+/// Currently, `Vec` does not guarantee the order in which elements are dropped.
+/// The order has changed in the past and may change again.
+///
+/// [`get`]: ../../std/vec/struct.Vec.html#method.get
+/// [`get_mut`]: ../../std/vec/struct.Vec.html#method.get_mut
+/// [`String`]: crate::string::String
+/// [`&str`]: type@str
+/// [`shrink_to_fit`]: Vec::shrink_to_fit
+/// [`shrink_to`]: Vec::shrink_to
+/// [capacity]: Vec::capacity
+/// [`capacity`]: Vec::capacity
+/// [mem::size_of::\<T>]: core::mem::size_of
+/// [len]: Vec::len
+/// [`len`]: Vec::len
+/// [`push`]: Vec::push
+/// [`insert`]: Vec::insert
+/// [`reserve`]: Vec::reserve
+/// [`MaybeUninit`]: core::mem::MaybeUninit
+/// [owned slice]: Box
+#[stable(feature = "rust1", since = "1.0.0")]
+#[cfg_attr(not(test), rustc_diagnostic_item = "Vec")]
+#[rustc_insignificant_dtor]
+pub struct Vec<T, #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global> {
+    buf: RawVec<T, A>,
+    len: usize,
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Inherent methods
+////////////////////////////////////////////////////////////////////////////////
+
+impl<T> Vec<T> {
+    /// Constructs a new, empty `Vec<T>`.
+    ///
+    /// The vector will not allocate until elements are pushed onto it.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # #![allow(unused_mut)]
+    /// let mut vec: Vec<i32> = Vec::new();
+    /// ```
+    #[inline]
+    #[rustc_const_stable(feature = "const_vec_new", since = "1.39.0")]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[must_use]
+    pub const fn new() -> Self {
+        Vec { buf: RawVec::NEW, len: 0 }
+    }
+
+    /// Constructs a new, empty `Vec<T>` with the specified capacity.
+    ///
+    /// The vector will be able to hold exactly `capacity` elements without
+    /// reallocating. If `capacity` is 0, the vector will not allocate.
+    ///
+    /// It is important to note that although the returned vector has the
+    /// *capacity* specified, the vector will have a zero *length*. For an
+    /// explanation of the difference between length and capacity, see
+    /// *[Capacity and reallocation]*.
+    ///
+    /// [Capacity and reallocation]: #capacity-and-reallocation
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    ///
+    /// // The vector contains no items, even though it has capacity for more
+    /// assert_eq!(vec.len(), 0);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // These are all done without reallocating...
+    /// for i in 0..10 {
+    ///     vec.push(i);
+    /// }
+    /// assert_eq!(vec.len(), 10);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // ...but this may make the vector reallocate
+    /// vec.push(11);
+    /// assert_eq!(vec.len(), 11);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[must_use]
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity_in(capacity, Global)
+    }
+
+    /// Creates a `Vec<T>` directly from the raw components of another vector.
+    ///
+    /// # Safety
+    ///
+    /// This is highly unsafe, due to the number of invariants that aren't
+    /// checked:
+    ///
+    /// * `ptr` needs to have been previously allocated via [`String`]/`Vec<T>`
+    ///   (at least, it's highly likely to be incorrect if it wasn't).
+    /// * `T` needs to have the same alignment as what `ptr` was allocated with.
+    ///   (`T` having a less strict alignment is not sufficient, the alignment really
+    ///   needs to be equal to satisfy the [`dealloc`] requirement that memory must be
+    ///   allocated and deallocated with the same layout.)
+    /// * The size of `T` times the `capacity` (ie. the allocated size in bytes) needs
+    ///   to be the same size as the pointer was allocated with. (Because similar to
+    ///   alignment, [`dealloc`] must be called with the same layout `size`.)
+    /// * `length` needs to be less than or equal to `capacity`.
+    ///
+    /// Violating these may cause problems like corrupting the allocator's
+    /// internal data structures. For example it is normally **not** safe
+    /// to build a `Vec<u8>` from a pointer to a C `char` array with length
+    /// `size_t`, doing so is only safe if the array was initially allocated by
+    /// a `Vec` or `String`.
+    /// It's also not safe to build one from a `Vec<u16>` and its length, because
+    /// the allocator cares about the alignment, and these two types have different
+    /// alignments. The buffer was allocated with alignment 2 (for `u16`), but after
+    /// turning it into a `Vec<u8>` it'll be deallocated with alignment 1. To avoid
+    /// these issues, it is often preferable to do casting/transmuting using
+    /// [`slice::from_raw_parts`] instead.
+    ///
+    /// The ownership of `ptr` is effectively transferred to the
+    /// `Vec<T>` which may then deallocate, reallocate or change the
+    /// contents of memory pointed to by the pointer at will. Ensure
+    /// that nothing else uses the pointer after calling this
+    /// function.
+    ///
+    /// [`String`]: crate::string::String
+    /// [`dealloc`]: crate::alloc::GlobalAlloc::dealloc
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::ptr;
+    /// use std::mem;
+    ///
+    /// let v = vec![1, 2, 3];
+    ///
+    // FIXME Update this when vec_into_raw_parts is stabilized
+    /// // Prevent running `v`'s destructor so we are in complete control
+    /// // of the allocation.
+    /// let mut v = mem::ManuallyDrop::new(v);
+    ///
+    /// // Pull out the various important pieces of information about `v`
+    /// let p = v.as_mut_ptr();
+    /// let len = v.len();
+    /// let cap = v.capacity();
+    ///
+    /// unsafe {
+    ///     // Overwrite memory with 4, 5, 6
+    ///     for i in 0..len as isize {
+    ///         ptr::write(p.offset(i), 4 + i);
+    ///     }
+    ///
+    ///     // Put everything back together into a Vec
+    ///     let rebuilt = Vec::from_raw_parts(p, len, cap);
+    ///     assert_eq!(rebuilt, [4, 5, 6]);
+    /// }
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub unsafe fn from_raw_parts(ptr: *mut T, length: usize, capacity: usize) -> Self {
+        unsafe { Self::from_raw_parts_in(ptr, length, capacity, Global) }
+    }
+}
+
+impl<T, A: Allocator> Vec<T, A> {
+    /// Constructs a new, empty `Vec<T, A>`.
+    ///
+    /// The vector will not allocate until elements are pushed onto it.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// # #[allow(unused_mut)]
+    /// let mut vec: Vec<i32, _> = Vec::new_in(System);
+    /// ```
+    #[inline]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    pub const fn new_in(alloc: A) -> Self {
+        Vec { buf: RawVec::new_in(alloc), len: 0 }
+    }
+
+    /// Constructs a new, empty `Vec<T, A>` with the specified capacity with the provided
+    /// allocator.
+    ///
+    /// The vector will be able to hold exactly `capacity` elements without
+    /// reallocating. If `capacity` is 0, the vector will not allocate.
+    ///
+    /// It is important to note that although the returned vector has the
+    /// *capacity* specified, the vector will have a zero *length*. For an
+    /// explanation of the difference between length and capacity, see
+    /// *[Capacity and reallocation]*.
+    ///
+    /// [Capacity and reallocation]: #capacity-and-reallocation
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut vec = Vec::with_capacity_in(10, System);
+    ///
+    /// // The vector contains no items, even though it has capacity for more
+    /// assert_eq!(vec.len(), 0);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // These are all done without reallocating...
+    /// for i in 0..10 {
+    ///     vec.push(i);
+    /// }
+    /// assert_eq!(vec.len(), 10);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // ...but this may make the vector reallocate
+    /// vec.push(11);
+    /// assert_eq!(vec.len(), 11);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    pub fn with_capacity_in(capacity: usize, alloc: A) -> Self {
+        Vec { buf: RawVec::with_capacity_in(capacity, alloc), len: 0 }
+    }
+
+    /// Creates a `Vec<T, A>` directly from the raw components of another vector.
+    ///
+    /// # Safety
+    ///
+    /// This is highly unsafe, due to the number of invariants that aren't
+    /// checked:
+    ///
+    /// * `ptr` needs to have been previously allocated via [`String`]/`Vec<T>`
+    ///   (at least, it's highly likely to be incorrect if it wasn't).
+    /// * `T` needs to have the same size and alignment as what `ptr` was allocated with.
+    ///   (`T` having a less strict alignment is not sufficient, the alignment really
+    ///   needs to be equal to satisfy the [`dealloc`] requirement that memory must be
+    ///   allocated and deallocated with the same layout.)
+    /// * `length` needs to be less than or equal to `capacity`.
+    /// * `capacity` needs to be the capacity that the pointer was allocated with.
+    ///
+    /// Violating these may cause problems like corrupting the allocator's
+    /// internal data structures. For example it is **not** safe
+    /// to build a `Vec<u8>` from a pointer to a C `char` array with length `size_t`.
+    /// It's also not safe to build one from a `Vec<u16>` and its length, because
+    /// the allocator cares about the alignment, and these two types have different
+    /// alignments. The buffer was allocated with alignment 2 (for `u16`), but after
+    /// turning it into a `Vec<u8>` it'll be deallocated with alignment 1.
+    ///
+    /// The ownership of `ptr` is effectively transferred to the
+    /// `Vec<T>` which may then deallocate, reallocate or change the
+    /// contents of memory pointed to by the pointer at will. Ensure
+    /// that nothing else uses the pointer after calling this
+    /// function.
+    ///
+    /// [`String`]: crate::string::String
+    /// [`dealloc`]: crate::alloc::GlobalAlloc::dealloc
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// use std::ptr;
+    /// use std::mem;
+    ///
+    /// let mut v = Vec::with_capacity_in(3, System);
+    /// v.push(1);
+    /// v.push(2);
+    /// v.push(3);
+    ///
+    // FIXME Update this when vec_into_raw_parts is stabilized
+    /// // Prevent running `v`'s destructor so we are in complete control
+    /// // of the allocation.
+    /// let mut v = mem::ManuallyDrop::new(v);
+    ///
+    /// // Pull out the various important pieces of information about `v`
+    /// let p = v.as_mut_ptr();
+    /// let len = v.len();
+    /// let cap = v.capacity();
+    /// let alloc = v.allocator();
+    ///
+    /// unsafe {
+    ///     // Overwrite memory with 4, 5, 6
+    ///     for i in 0..len as isize {
+    ///         ptr::write(p.offset(i), 4 + i);
+    ///     }
+    ///
+    ///     // Put everything back together into a Vec
+    ///     let rebuilt = Vec::from_raw_parts_in(p, len, cap, alloc.clone());
+    ///     assert_eq!(rebuilt, [4, 5, 6]);
+    /// }
+    /// ```
+    #[inline]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    pub unsafe fn from_raw_parts_in(ptr: *mut T, length: usize, capacity: usize, alloc: A) -> Self {
+        unsafe { Vec { buf: RawVec::from_raw_parts_in(ptr, capacity, alloc), len: length } }
+    }
+
+    /// Decomposes a `Vec<T>` into its raw components.
+    ///
+    /// Returns the raw pointer to the underlying data, the length of
+    /// the vector (in elements), and the allocated capacity of the
+    /// data (in elements). These are the same arguments in the same
+    /// order as the arguments to [`from_raw_parts`].
+    ///
+    /// After calling this function, the caller is responsible for the
+    /// memory previously managed by the `Vec`. The only way to do
+    /// this is to convert the raw pointer, length, and capacity back
+    /// into a `Vec` with the [`from_raw_parts`] function, allowing
+    /// the destructor to perform the cleanup.
+    ///
+    /// [`from_raw_parts`]: Vec::from_raw_parts
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(vec_into_raw_parts)]
+    /// let v: Vec<i32> = vec![-1, 0, 1];
+    ///
+    /// let (ptr, len, cap) = v.into_raw_parts();
+    ///
+    /// let rebuilt = unsafe {
+    ///     // We can now make changes to the components, such as
+    ///     // transmuting the raw pointer to a compatible type.
+    ///     let ptr = ptr as *mut u32;
+    ///
+    ///     Vec::from_raw_parts(ptr, len, cap)
+    /// };
+    /// assert_eq!(rebuilt, [4294967295, 0, 1]);
+    /// ```
+    #[unstable(feature = "vec_into_raw_parts", reason = "new API", issue = "65816")]
+    pub fn into_raw_parts(self) -> (*mut T, usize, usize) {
+        let mut me = ManuallyDrop::new(self);
+        (me.as_mut_ptr(), me.len(), me.capacity())
+    }
+
+    /// Decomposes a `Vec<T>` into its raw components.
+    ///
+    /// Returns the raw pointer to the underlying data, the length of the vector (in elements),
+    /// the allocated capacity of the data (in elements), and the allocator. These are the same
+    /// arguments in the same order as the arguments to [`from_raw_parts_in`].
+    ///
+    /// After calling this function, the caller is responsible for the
+    /// memory previously managed by the `Vec`. The only way to do
+    /// this is to convert the raw pointer, length, and capacity back
+    /// into a `Vec` with the [`from_raw_parts_in`] function, allowing
+    /// the destructor to perform the cleanup.
+    ///
+    /// [`from_raw_parts_in`]: Vec::from_raw_parts_in
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, vec_into_raw_parts)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut v: Vec<i32, System> = Vec::new_in(System);
+    /// v.push(-1);
+    /// v.push(0);
+    /// v.push(1);
+    ///
+    /// let (ptr, len, cap, alloc) = v.into_raw_parts_with_alloc();
+    ///
+    /// let rebuilt = unsafe {
+    ///     // We can now make changes to the components, such as
+    ///     // transmuting the raw pointer to a compatible type.
+    ///     let ptr = ptr as *mut u32;
+    ///
+    ///     Vec::from_raw_parts_in(ptr, len, cap, alloc)
+    /// };
+    /// assert_eq!(rebuilt, [4294967295, 0, 1]);
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "vec_into_raw_parts", reason = "new API", issue = "65816")]
+    pub fn into_raw_parts_with_alloc(self) -> (*mut T, usize, usize, A) {
+        let mut me = ManuallyDrop::new(self);
+        let len = me.len();
+        let capacity = me.capacity();
+        let ptr = me.as_mut_ptr();
+        let alloc = unsafe { ptr::read(me.allocator()) };
+        (ptr, len, capacity, alloc)
+    }
+
+    /// Returns the number of elements the vector can hold without
+    /// reallocating.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let vec: Vec<i32> = Vec::with_capacity(10);
+    /// assert_eq!(vec.capacity(), 10);
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn capacity(&self) -> usize {
+        self.buf.capacity()
+    }
+
+    /// Reserves capacity for at least `additional` more elements to be inserted
+    /// in the given `Vec<T>`. The collection may reserve more space to avoid
+    /// frequent reallocations. After calling `reserve`, capacity will be
+    /// greater than or equal to `self.len() + additional`. Does nothing if
+    /// capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1];
+    /// vec.reserve(10);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn reserve(&mut self, additional: usize) {
+        self.buf.reserve(self.len, additional);
+    }
+
+    /// Reserves the minimum capacity for exactly `additional` more elements to
+    /// be inserted in the given `Vec<T>`. After calling `reserve_exact`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if the capacity is already sufficient.
+    ///
+    /// Note that the allocator may give the collection more space than it
+    /// requests. Therefore, capacity can not be relied upon to be precisely
+    /// minimal. Prefer [`reserve`] if future insertions are expected.
+    ///
+    /// [`reserve`]: Vec::reserve
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1];
+    /// vec.reserve_exact(10);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn reserve_exact(&mut self, additional: usize) {
+        self.buf.reserve_exact(self.len, additional);
+    }
+
+    /// Tries to reserve capacity for at least `additional` more elements to be inserted
+    /// in the given `Vec<T>`. The collection may reserve more space to avoid
+    /// frequent reallocations. After calling `try_reserve`, capacity will be
+    /// greater than or equal to `self.len() + additional`. Does nothing if
+    /// capacity is already sufficient.
+    ///
+    /// # Errors
+    ///
+    /// If the capacity overflows, or the allocator reports a failure, then an error
+    /// is returned.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::collections::TryReserveError;
+    ///
+    /// fn process_data(data: &[u32]) -> Result<Vec<u32>, TryReserveError> {
+    ///     let mut output = Vec::new();
+    ///
+    ///     // Pre-reserve the memory, exiting if we can't
+    ///     output.try_reserve(data.len())?;
+    ///
+    ///     // Now we know this can't OOM in the middle of our complex work
+    ///     output.extend(data.iter().map(|&val| {
+    ///         val * 2 + 5 // very complicated
+    ///     }));
+    ///
+    ///     Ok(output)
+    /// }
+    /// # process_data(&[1, 2, 3]).expect("why is the test harness OOMing on 12 bytes?");
+    /// ```
+    #[stable(feature = "try_reserve", since = "1.57.0")]
+    pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
+        self.buf.try_reserve(self.len, additional)
+    }
+
+    /// Tries to reserve the minimum capacity for exactly `additional`
+    /// elements to be inserted in the given `Vec<T>`. After calling
+    /// `try_reserve_exact`, capacity will be greater than or equal to
+    /// `self.len() + additional` if it returns `Ok(())`.
+    /// Does nothing if the capacity is already sufficient.
+    ///
+    /// Note that the allocator may give the collection more space than it
+    /// requests. Therefore, capacity can not be relied upon to be precisely
+    /// minimal. Prefer [`try_reserve`] if future insertions are expected.
+    ///
+    /// [`try_reserve`]: Vec::try_reserve
+    ///
+    /// # Errors
+    ///
+    /// If the capacity overflows, or the allocator reports a failure, then an error
+    /// is returned.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::collections::TryReserveError;
+    ///
+    /// fn process_data(data: &[u32]) -> Result<Vec<u32>, TryReserveError> {
+    ///     let mut output = Vec::new();
+    ///
+    ///     // Pre-reserve the memory, exiting if we can't
+    ///     output.try_reserve_exact(data.len())?;
+    ///
+    ///     // Now we know this can't OOM in the middle of our complex work
+    ///     output.extend(data.iter().map(|&val| {
+    ///         val * 2 + 5 // very complicated
+    ///     }));
+    ///
+    ///     Ok(output)
+    /// }
+    /// # process_data(&[1, 2, 3]).expect("why is the test harness OOMing on 12 bytes?");
+    /// ```
+    #[stable(feature = "try_reserve", since = "1.57.0")]
+    pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
+        self.buf.try_reserve_exact(self.len, additional)
+    }
+
+    /// Shrinks the capacity of the vector as much as possible.
+    ///
+    /// It will drop down as close as possible to the length but the allocator
+    /// may still inform the vector that there is space for a few more elements.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    /// vec.extend([1, 2, 3]);
+    /// assert_eq!(vec.capacity(), 10);
+    /// vec.shrink_to_fit();
+    /// assert!(vec.capacity() >= 3);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn shrink_to_fit(&mut self) {
+        // The capacity is never less than the length, and there's nothing to do when
+        // they are equal, so we can avoid the panic case in `RawVec::shrink_to_fit`
+        // by only calling it with a greater capacity.
+        if self.capacity() > self.len {
+            self.buf.shrink_to_fit(self.len);
+        }
+    }
+
+    /// Shrinks the capacity of the vector with a lower bound.
+    ///
+    /// The capacity will remain at least as large as both the length
+    /// and the supplied value.
+    ///
+    /// If the current capacity is less than the lower limit, this is a no-op.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    /// vec.extend([1, 2, 3]);
+    /// assert_eq!(vec.capacity(), 10);
+    /// vec.shrink_to(4);
+    /// assert!(vec.capacity() >= 4);
+    /// vec.shrink_to(0);
+    /// assert!(vec.capacity() >= 3);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "shrink_to", since = "1.56.0")]
+    pub fn shrink_to(&mut self, min_capacity: usize) {
+        if self.capacity() > min_capacity {
+            self.buf.shrink_to_fit(cmp::max(self.len, min_capacity));
+        }
+    }
+
+    /// Converts the vector into [`Box<[T]>`][owned slice].
+    ///
+    /// Note that this will drop any excess capacity.
+    ///
+    /// [owned slice]: Box
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let v = vec![1, 2, 3];
+    ///
+    /// let slice = v.into_boxed_slice();
+    /// ```
+    ///
+    /// Any excess capacity is removed:
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    /// vec.extend([1, 2, 3]);
+    ///
+    /// assert_eq!(vec.capacity(), 10);
+    /// let slice = vec.into_boxed_slice();
+    /// assert_eq!(slice.into_vec().capacity(), 3);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn into_boxed_slice(mut self) -> Box<[T], A> {
+        unsafe {
+            self.shrink_to_fit();
+            let me = ManuallyDrop::new(self);
+            let buf = ptr::read(&me.buf);
+            let len = me.len();
+            buf.into_box(len).assume_init()
+        }
+    }
+
+    /// Shortens the vector, keeping the first `len` elements and dropping
+    /// the rest.
+    ///
+    /// If `len` is greater than the vector's current length, this has no
+    /// effect.
+    ///
+    /// The [`drain`] method can emulate `truncate`, but causes the excess
+    /// elements to be returned instead of dropped.
+    ///
+    /// Note that this method has no effect on the allocated capacity
+    /// of the vector.
+    ///
+    /// # Examples
+    ///
+    /// Truncating a five element vector to two elements:
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4, 5];
+    /// vec.truncate(2);
+    /// assert_eq!(vec, [1, 2]);
+    /// ```
+    ///
+    /// No truncation occurs when `len` is greater than the vector's current
+    /// length:
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.truncate(8);
+    /// assert_eq!(vec, [1, 2, 3]);
+    /// ```
+    ///
+    /// Truncating when `len == 0` is equivalent to calling the [`clear`]
+    /// method.
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.truncate(0);
+    /// assert_eq!(vec, []);
+    /// ```
+    ///
+    /// [`clear`]: Vec::clear
+    /// [`drain`]: Vec::drain
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn truncate(&mut self, len: usize) {
+        // This is safe because:
+        //
+        // * the slice passed to `drop_in_place` is valid; the `len > self.len`
+        //   case avoids creating an invalid slice, and
+        // * the `len` of the vector is shrunk before calling `drop_in_place`,
+        //   such that no value will be dropped twice in case `drop_in_place`
+        //   were to panic once (if it panics twice, the program aborts).
+        unsafe {
+            // Note: It's intentional that this is `>` and not `>=`.
+            //       Changing it to `>=` has negative performance
+            //       implications in some cases. See #78884 for more.
+            if len > self.len {
+                return;
+            }
+            let remaining_len = self.len - len;
+            let s = ptr::slice_from_raw_parts_mut(self.as_mut_ptr().add(len), remaining_len);
+            self.len = len;
+            ptr::drop_in_place(s);
+        }
+    }
+
+    /// Extracts a slice containing the entire vector.
+    ///
+    /// Equivalent to `&s[..]`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::io::{self, Write};
+    /// let buffer = vec![1, 2, 3, 5, 8];
+    /// io::sink().write(buffer.as_slice()).unwrap();
+    /// ```
+    #[inline]
+    #[stable(feature = "vec_as_slice", since = "1.7.0")]
+    pub fn as_slice(&self) -> &[T] {
+        self
+    }
+
+    /// Extracts a mutable slice of the entire vector.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::io::{self, Read};
+    /// let mut buffer = vec![0; 3];
+    /// io::repeat(0b101).read_exact(buffer.as_mut_slice()).unwrap();
+    /// ```
+    #[inline]
+    #[stable(feature = "vec_as_slice", since = "1.7.0")]
+    pub fn as_mut_slice(&mut self) -> &mut [T] {
+        self
+    }
+
+    /// Returns a raw pointer to the vector's buffer.
+    ///
+    /// The caller must ensure that the vector outlives the pointer this
+    /// function returns, or else it will end up pointing to garbage.
+    /// Modifying the vector may cause its buffer to be reallocated,
+    /// which would also make any pointers to it invalid.
+    ///
+    /// The caller must also ensure that the memory the pointer (non-transitively) points to
+    /// is never written to (except inside an `UnsafeCell`) using this pointer or any pointer
+    /// derived from it. If you need to mutate the contents of the slice, use [`as_mut_ptr`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let x = vec![1, 2, 4];
+    /// let x_ptr = x.as_ptr();
+    ///
+    /// unsafe {
+    ///     for i in 0..x.len() {
+    ///         assert_eq!(*x_ptr.add(i), 1 << i);
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// [`as_mut_ptr`]: Vec::as_mut_ptr
+    #[stable(feature = "vec_as_ptr", since = "1.37.0")]
+    #[inline]
+    pub fn as_ptr(&self) -> *const T {
+        // We shadow the slice method of the same name to avoid going through
+        // `deref`, which creates an intermediate reference.
+        let ptr = self.buf.ptr();
+        unsafe {
+            assume(!ptr.is_null());
+        }
+        ptr
+    }
+
+    /// Returns an unsafe mutable pointer to the vector's buffer.
+    ///
+    /// The caller must ensure that the vector outlives the pointer this
+    /// function returns, or else it will end up pointing to garbage.
+    /// Modifying the vector may cause its buffer to be reallocated,
+    /// which would also make any pointers to it invalid.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// // Allocate vector big enough for 4 elements.
+    /// let size = 4;
+    /// let mut x: Vec<i32> = Vec::with_capacity(size);
+    /// let x_ptr = x.as_mut_ptr();
+    ///
+    /// // Initialize elements via raw pointer writes, then set length.
+    /// unsafe {
+    ///     for i in 0..size {
+    ///         *x_ptr.add(i) = i as i32;
+    ///     }
+    ///     x.set_len(size);
+    /// }
+    /// assert_eq!(&*x, &[0, 1, 2, 3]);
+    /// ```
+    #[stable(feature = "vec_as_ptr", since = "1.37.0")]
+    #[inline]
+    pub fn as_mut_ptr(&mut self) -> *mut T {
+        // We shadow the slice method of the same name to avoid going through
+        // `deref_mut`, which creates an intermediate reference.
+        let ptr = self.buf.ptr();
+        unsafe {
+            assume(!ptr.is_null());
+        }
+        ptr
+    }
+
+    /// Returns a reference to the underlying allocator.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn allocator(&self) -> &A {
+        self.buf.allocator()
+    }
+
+    /// Forces the length of the vector to `new_len`.
+    ///
+    /// This is a low-level operation that maintains none of the normal
+    /// invariants of the type. Normally changing the length of a vector
+    /// is done using one of the safe operations instead, such as
+    /// [`truncate`], [`resize`], [`extend`], or [`clear`].
+    ///
+    /// [`truncate`]: Vec::truncate
+    /// [`resize`]: Vec::resize
+    /// [`extend`]: Extend::extend
+    /// [`clear`]: Vec::clear
+    ///
+    /// # Safety
+    ///
+    /// - `new_len` must be less than or equal to [`capacity()`].
+    /// - The elements at `old_len..new_len` must be initialized.
+    ///
+    /// [`capacity()`]: Vec::capacity
+    ///
+    /// # Examples
+    ///
+    /// This method can be useful for situations in which the vector
+    /// is serving as a buffer for other code, particularly over FFI:
+    ///
+    /// ```no_run
+    /// # #![allow(dead_code)]
+    /// # // This is just a minimal skeleton for the doc example;
+    /// # // don't use this as a starting point for a real library.
+    /// # pub struct StreamWrapper { strm: *mut std::ffi::c_void }
+    /// # const Z_OK: i32 = 0;
+    /// # extern "C" {
+    /// #     fn deflateGetDictionary(
+    /// #         strm: *mut std::ffi::c_void,
+    /// #         dictionary: *mut u8,
+    /// #         dictLength: *mut usize,
+    /// #     ) -> i32;
+    /// # }
+    /// # impl StreamWrapper {
+    /// pub fn get_dictionary(&self) -> Option<Vec<u8>> {
+    ///     // Per the FFI method's docs, "32768 bytes is always enough".
+    ///     let mut dict = Vec::with_capacity(32_768);
+    ///     let mut dict_length = 0;
+    ///     // SAFETY: When `deflateGetDictionary` returns `Z_OK`, it holds that:
+    ///     // 1. `dict_length` elements were initialized.
+    ///     // 2. `dict_length` <= the capacity (32_768)
+    ///     // which makes `set_len` safe to call.
+    ///     unsafe {
+    ///         // Make the FFI call...
+    ///         let r = deflateGetDictionary(self.strm, dict.as_mut_ptr(), &mut dict_length);
+    ///         if r == Z_OK {
+    ///             // ...and update the length to what was initialized.
+    ///             dict.set_len(dict_length);
+    ///             Some(dict)
+    ///         } else {
+    ///             None
+    ///         }
+    ///     }
+    /// }
+    /// # }
+    /// ```
+    ///
+    /// While the following example is sound, there is a memory leak since
+    /// the inner vectors were not freed prior to the `set_len` call:
+    ///
+    /// ```
+    /// let mut vec = vec![vec![1, 0, 0],
+    ///                    vec![0, 1, 0],
+    ///                    vec![0, 0, 1]];
+    /// // SAFETY:
+    /// // 1. `old_len..0` is empty so no elements need to be initialized.
+    /// // 2. `0 <= capacity` always holds whatever `capacity` is.
+    /// unsafe {
+    ///     vec.set_len(0);
+    /// }
+    /// ```
+    ///
+    /// Normally, here, one would use [`clear`] instead to correctly drop
+    /// the contents and thus not leak memory.
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub unsafe fn set_len(&mut self, new_len: usize) {
+        debug_assert!(new_len <= self.capacity());
+
+        self.len = new_len;
+    }
+
+    /// Removes an element from the vector and returns it.
+    ///
+    /// The removed element is replaced by the last element of the vector.
+    ///
+    /// This does not preserve ordering, but is *O*(1).
+    /// If you need to preserve the element order, use [`remove`] instead.
+    ///
+    /// [`remove`]: Vec::remove
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of bounds.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec!["foo", "bar", "baz", "qux"];
+    ///
+    /// assert_eq!(v.swap_remove(1), "bar");
+    /// assert_eq!(v, ["foo", "qux", "baz"]);
+    ///
+    /// assert_eq!(v.swap_remove(0), "foo");
+    /// assert_eq!(v, ["baz", "qux"]);
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn swap_remove(&mut self, index: usize) -> T {
+        #[cold]
+        #[inline(never)]
+        fn assert_failed(index: usize, len: usize) -> ! {
+            panic!("swap_remove index (is {index}) should be < len (is {len})");
+        }
+
+        let len = self.len();
+        if index >= len {
+            assert_failed(index, len);
+        }
+        unsafe {
+            // We replace self[index] with the last element. Note that if the
+            // bounds check above succeeds there must be a last element (which
+            // can be self[index] itself).
+            let value = ptr::read(self.as_ptr().add(index));
+            let base_ptr = self.as_mut_ptr();
+            ptr::copy(base_ptr.add(len - 1), base_ptr.add(index), 1);
+            self.set_len(len - 1);
+            value
+        }
+    }
+
+    /// Inserts an element at position `index` within the vector, shifting all
+    /// elements after it to the right.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index > len`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.insert(1, 4);
+    /// assert_eq!(vec, [1, 4, 2, 3]);
+    /// vec.insert(4, 5);
+    /// assert_eq!(vec, [1, 4, 2, 3, 5]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn insert(&mut self, index: usize, element: T) {
+        #[cold]
+        #[inline(never)]
+        fn assert_failed(index: usize, len: usize) -> ! {
+            panic!("insertion index (is {index}) should be <= len (is {len})");
+        }
+
+        let len = self.len();
+        if index > len {
+            assert_failed(index, len);
+        }
+
+        // space for the new element
+        if len == self.buf.capacity() {
+            self.reserve(1);
+        }
+
+        unsafe {
+            // infallible
+            // The spot to put the new value
+            {
+                let p = self.as_mut_ptr().add(index);
+                // Shift everything over to make space. (Duplicating the
+                // `index`th element into two consecutive places.)
+                ptr::copy(p, p.offset(1), len - index);
+                // Write it in, overwriting the first copy of the `index`th
+                // element.
+                ptr::write(p, element);
+            }
+            self.set_len(len + 1);
+        }
+    }
+
+    /// Removes and returns the element at position `index` within the vector,
+    /// shifting all elements after it to the left.
+    ///
+    /// Note: Because this shifts over the remaining elements, it has a
+    /// worst-case performance of *O*(*n*). If you don't need the order of elements
+    /// to be preserved, use [`swap_remove`] instead. If you'd like to remove
+    /// elements from the beginning of the `Vec`, consider using
+    /// [`VecDeque::pop_front`] instead.
+    ///
+    /// [`swap_remove`]: Vec::swap_remove
+    /// [`VecDeque::pop_front`]: crate::collections::VecDeque::pop_front
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of bounds.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec![1, 2, 3];
+    /// assert_eq!(v.remove(1), 2);
+    /// assert_eq!(v, [1, 3]);
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[track_caller]
+    pub fn remove(&mut self, index: usize) -> T {
+        #[cold]
+        #[inline(never)]
+        #[track_caller]
+        fn assert_failed(index: usize, len: usize) -> ! {
+            panic!("removal index (is {index}) should be < len (is {len})");
+        }
+
+        let len = self.len();
+        if index >= len {
+            assert_failed(index, len);
+        }
+        unsafe {
+            // infallible
+            let ret;
+            {
+                // the place we are taking from.
+                let ptr = self.as_mut_ptr().add(index);
+                // copy it out, unsafely having a copy of the value on
+                // the stack and in the vector at the same time.
+                ret = ptr::read(ptr);
+
+                // Shift everything down to fill in that spot.
+                ptr::copy(ptr.offset(1), ptr, len - index - 1);
+            }
+            self.set_len(len - 1);
+            ret
+        }
+    }
+
+    /// Retains only the elements specified by the predicate.
+    ///
+    /// In other words, remove all elements `e` for which `f(&e)` returns `false`.
+    /// This method operates in place, visiting each element exactly once in the
+    /// original order, and preserves the order of the retained elements.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4];
+    /// vec.retain(|&x| x % 2 == 0);
+    /// assert_eq!(vec, [2, 4]);
+    /// ```
+    ///
+    /// Because the elements are visited exactly once in the original order,
+    /// external state may be used to decide which elements to keep.
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4, 5];
+    /// let keep = [false, true, true, false, true];
+    /// let mut iter = keep.iter();
+    /// vec.retain(|_| *iter.next().unwrap());
+    /// assert_eq!(vec, [2, 3, 5]);
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn retain<F>(&mut self, mut f: F)
+    where
+        F: FnMut(&T) -> bool,
+    {
+        self.retain_mut(|elem| f(elem));
+    }
+
+    /// Retains only the elements specified by the predicate, passing a mutable reference to it.
+    ///
+    /// In other words, remove all elements `e` such that `f(&mut e)` returns `false`.
+    /// This method operates in place, visiting each element exactly once in the
+    /// original order, and preserves the order of the retained elements.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4];
+    /// vec.retain_mut(|x| if *x > 3 {
+    ///     false
+    /// } else {
+    ///     *x += 1;
+    ///     true
+    /// });
+    /// assert_eq!(vec, [2, 3, 4]);
+    /// ```
+    #[stable(feature = "vec_retain_mut", since = "1.61.0")]
+    pub fn retain_mut<F>(&mut self, mut f: F)
+    where
+        F: FnMut(&mut T) -> bool,
+    {
+        let original_len = self.len();
+        // Avoid double drop if the drop guard is not executed,
+        // since we may make some holes during the process.
+        unsafe { self.set_len(0) };
+
+        // Vec: [Kept, Kept, Hole, Hole, Hole, Hole, Unchecked, Unchecked]
+        //      |<-              processed len   ->| ^- next to check
+        //                  |<-  deleted cnt     ->|
+        //      |<-              original_len                          ->|
+        // Kept: Elements which predicate returns true on.
+        // Hole: Moved or dropped element slot.
+        // Unchecked: Unchecked valid elements.
+        //
+        // This drop guard will be invoked when predicate or `drop` of element panicked.
+        // It shifts unchecked elements to cover holes and `set_len` to the correct length.
+        // In cases when predicate and `drop` never panick, it will be optimized out.
+        struct BackshiftOnDrop<'a, T, A: Allocator> {
+            v: &'a mut Vec<T, A>,
+            processed_len: usize,
+            deleted_cnt: usize,
+            original_len: usize,
+        }
+
+        impl<T, A: Allocator> Drop for BackshiftOnDrop<'_, T, A> {
+            fn drop(&mut self) {
+                if self.deleted_cnt > 0 {
+                    // SAFETY: Trailing unchecked items must be valid since we never touch them.
+                    unsafe {
+                        ptr::copy(
+                            self.v.as_ptr().add(self.processed_len),
+                            self.v.as_mut_ptr().add(self.processed_len - self.deleted_cnt),
+                            self.original_len - self.processed_len,
+                        );
+                    }
+                }
+                // SAFETY: After filling holes, all items are in contiguous memory.
+                unsafe {
+                    self.v.set_len(self.original_len - self.deleted_cnt);
+                }
+            }
+        }
+
+        let mut g = BackshiftOnDrop { v: self, processed_len: 0, deleted_cnt: 0, original_len };
+
+        fn process_loop<F, T, A: Allocator, const DELETED: bool>(
+            original_len: usize,
+            f: &mut F,
+            g: &mut BackshiftOnDrop<'_, T, A>,
+        ) where
+            F: FnMut(&mut T) -> bool,
+        {
+            while g.processed_len != original_len {
+                // SAFETY: Unchecked element must be valid.
+                let cur = unsafe { &mut *g.v.as_mut_ptr().add(g.processed_len) };
+                if !f(cur) {
+                    // Advance early to avoid double drop if `drop_in_place` panicked.
+                    g.processed_len += 1;
+                    g.deleted_cnt += 1;
+                    // SAFETY: We never touch this element again after dropped.
+                    unsafe { ptr::drop_in_place(cur) };
+                    // We already advanced the counter.
+                    if DELETED {
+                        continue;
+                    } else {
+                        break;
+                    }
+                }
+                if DELETED {
+                    // SAFETY: `deleted_cnt` > 0, so the hole slot must not overlap with current element.
+                    // We use copy for move, and never touch this element again.
+                    unsafe {
+                        let hole_slot = g.v.as_mut_ptr().add(g.processed_len - g.deleted_cnt);
+                        ptr::copy_nonoverlapping(cur, hole_slot, 1);
+                    }
+                }
+                g.processed_len += 1;
+            }
+        }
+
+        // Stage 1: Nothing was deleted.
+        process_loop::<F, T, A, false>(original_len, &mut f, &mut g);
+
+        // Stage 2: Some elements were deleted.
+        process_loop::<F, T, A, true>(original_len, &mut f, &mut g);
+
+        // All item are processed. This can be optimized to `set_len` by LLVM.
+        drop(g);
+    }
+
+    /// Removes all but the first of consecutive elements in the vector that resolve to the same
+    /// key.
+    ///
+    /// If the vector is sorted, this removes all duplicates.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![10, 20, 21, 30, 20];
+    ///
+    /// vec.dedup_by_key(|i| *i / 10);
+    ///
+    /// assert_eq!(vec, [10, 20, 30, 20]);
+    /// ```
+    #[stable(feature = "dedup_by", since = "1.16.0")]
+    #[inline]
+    pub fn dedup_by_key<F, K>(&mut self, mut key: F)
+    where
+        F: FnMut(&mut T) -> K,
+        K: PartialEq,
+    {
+        self.dedup_by(|a, b| key(a) == key(b))
+    }
+
+    /// Removes all but the first of consecutive elements in the vector satisfying a given equality
+    /// relation.
+    ///
+    /// The `same_bucket` function is passed references to two elements from the vector and
+    /// must determine if the elements compare equal. The elements are passed in opposite order
+    /// from their order in the slice, so if `same_bucket(a, b)` returns `true`, `a` is removed.
+    ///
+    /// If the vector is sorted, this removes all duplicates.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec!["foo", "bar", "Bar", "baz", "bar"];
+    ///
+    /// vec.dedup_by(|a, b| a.eq_ignore_ascii_case(b));
+    ///
+    /// assert_eq!(vec, ["foo", "bar", "baz", "bar"]);
+    /// ```
+    #[stable(feature = "dedup_by", since = "1.16.0")]
+    pub fn dedup_by<F>(&mut self, mut same_bucket: F)
+    where
+        F: FnMut(&mut T, &mut T) -> bool,
+    {
+        let len = self.len();
+        if len <= 1 {
+            return;
+        }
+
+        /* INVARIANT: vec.len() > read >= write > write-1 >= 0 */
+        struct FillGapOnDrop<'a, T, A: core::alloc::Allocator> {
+            /* Offset of the element we want to check if it is duplicate */
+            read: usize,
+
+            /* Offset of the place where we want to place the non-duplicate
+             * when we find it. */
+            write: usize,
+
+            /* The Vec that would need correction if `same_bucket` panicked */
+            vec: &'a mut Vec<T, A>,
+        }
+
+        impl<'a, T, A: core::alloc::Allocator> Drop for FillGapOnDrop<'a, T, A> {
+            fn drop(&mut self) {
+                /* This code gets executed when `same_bucket` panics */
+
+                /* SAFETY: invariant guarantees that `read - write`
+                 * and `len - read` never overflow and that the copy is always
+                 * in-bounds. */
+                unsafe {
+                    let ptr = self.vec.as_mut_ptr();
+                    let len = self.vec.len();
+
+                    /* How many items were left when `same_bucket` panicked.
+                     * Basically vec[read..].len() */
+                    let items_left = len.wrapping_sub(self.read);
+
+                    /* Pointer to first item in vec[write..write+items_left] slice */
+                    let dropped_ptr = ptr.add(self.write);
+                    /* Pointer to first item in vec[read..] slice */
+                    let valid_ptr = ptr.add(self.read);
+
+                    /* Copy `vec[read..]` to `vec[write..write+items_left]`.
+                     * The slices can overlap, so `copy_nonoverlapping` cannot be used */
+                    ptr::copy(valid_ptr, dropped_ptr, items_left);
+
+                    /* How many items have been already dropped
+                     * Basically vec[read..write].len() */
+                    let dropped = self.read.wrapping_sub(self.write);
+
+                    self.vec.set_len(len - dropped);
+                }
+            }
+        }
+
+        let mut gap = FillGapOnDrop { read: 1, write: 1, vec: self };
+        let ptr = gap.vec.as_mut_ptr();
+
+        /* Drop items while going through Vec, it should be more efficient than
+         * doing slice partition_dedup + truncate */
+
+        /* SAFETY: Because of the invariant, read_ptr, prev_ptr and write_ptr
+         * are always in-bounds and read_ptr never aliases prev_ptr */
+        unsafe {
+            while gap.read < len {
+                let read_ptr = ptr.add(gap.read);
+                let prev_ptr = ptr.add(gap.write.wrapping_sub(1));
+
+                if same_bucket(&mut *read_ptr, &mut *prev_ptr) {
+                    // Increase `gap.read` now since the drop may panic.
+                    gap.read += 1;
+                    /* We have found duplicate, drop it in-place */
+                    ptr::drop_in_place(read_ptr);
+                } else {
+                    let write_ptr = ptr.add(gap.write);
+
+                    /* Because `read_ptr` can be equal to `write_ptr`, we either
+                     * have to use `copy` or conditional `copy_nonoverlapping`.
+                     * Looks like the first option is faster. */
+                    ptr::copy(read_ptr, write_ptr, 1);
+
+                    /* We have filled that place, so go further */
+                    gap.write += 1;
+                    gap.read += 1;
+                }
+            }
+
+            /* Technically we could let `gap` clean up with its Drop, but
+             * when `same_bucket` is guaranteed to not panic, this bloats a little
+             * the codegen, so we just do it manually */
+            gap.vec.set_len(gap.write);
+            mem::forget(gap);
+        }
+    }
+
+    /// Appends an element to the back of a collection.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2];
+    /// vec.push(3);
+    /// assert_eq!(vec, [1, 2, 3]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn push(&mut self, value: T) {
+        // This will panic or abort if we would allocate > isize::MAX bytes
+        // or if the length increment would overflow for zero-sized types.
+        if self.len == self.buf.capacity() {
+            self.buf.reserve_for_push(self.len);
+        }
+        unsafe {
+            let end = self.as_mut_ptr().add(self.len);
+            ptr::write(end, value);
+            self.len += 1;
+        }
+    }
+
+    /// Removes the last element from a vector and returns it, or [`None`] if it
+    /// is empty.
+    ///
+    /// If you'd like to pop the first element, consider using
+    /// [`VecDeque::pop_front`] instead.
+    ///
+    /// [`VecDeque::pop_front`]: crate::collections::VecDeque::pop_front
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// assert_eq!(vec.pop(), Some(3));
+    /// assert_eq!(vec, [1, 2]);
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn pop(&mut self) -> Option<T> {
+        if self.len == 0 {
+            None
+        } else {
+            unsafe {
+                self.len -= 1;
+                Some(ptr::read(self.as_ptr().add(self.len())))
+            }
+        }
+    }
+
+    /// Moves all the elements of `other` into `self`, leaving `other` empty.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the number of elements in the vector overflows a `usize`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// let mut vec2 = vec![4, 5, 6];
+    /// vec.append(&mut vec2);
+    /// assert_eq!(vec, [1, 2, 3, 4, 5, 6]);
+    /// assert_eq!(vec2, []);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[stable(feature = "append", since = "1.4.0")]
+    pub fn append(&mut self, other: &mut Self) {
+        unsafe {
+            self.append_elements(other.as_slice() as _);
+            other.set_len(0);
+        }
+    }
+
+    /// Appends elements to `self` from other buffer.
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    unsafe fn append_elements(&mut self, other: *const [T]) {
+        let count = unsafe { (*other).len() };
+        self.reserve(count);
+        let len = self.len();
+        unsafe { ptr::copy_nonoverlapping(other as *const T, self.as_mut_ptr().add(len), count) };
+        self.len += count;
+    }
+
+    /// Removes the specified range from the vector in bulk, returning all
+    /// removed elements as an iterator. If the iterator is dropped before
+    /// being fully consumed, it drops the remaining removed elements.
+    ///
+    /// The returned iterator keeps a mutable borrow on the vector to optimize
+    /// its implementation.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the starting point is greater than the end point or if
+    /// the end point is greater than the length of the vector.
+    ///
+    /// # Leaking
+    ///
+    /// If the returned iterator goes out of scope without being dropped (due to
+    /// [`mem::forget`], for example), the vector may have lost and leaked
+    /// elements arbitrarily, including elements outside the range.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec![1, 2, 3];
+    /// let u: Vec<_> = v.drain(1..).collect();
+    /// assert_eq!(v, &[1]);
+    /// assert_eq!(u, &[2, 3]);
+    ///
+    /// // A full range clears the vector, like `clear()` does
+    /// v.drain(..);
+    /// assert_eq!(v, &[]);
+    /// ```
+    #[stable(feature = "drain", since = "1.6.0")]
+    pub fn drain<R>(&mut self, range: R) -> Drain<'_, T, A>
+    where
+        R: RangeBounds<usize>,
+    {
+        // Memory safety
+        //
+        // When the Drain is first created, it shortens the length of
+        // the source vector to make sure no uninitialized or moved-from elements
+        // are accessible at all if the Drain's destructor never gets to run.
+        //
+        // Drain will ptr::read out the values to remove.
+        // When finished, remaining tail of the vec is copied back to cover
+        // the hole, and the vector length is restored to the new length.
+        //
+        let len = self.len();
+        let Range { start, end } = slice::range(range, ..len);
+
+        unsafe {
+            // set self.vec length's to start, to be safe in case Drain is leaked
+            self.set_len(start);
+            // Use the borrow in the IterMut to indicate borrowing behavior of the
+            // whole Drain iterator (like &mut T).
+            let range_slice = slice::from_raw_parts_mut(self.as_mut_ptr().add(start), end - start);
+            Drain {
+                tail_start: end,
+                tail_len: len - end,
+                iter: range_slice.iter(),
+                vec: NonNull::from(self),
+            }
+        }
+    }
+
+    /// Clears the vector, removing all values.
+    ///
+    /// Note that this method has no effect on the allocated capacity
+    /// of the vector.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec![1, 2, 3];
+    ///
+    /// v.clear();
+    ///
+    /// assert!(v.is_empty());
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn clear(&mut self) {
+        let elems: *mut [T] = self.as_mut_slice();
+
+        // SAFETY:
+        // - `elems` comes directly from `as_mut_slice` and is therefore valid.
+        // - Setting `self.len` before calling `drop_in_place` means that,
+        //   if an element's `Drop` impl panics, the vector's `Drop` impl will
+        //   do nothing (leaking the rest of the elements) instead of dropping
+        //   some twice.
+        unsafe {
+            self.len = 0;
+            ptr::drop_in_place(elems);
+        }
+    }
+
+    /// Returns the number of elements in the vector, also referred to
+    /// as its 'length'.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let a = vec![1, 2, 3];
+    /// assert_eq!(a.len(), 3);
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns `true` if the vector contains no elements.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = Vec::new();
+    /// assert!(v.is_empty());
+    ///
+    /// v.push(1);
+    /// assert!(!v.is_empty());
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Splits the collection into two at the given index.
+    ///
+    /// Returns a newly allocated vector containing the elements in the range
+    /// `[at, len)`. After the call, the original vector will be left containing
+    /// the elements `[0, at)` with its previous capacity unchanged.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `at > len`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// let vec2 = vec.split_off(1);
+    /// assert_eq!(vec, [1]);
+    /// assert_eq!(vec2, [2, 3]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[must_use = "use `.truncate()` if you don't need the other half"]
+    #[stable(feature = "split_off", since = "1.4.0")]
+    pub fn split_off(&mut self, at: usize) -> Self
+    where
+        A: Clone,
+    {
+        #[cold]
+        #[inline(never)]
+        fn assert_failed(at: usize, len: usize) -> ! {
+            panic!("`at` split index (is {at}) should be <= len (is {len})");
+        }
+
+        if at > self.len() {
+            assert_failed(at, self.len());
+        }
+
+        if at == 0 {
+            // the new vector can take over the original buffer and avoid the copy
+            return mem::replace(
+                self,
+                Vec::with_capacity_in(self.capacity(), self.allocator().clone()),
+            );
+        }
+
+        let other_len = self.len - at;
+        let mut other = Vec::with_capacity_in(other_len, self.allocator().clone());
+
+        // Unsafely `set_len` and copy items to `other`.
+        unsafe {
+            self.set_len(at);
+            other.set_len(other_len);
+
+            ptr::copy_nonoverlapping(self.as_ptr().add(at), other.as_mut_ptr(), other.len());
+        }
+        other
+    }
+
+    /// Resizes the `Vec` in-place so that `len` is equal to `new_len`.
+    ///
+    /// If `new_len` is greater than `len`, the `Vec` is extended by the
+    /// difference, with each additional slot filled with the result of
+    /// calling the closure `f`. The return values from `f` will end up
+    /// in the `Vec` in the order they have been generated.
+    ///
+    /// If `new_len` is less than `len`, the `Vec` is simply truncated.
+    ///
+    /// This method uses a closure to create new values on every push. If
+    /// you'd rather [`Clone`] a given value, use [`Vec::resize`]. If you
+    /// want to use the [`Default`] trait to generate values, you can
+    /// pass [`Default::default`] as the second argument.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.resize_with(5, Default::default);
+    /// assert_eq!(vec, [1, 2, 3, 0, 0]);
+    ///
+    /// let mut vec = vec![];
+    /// let mut p = 1;
+    /// vec.resize_with(4, || { p *= 2; p });
+    /// assert_eq!(vec, [2, 4, 8, 16]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "vec_resize_with", since = "1.33.0")]
+    pub fn resize_with<F>(&mut self, new_len: usize, f: F)
+    where
+        F: FnMut() -> T,
+    {
+        let len = self.len();
+        if new_len > len {
+            self.extend_with(new_len - len, ExtendFunc(f));
+        } else {
+            self.truncate(new_len);
+        }
+    }
+
+    /// Consumes and leaks the `Vec`, returning a mutable reference to the contents,
+    /// `&'a mut [T]`. Note that the type `T` must outlive the chosen lifetime
+    /// `'a`. If the type has only static references, or none at all, then this
+    /// may be chosen to be `'static`.
+    ///
+    /// As of Rust 1.57, this method does not reallocate or shrink the `Vec`,
+    /// so the leaked allocation may include unused capacity that is not part
+    /// of the returned slice.
+    ///
+    /// This function is mainly useful for data that lives for the remainder of
+    /// the program's life. Dropping the returned reference will cause a memory
+    /// leak.
+    ///
+    /// # Examples
+    ///
+    /// Simple usage:
+    ///
+    /// ```
+    /// let x = vec![1, 2, 3];
+    /// let static_ref: &'static mut [usize] = x.leak();
+    /// static_ref[0] += 1;
+    /// assert_eq!(static_ref, &[2, 2, 3]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "vec_leak", since = "1.47.0")]
+    #[inline]
+    pub fn leak<'a>(self) -> &'a mut [T]
+    where
+        A: 'a,
+    {
+        let mut me = ManuallyDrop::new(self);
+        unsafe { slice::from_raw_parts_mut(me.as_mut_ptr(), me.len) }
+    }
+
+    /// Returns the remaining spare capacity of the vector as a slice of
+    /// `MaybeUninit<T>`.
+    ///
+    /// The returned slice can be used to fill the vector with data (e.g. by
+    /// reading from a file) before marking the data as initialized using the
+    /// [`set_len`] method.
+    ///
+    /// [`set_len`]: Vec::set_len
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// // Allocate vector big enough for 10 elements.
+    /// let mut v = Vec::with_capacity(10);
+    ///
+    /// // Fill in the first 3 elements.
+    /// let uninit = v.spare_capacity_mut();
+    /// uninit[0].write(0);
+    /// uninit[1].write(1);
+    /// uninit[2].write(2);
+    ///
+    /// // Mark the first 3 elements of the vector as being initialized.
+    /// unsafe {
+    ///     v.set_len(3);
+    /// }
+    ///
+    /// assert_eq!(&v, &[0, 1, 2]);
+    /// ```
+    #[stable(feature = "vec_spare_capacity", since = "1.60.0")]
+    #[inline]
+    pub fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit<T>] {
+        // Note:
+        // This method is not implemented in terms of `split_at_spare_mut`,
+        // to prevent invalidation of pointers to the buffer.
+        unsafe {
+            slice::from_raw_parts_mut(
+                self.as_mut_ptr().add(self.len) as *mut MaybeUninit<T>,
+                self.buf.capacity() - self.len,
+            )
+        }
+    }
+
+    /// Returns vector content as a slice of `T`, along with the remaining spare
+    /// capacity of the vector as a slice of `MaybeUninit<T>`.
+    ///
+    /// The returned spare capacity slice can be used to fill the vector with data
+    /// (e.g. by reading from a file) before marking the data as initialized using
+    /// the [`set_len`] method.
+    ///
+    /// [`set_len`]: Vec::set_len
+    ///
+    /// Note that this is a low-level API, which should be used with care for
+    /// optimization purposes. If you need to append data to a `Vec`
+    /// you can use [`push`], [`extend`], [`extend_from_slice`],
+    /// [`extend_from_within`], [`insert`], [`append`], [`resize`] or
+    /// [`resize_with`], depending on your exact needs.
+    ///
+    /// [`push`]: Vec::push
+    /// [`extend`]: Vec::extend
+    /// [`extend_from_slice`]: Vec::extend_from_slice
+    /// [`extend_from_within`]: Vec::extend_from_within
+    /// [`insert`]: Vec::insert
+    /// [`append`]: Vec::append
+    /// [`resize`]: Vec::resize
+    /// [`resize_with`]: Vec::resize_with
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(vec_split_at_spare)]
+    ///
+    /// let mut v = vec![1, 1, 2];
+    ///
+    /// // Reserve additional space big enough for 10 elements.
+    /// v.reserve(10);
+    ///
+    /// let (init, uninit) = v.split_at_spare_mut();
+    /// let sum = init.iter().copied().sum::<u32>();
+    ///
+    /// // Fill in the next 4 elements.
+    /// uninit[0].write(sum);
+    /// uninit[1].write(sum * 2);
+    /// uninit[2].write(sum * 3);
+    /// uninit[3].write(sum * 4);
+    ///
+    /// // Mark the 4 elements of the vector as being initialized.
+    /// unsafe {
+    ///     let len = v.len();
+    ///     v.set_len(len + 4);
+    /// }
+    ///
+    /// assert_eq!(&v, &[1, 1, 2, 4, 8, 12, 16]);
+    /// ```
+    #[unstable(feature = "vec_split_at_spare", issue = "81944")]
+    #[inline]
+    pub fn split_at_spare_mut(&mut self) -> (&mut [T], &mut [MaybeUninit<T>]) {
+        // SAFETY:
+        // - len is ignored and so never changed
+        let (init, spare, _) = unsafe { self.split_at_spare_mut_with_len() };
+        (init, spare)
+    }
+
+    /// Safety: changing returned .2 (&mut usize) is considered the same as calling `.set_len(_)`.
+    ///
+    /// This method provides unique access to all vec parts at once in `extend_from_within`.
+    unsafe fn split_at_spare_mut_with_len(
+        &mut self,
+    ) -> (&mut [T], &mut [MaybeUninit<T>], &mut usize) {
+        let ptr = self.as_mut_ptr();
+        // SAFETY:
+        // - `ptr` is guaranteed to be valid for `self.len` elements
+        // - but the allocation extends out to `self.buf.capacity()` elements, possibly
+        // uninitialized
+        let spare_ptr = unsafe { ptr.add(self.len) };
+        let spare_ptr = spare_ptr.cast::<MaybeUninit<T>>();
+        let spare_len = self.buf.capacity() - self.len;
+
+        // SAFETY:
+        // - `ptr` is guaranteed to be valid for `self.len` elements
+        // - `spare_ptr` is pointing one element past the buffer, so it doesn't overlap with `initialized`
+        unsafe {
+            let initialized = slice::from_raw_parts_mut(ptr, self.len);
+            let spare = slice::from_raw_parts_mut(spare_ptr, spare_len);
+
+            (initialized, spare, &mut self.len)
+        }
+    }
+}
+
+impl<T: Clone, A: Allocator> Vec<T, A> {
+    /// Resizes the `Vec` in-place so that `len` is equal to `new_len`.
+    ///
+    /// If `new_len` is greater than `len`, the `Vec` is extended by the
+    /// difference, with each additional slot filled with `value`.
+    /// If `new_len` is less than `len`, the `Vec` is simply truncated.
+    ///
+    /// This method requires `T` to implement [`Clone`],
+    /// in order to be able to clone the passed value.
+    /// If you need more flexibility (or want to rely on [`Default`] instead of
+    /// [`Clone`]), use [`Vec::resize_with`].
+    /// If you only need to resize to a smaller size, use [`Vec::truncate`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec!["hello"];
+    /// vec.resize(3, "world");
+    /// assert_eq!(vec, ["hello", "world", "world"]);
+    ///
+    /// let mut vec = vec![1, 2, 3, 4];
+    /// vec.resize(2, 0);
+    /// assert_eq!(vec, [1, 2]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "vec_resize", since = "1.5.0")]
+    pub fn resize(&mut self, new_len: usize, value: T) {
+        let len = self.len();
+
+        if new_len > len {
+            self.extend_with(new_len - len, ExtendElement(value))
+        } else {
+            self.truncate(new_len);
+        }
+    }
+
+    /// Clones and appends all elements in a slice to the `Vec`.
+    ///
+    /// Iterates over the slice `other`, clones each element, and then appends
+    /// it to this `Vec`. The `other` slice is traversed in-order.
+    ///
+    /// Note that this function is same as [`extend`] except that it is
+    /// specialized to work with slices instead. If and when Rust gets
+    /// specialization this function will likely be deprecated (but still
+    /// available).
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1];
+    /// vec.extend_from_slice(&[2, 3, 4]);
+    /// assert_eq!(vec, [1, 2, 3, 4]);
+    /// ```
+    ///
+    /// [`extend`]: Vec::extend
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "vec_extend_from_slice", since = "1.6.0")]
+    pub fn extend_from_slice(&mut self, other: &[T]) {
+        self.spec_extend(other.iter())
+    }
+
+    /// Copies elements from `src` range to the end of the vector.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the starting point is greater than the end point or if
+    /// the end point is greater than the length of the vector.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![0, 1, 2, 3, 4];
+    ///
+    /// vec.extend_from_within(2..);
+    /// assert_eq!(vec, [0, 1, 2, 3, 4, 2, 3, 4]);
+    ///
+    /// vec.extend_from_within(..2);
+    /// assert_eq!(vec, [0, 1, 2, 3, 4, 2, 3, 4, 0, 1]);
+    ///
+    /// vec.extend_from_within(4..8);
+    /// assert_eq!(vec, [0, 1, 2, 3, 4, 2, 3, 4, 0, 1, 4, 2, 3, 4]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "vec_extend_from_within", since = "1.53.0")]
+    pub fn extend_from_within<R>(&mut self, src: R)
+    where
+        R: RangeBounds<usize>,
+    {
+        let range = slice::range(src, ..self.len());
+        self.reserve(range.len());
+
+        // SAFETY:
+        // - `slice::range` guarantees  that the given range is valid for indexing self
+        unsafe {
+            self.spec_extend_from_within(range);
+        }
+    }
+}
+
+impl<T, A: Allocator, const N: usize> Vec<[T; N], A> {
+    /// Takes a `Vec<[T; N]>` and flattens it into a `Vec<T>`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the length of the resulting vector would overflow a `usize`.
+    ///
+    /// This is only possible when flattening a vector of arrays of zero-sized
+    /// types, and thus tends to be irrelevant in practice. If
+    /// `size_of::<T>() > 0`, this will never panic.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(slice_flatten)]
+    ///
+    /// let mut vec = vec![[1, 2, 3], [4, 5, 6], [7, 8, 9]];
+    /// assert_eq!(vec.pop(), Some([7, 8, 9]));
+    ///
+    /// let mut flattened = vec.into_flattened();
+    /// assert_eq!(flattened.pop(), Some(6));
+    /// ```
+    #[unstable(feature = "slice_flatten", issue = "95629")]
+    pub fn into_flattened(self) -> Vec<T, A> {
+        let (ptr, len, cap, alloc) = self.into_raw_parts_with_alloc();
+        let (new_len, new_cap) = if mem::size_of::<T>() == 0 {
+            (len.checked_mul(N).expect("vec len overflow"), usize::MAX)
+        } else {
+            // SAFETY:
+            // - `cap * N` cannot overflow because the allocation is already in
+            // the address space.
+            // - Each `[T; N]` has `N` valid elements, so there are `len * N`
+            // valid elements in the allocation.
+            unsafe { (len.unchecked_mul(N), cap.unchecked_mul(N)) }
+        };
+        // SAFETY:
+        // - `ptr` was allocated by `self`
+        // - `ptr` is well-aligned because `[T; N]` has the same alignment as `T`.
+        // - `new_cap` refers to the same sized allocation as `cap` because
+        // `new_cap * size_of::<T>()` == `cap * size_of::<[T; N]>()`
+        // - `len` <= `cap`, so `len * N` <= `cap * N`.
+        unsafe { Vec::<T, A>::from_raw_parts_in(ptr.cast(), new_len, new_cap, alloc) }
+    }
+}
+
+// This code generalizes `extend_with_{element,default}`.
+trait ExtendWith<T> {
+    fn next(&mut self) -> T;
+    fn last(self) -> T;
+}
+
+struct ExtendElement<T>(T);
+impl<T: Clone> ExtendWith<T> for ExtendElement<T> {
+    fn next(&mut self) -> T {
+        self.0.clone()
+    }
+    fn last(self) -> T {
+        self.0
+    }
+}
+
+struct ExtendFunc<F>(F);
+impl<T, F: FnMut() -> T> ExtendWith<T> for ExtendFunc<F> {
+    fn next(&mut self) -> T {
+        (self.0)()
+    }
+    fn last(mut self) -> T {
+        (self.0)()
+    }
+}
+
+impl<T, A: Allocator> Vec<T, A> {
+    #[cfg(not(no_global_oom_handling))]
+    /// Extend the vector by `n` values, using the given generator.
+    fn extend_with<E: ExtendWith<T>>(&mut self, n: usize, mut value: E) {
+        self.reserve(n);
+
+        unsafe {
+            let mut ptr = self.as_mut_ptr().add(self.len());
+            // Use SetLenOnDrop to work around bug where compiler
+            // might not realize the store through `ptr` through self.set_len()
+            // don't alias.
+            let mut local_len = SetLenOnDrop::new(&mut self.len);
+
+            // Write all elements except the last one
+            for _ in 1..n {
+                ptr::write(ptr, value.next());
+                ptr = ptr.offset(1);
+                // Increment the length in every step in case next() panics
+                local_len.increment_len(1);
+            }
+
+            if n > 0 {
+                // We can write the last element directly without cloning needlessly
+                ptr::write(ptr, value.last());
+                local_len.increment_len(1);
+            }
+
+            // len set by scope guard
+        }
+    }
+}
+
+impl<T: PartialEq, A: Allocator> Vec<T, A> {
+    /// Removes consecutive repeated elements in the vector according to the
+    /// [`PartialEq`] trait implementation.
+    ///
+    /// If the vector is sorted, this removes all duplicates.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 2, 3, 2];
+    ///
+    /// vec.dedup();
+    ///
+    /// assert_eq!(vec, [1, 2, 3, 2]);
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[inline]
+    pub fn dedup(&mut self) {
+        self.dedup_by(|a, b| a == b)
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Internal methods and functions
+////////////////////////////////////////////////////////////////////////////////
+
+#[doc(hidden)]
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+pub fn from_elem<T: Clone>(elem: T, n: usize) -> Vec<T> {
+    <T as SpecFromElem>::from_elem(elem, n, Global)
+}
+
+#[doc(hidden)]
+#[cfg(not(no_global_oom_handling))]
+#[unstable(feature = "allocator_api", issue = "32838")]
+pub fn from_elem_in<T: Clone, A: Allocator>(elem: T, n: usize, alloc: A) -> Vec<T, A> {
+    <T as SpecFromElem>::from_elem(elem, n, alloc)
+}
+
+trait ExtendFromWithinSpec {
+    /// # Safety
+    ///
+    /// - `src` needs to be valid index
+    /// - `self.capacity() - self.len()` must be `>= src.len()`
+    unsafe fn spec_extend_from_within(&mut self, src: Range<usize>);
+}
+
+impl<T: Clone, A: Allocator> ExtendFromWithinSpec for Vec<T, A> {
+    default unsafe fn spec_extend_from_within(&mut self, src: Range<usize>) {
+        // SAFETY:
+        // - len is increased only after initializing elements
+        let (this, spare, len) = unsafe { self.split_at_spare_mut_with_len() };
+
+        // SAFETY:
+        // - caller guaratees that src is a valid index
+        let to_clone = unsafe { this.get_unchecked(src) };
+
+        iter::zip(to_clone, spare)
+            .map(|(src, dst)| dst.write(src.clone()))
+            // Note:
+            // - Element was just initialized with `MaybeUninit::write`, so it's ok to increase len
+            // - len is increased after each element to prevent leaks (see issue #82533)
+            .for_each(|_| *len += 1);
+    }
+}
+
+impl<T: Copy, A: Allocator> ExtendFromWithinSpec for Vec<T, A> {
+    unsafe fn spec_extend_from_within(&mut self, src: Range<usize>) {
+        let count = src.len();
+        {
+            let (init, spare) = self.split_at_spare_mut();
+
+            // SAFETY:
+            // - caller guaratees that `src` is a valid index
+            let source = unsafe { init.get_unchecked(src) };
+
+            // SAFETY:
+            // - Both pointers are created from unique slice references (`&mut [_]`)
+            //   so they are valid and do not overlap.
+            // - Elements are :Copy so it's OK to to copy them, without doing
+            //   anything with the original values
+            // - `count` is equal to the len of `source`, so source is valid for
+            //   `count` reads
+            // - `.reserve(count)` guarantees that `spare.len() >= count` so spare
+            //   is valid for `count` writes
+            unsafe { ptr::copy_nonoverlapping(source.as_ptr(), spare.as_mut_ptr() as _, count) };
+        }
+
+        // SAFETY:
+        // - The elements were just initialized by `copy_nonoverlapping`
+        self.len += count;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Common trait implementations for Vec
+////////////////////////////////////////////////////////////////////////////////
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> ops::Deref for Vec<T, A> {
+    type Target = [T];
+
+    fn deref(&self) -> &[T] {
+        unsafe { slice::from_raw_parts(self.as_ptr(), self.len) }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> ops::DerefMut for Vec<T, A> {
+    fn deref_mut(&mut self) -> &mut [T] {
+        unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) }
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+trait SpecCloneFrom {
+    fn clone_from(this: &mut Self, other: &Self);
+}
+
+#[cfg(not(no_global_oom_handling))]
+impl<T: Clone, A: Allocator> SpecCloneFrom for Vec<T, A> {
+    default fn clone_from(this: &mut Self, other: &Self) {
+        // drop anything that will not be overwritten
+        this.truncate(other.len());
+
+        // self.len <= other.len due to the truncate above, so the
+        // slices here are always in-bounds.
+        let (init, tail) = other.split_at(this.len());
+
+        // reuse the contained values' allocations/resources.
+        this.clone_from_slice(init);
+        this.extend_from_slice(tail);
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+impl<T: Copy, A: Allocator> SpecCloneFrom for Vec<T, A> {
+    fn clone_from(this: &mut Self, other: &Self) {
+        this.clear();
+        this.extend_from_slice(other);
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Clone, A: Allocator + Clone> Clone for Vec<T, A> {
+    #[cfg(not(test))]
+    fn clone(&self) -> Self {
+        let alloc = self.allocator().clone();
+        <[T]>::to_vec_in(&**self, alloc)
+    }
+
+    // HACK(japaric): with cfg(test) the inherent `[T]::to_vec` method, which is
+    // required for this method definition, is not available. Instead use the
+    // `slice::to_vec`  function which is only available with cfg(test)
+    // NB see the slice::hack module in slice.rs for more information
+    #[cfg(test)]
+    fn clone(&self) -> Self {
+        let alloc = self.allocator().clone();
+        crate::slice::to_vec(&**self, alloc)
+    }
+
+    fn clone_from(&mut self, other: &Self) {
+        SpecCloneFrom::clone_from(self, other)
+    }
+}
+
+/// The hash of a vector is the same as that of the corresponding slice,
+/// as required by the `core::borrow::Borrow` implementation.
+///
+/// ```
+/// #![feature(build_hasher_simple_hash_one)]
+/// use std::hash::BuildHasher;
+///
+/// let b = std::collections::hash_map::RandomState::new();
+/// let v: Vec<u8> = vec![0xa8, 0x3c, 0x09];
+/// let s: &[u8] = &[0xa8, 0x3c, 0x09];
+/// assert_eq!(b.hash_one(v), b.hash_one(s));
+/// ```
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Hash, A: Allocator> Hash for Vec<T, A> {
+    #[inline]
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        Hash::hash(&**self, state)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_on_unimplemented(
+    message = "vector indices are of type `usize` or ranges of `usize`",
+    label = "vector indices are of type `usize` or ranges of `usize`"
+)]
+impl<T, I: SliceIndex<[T]>, A: Allocator> Index<I> for Vec<T, A> {
+    type Output = I::Output;
+
+    #[inline]
+    fn index(&self, index: I) -> &Self::Output {
+        Index::index(&**self, index)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_on_unimplemented(
+    message = "vector indices are of type `usize` or ranges of `usize`",
+    label = "vector indices are of type `usize` or ranges of `usize`"
+)]
+impl<T, I: SliceIndex<[T]>, A: Allocator> IndexMut<I> for Vec<T, A> {
+    #[inline]
+    fn index_mut(&mut self, index: I) -> &mut Self::Output {
+        IndexMut::index_mut(&mut **self, index)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T> FromIterator<T> for Vec<T> {
+    #[inline]
+    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Vec<T> {
+        <Self as SpecFromIter<T, I::IntoIter>>::from_iter(iter.into_iter())
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> IntoIterator for Vec<T, A> {
+    type Item = T;
+    type IntoIter = IntoIter<T, A>;
+
+    /// Creates a consuming iterator, that is, one that moves each value out of
+    /// the vector (from start to end). The vector cannot be used after calling
+    /// this.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let v = vec!["a".to_string(), "b".to_string()];
+    /// for s in v.into_iter() {
+    ///     // s has type String, not &String
+    ///     println!("{s}");
+    /// }
+    /// ```
+    #[inline]
+    fn into_iter(self) -> IntoIter<T, A> {
+        unsafe {
+            let mut me = ManuallyDrop::new(self);
+            let alloc = ManuallyDrop::new(ptr::read(me.allocator()));
+            let begin = me.as_mut_ptr();
+            let end = if mem::size_of::<T>() == 0 {
+                arith_offset(begin as *const i8, me.len() as isize) as *const T
+            } else {
+                begin.add(me.len()) as *const T
+            };
+            let cap = me.buf.capacity();
+            IntoIter {
+                buf: NonNull::new_unchecked(begin),
+                phantom: PhantomData,
+                cap,
+                alloc,
+                ptr: begin,
+                end,
+            }
+        }
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<'a, T, A: Allocator> IntoIterator for &'a Vec<T, A> {
+    type Item = &'a T;
+    type IntoIter = slice::Iter<'a, T>;
+
+    fn into_iter(self) -> slice::Iter<'a, T> {
+        self.iter()
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<'a, T, A: Allocator> IntoIterator for &'a mut Vec<T, A> {
+    type Item = &'a mut T;
+    type IntoIter = slice::IterMut<'a, T>;
+
+    fn into_iter(self) -> slice::IterMut<'a, T> {
+        self.iter_mut()
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> Extend<T> for Vec<T, A> {
+    #[inline]
+    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
+        <Self as SpecExtend<T, I::IntoIter>>::spec_extend(self, iter.into_iter())
+    }
+
+    #[inline]
+    fn extend_one(&mut self, item: T) {
+        self.push(item);
+    }
+
+    #[inline]
+    fn extend_reserve(&mut self, additional: usize) {
+        self.reserve(additional);
+    }
+}
+
+impl<T, A: Allocator> Vec<T, A> {
+    // leaf method to which various SpecFrom/SpecExtend implementations delegate when
+    // they have no further optimizations to apply
+    #[cfg(not(no_global_oom_handling))]
+    fn extend_desugared<I: Iterator<Item = T>>(&mut self, mut iterator: I) {
+        // This is the case for a general iterator.
+        //
+        // This function should be the moral equivalent of:
+        //
+        //      for item in iterator {
+        //          self.push(item);
+        //      }
+        while let Some(element) = iterator.next() {
+            let len = self.len();
+            if len == self.capacity() {
+                let (lower, _) = iterator.size_hint();
+                self.reserve(lower.saturating_add(1));
+            }
+            unsafe {
+                ptr::write(self.as_mut_ptr().add(len), element);
+                // Since next() executes user code which can panic we have to bump the length
+                // after each step.
+                // NB can't overflow since we would have had to alloc the address space
+                self.set_len(len + 1);
+            }
+        }
+    }
+
+    /// Creates a splicing iterator that replaces the specified range in the vector
+    /// with the given `replace_with` iterator and yields the removed items.
+    /// `replace_with` does not need to be the same length as `range`.
+    ///
+    /// `range` is removed even if the iterator is not consumed until the end.
+    ///
+    /// It is unspecified how many elements are removed from the vector
+    /// if the `Splice` value is leaked.
+    ///
+    /// The input iterator `replace_with` is only consumed when the `Splice` value is dropped.
+    ///
+    /// This is optimal if:
+    ///
+    /// * The tail (elements in the vector after `range`) is empty,
+    /// * or `replace_with` yields fewer or equal elements than `range`’s length
+    /// * or the lower bound of its `size_hint()` is exact.
+    ///
+    /// Otherwise, a temporary vector is allocated and the tail is moved twice.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the starting point is greater than the end point or if
+    /// the end point is greater than the length of the vector.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec![1, 2, 3, 4];
+    /// let new = [7, 8, 9];
+    /// let u: Vec<_> = v.splice(1..3, new).collect();
+    /// assert_eq!(v, &[1, 7, 8, 9, 4]);
+    /// assert_eq!(u, &[2, 3]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[stable(feature = "vec_splice", since = "1.21.0")]
+    pub fn splice<R, I>(&mut self, range: R, replace_with: I) -> Splice<'_, I::IntoIter, A>
+    where
+        R: RangeBounds<usize>,
+        I: IntoIterator<Item = T>,
+    {
+        Splice { drain: self.drain(range), replace_with: replace_with.into_iter() }
+    }
+
+    /// Creates an iterator which uses a closure to determine if an element should be removed.
+    ///
+    /// If the closure returns true, then the element is removed and yielded.
+    /// If the closure returns false, the element will remain in the vector and will not be yielded
+    /// by the iterator.
+    ///
+    /// Using this method is equivalent to the following code:
+    ///
+    /// ```
+    /// # let some_predicate = |x: &mut i32| { *x == 2 || *x == 3 || *x == 6 };
+    /// # let mut vec = vec![1, 2, 3, 4, 5, 6];
+    /// let mut i = 0;
+    /// while i < vec.len() {
+    ///     if some_predicate(&mut vec[i]) {
+    ///         let val = vec.remove(i);
+    ///         // your code here
+    ///     } else {
+    ///         i += 1;
+    ///     }
+    /// }
+    ///
+    /// # assert_eq!(vec, vec![1, 4, 5]);
+    /// ```
+    ///
+    /// But `drain_filter` is easier to use. `drain_filter` is also more efficient,
+    /// because it can backshift the elements of the array in bulk.
+    ///
+    /// Note that `drain_filter` also lets you mutate every element in the filter closure,
+    /// regardless of whether you choose to keep or remove it.
+    ///
+    /// # Examples
+    ///
+    /// Splitting an array into evens and odds, reusing the original allocation:
+    ///
+    /// ```
+    /// #![feature(drain_filter)]
+    /// let mut numbers = vec![1, 2, 3, 4, 5, 6, 8, 9, 11, 13, 14, 15];
+    ///
+    /// let evens = numbers.drain_filter(|x| *x % 2 == 0).collect::<Vec<_>>();
+    /// let odds = numbers;
+    ///
+    /// assert_eq!(evens, vec![2, 4, 6, 8, 14]);
+    /// assert_eq!(odds, vec![1, 3, 5, 9, 11, 13, 15]);
+    /// ```
+    #[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")]
+    pub fn drain_filter<F>(&mut self, filter: F) -> DrainFilter<'_, T, F, A>
+    where
+        F: FnMut(&mut T) -> bool,
+    {
+        let old_len = self.len();
+
+        // Guard against us getting leaked (leak amplification)
+        unsafe {
+            self.set_len(0);
+        }
+
+        DrainFilter { vec: self, idx: 0, del: 0, old_len, pred: filter, panic_flag: false }
+    }
+}
+
+/// Extend implementation that copies elements out of references before pushing them onto the Vec.
+///
+/// This implementation is specialized for slice iterators, where it uses [`copy_from_slice`] to
+/// append the entire slice at once.
+///
+/// [`copy_from_slice`]: slice::copy_from_slice
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "extend_ref", since = "1.2.0")]
+impl<'a, T: Copy + 'a, A: Allocator + 'a> Extend<&'a T> for Vec<T, A> {
+    fn extend<I: IntoIterator<Item = &'a T>>(&mut self, iter: I) {
+        self.spec_extend(iter.into_iter())
+    }
+
+    #[inline]
+    fn extend_one(&mut self, &item: &'a T) {
+        self.push(item);
+    }
+
+    #[inline]
+    fn extend_reserve(&mut self, additional: usize) {
+        self.reserve(additional);
+    }
+}
+
+/// Implements comparison of vectors, [lexicographically](core::cmp::Ord#lexicographical-comparison).
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: PartialOrd, A: Allocator> PartialOrd for Vec<T, A> {
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        PartialOrd::partial_cmp(&**self, &**other)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Eq, A: Allocator> Eq for Vec<T, A> {}
+
+/// Implements ordering of vectors, [lexicographically](core::cmp::Ord#lexicographical-comparison).
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Ord, A: Allocator> Ord for Vec<T, A> {
+    #[inline]
+    fn cmp(&self, other: &Self) -> Ordering {
+        Ord::cmp(&**self, &**other)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+unsafe impl<#[may_dangle] T, A: Allocator> Drop for Vec<T, A> {
+    fn drop(&mut self) {
+        unsafe {
+            // use drop for [T]
+            // use a raw slice to refer to the elements of the vector as weakest necessary type;
+            // could avoid questions of validity in certain cases
+            ptr::drop_in_place(ptr::slice_from_raw_parts_mut(self.as_mut_ptr(), self.len))
+        }
+        // RawVec handles deallocation
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+#[rustc_const_unstable(feature = "const_default_impls", issue = "87864")]
+impl<T> const Default for Vec<T> {
+    /// Creates an empty `Vec<T>`.
+    fn default() -> Vec<T> {
+        Vec::new()
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: fmt::Debug, A: Allocator> fmt::Debug for Vec<T, A> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Debug::fmt(&**self, f)
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> AsRef<Vec<T, A>> for Vec<T, A> {
+    fn as_ref(&self) -> &Vec<T, A> {
+        self
+    }
+}
+
+#[stable(feature = "vec_as_mut", since = "1.5.0")]
+impl<T, A: Allocator> AsMut<Vec<T, A>> for Vec<T, A> {
+    fn as_mut(&mut self) -> &mut Vec<T, A> {
+        self
+    }
+}
+
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T, A: Allocator> AsRef<[T]> for Vec<T, A> {
+    fn as_ref(&self) -> &[T] {
+        self
+    }
+}
+
+#[stable(feature = "vec_as_mut", since = "1.5.0")]
+impl<T, A: Allocator> AsMut<[T]> for Vec<T, A> {
+    fn as_mut(&mut self) -> &mut [T] {
+        self
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl<T: Clone> From<&[T]> for Vec<T> {
+    /// Allocate a `Vec<T>` and fill it by cloning `s`'s items.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(Vec::from(&[1, 2, 3][..]), vec![1, 2, 3]);
+    /// ```
+    #[cfg(not(test))]
+    fn from(s: &[T]) -> Vec<T> {
+        s.to_vec()
+    }
+    #[cfg(test)]
+    fn from(s: &[T]) -> Vec<T> {
+        crate::slice::to_vec(s, Global)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "vec_from_mut", since = "1.19.0")]
+impl<T: Clone> From<&mut [T]> for Vec<T> {
+    /// Allocate a `Vec<T>` and fill it by cloning `s`'s items.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(Vec::from(&mut [1, 2, 3][..]), vec![1, 2, 3]);
+    /// ```
+    #[cfg(not(test))]
+    fn from(s: &mut [T]) -> Vec<T> {
+        s.to_vec()
+    }
+    #[cfg(test)]
+    fn from(s: &mut [T]) -> Vec<T> {
+        crate::slice::to_vec(s, Global)
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "vec_from_array", since = "1.44.0")]
+impl<T, const N: usize> From<[T; N]> for Vec<T> {
+    /// Allocate a `Vec<T>` and move `s`'s items into it.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(Vec::from([1, 2, 3]), vec![1, 2, 3]);
+    /// ```
+    #[cfg(not(test))]
+    fn from(s: [T; N]) -> Vec<T> {
+        <[T]>::into_vec(box s)
+    }
+
+    #[cfg(test)]
+    fn from(s: [T; N]) -> Vec<T> {
+        crate::slice::into_vec(box s)
+    }
+}
+
+#[stable(feature = "vec_from_cow_slice", since = "1.14.0")]
+impl<'a, T> From<Cow<'a, [T]>> for Vec<T>
+where
+    [T]: ToOwned<Owned = Vec<T>>,
+{
+    /// Convert a clone-on-write slice into a vector.
+    ///
+    /// If `s` already owns a `Vec<T>`, it will be returned directly.
+    /// If `s` is borrowing a slice, a new `Vec<T>` will be allocated and
+    /// filled by cloning `s`'s items into it.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// # use std::borrow::Cow;
+    /// let o: Cow<[i32]> = Cow::Owned(vec![1, 2, 3]);
+    /// let b: Cow<[i32]> = Cow::Borrowed(&[1, 2, 3]);
+    /// assert_eq!(Vec::from(o), Vec::from(b));
+    /// ```
+    fn from(s: Cow<'a, [T]>) -> Vec<T> {
+        s.into_owned()
+    }
+}
+
+// note: test pulls in libstd, which causes errors here
+#[cfg(not(test))]
+#[stable(feature = "vec_from_box", since = "1.18.0")]
+impl<T, A: Allocator> From<Box<[T], A>> for Vec<T, A> {
+    /// Convert a boxed slice into a vector by transferring ownership of
+    /// the existing heap allocation.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let b: Box<[i32]> = vec![1, 2, 3].into_boxed_slice();
+    /// assert_eq!(Vec::from(b), vec![1, 2, 3]);
+    /// ```
+    fn from(s: Box<[T], A>) -> Self {
+        s.into_vec()
+    }
+}
+
+// note: test pulls in libstd, which causes errors here
+#[cfg(not(no_global_oom_handling))]
+#[cfg(not(test))]
+#[stable(feature = "box_from_vec", since = "1.20.0")]
+impl<T, A: Allocator> From<Vec<T, A>> for Box<[T], A> {
+    /// Convert a vector into a boxed slice.
+    ///
+    /// If `v` has excess capacity, its items will be moved into a
+    /// newly-allocated buffer with exactly the right capacity.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(Box::from(vec![1, 2, 3]), vec![1, 2, 3].into_boxed_slice());
+    /// ```
+    fn from(v: Vec<T, A>) -> Self {
+        v.into_boxed_slice()
+    }
+}
+
+#[cfg(not(no_global_oom_handling))]
+#[stable(feature = "rust1", since = "1.0.0")]
+impl From<&str> for Vec<u8> {
+    /// Allocate a `Vec<u8>` and fill it with a UTF-8 string.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(Vec::from("123"), vec![b'1', b'2', b'3']);
+    /// ```
+    fn from(s: &str) -> Vec<u8> {
+        From::from(s.as_bytes())
+    }
+}
+
+#[stable(feature = "array_try_from_vec", since = "1.48.0")]
+impl<T, A: Allocator, const N: usize> TryFrom<Vec<T, A>> for [T; N] {
+    type Error = Vec<T, A>;
+
+    /// Gets the entire contents of the `Vec<T>` as an array,
+    /// if its size exactly matches that of the requested array.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// assert_eq!(vec![1, 2, 3].try_into(), Ok([1, 2, 3]));
+    /// assert_eq!(<Vec<i32>>::new().try_into(), Ok([]));
+    /// ```
+    ///
+    /// If the length doesn't match, the input comes back in `Err`:
+    /// ```
+    /// let r: Result<[i32; 4], _> = (0..10).collect::<Vec<_>>().try_into();
+    /// assert_eq!(r, Err(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]));
+    /// ```
+    ///
+    /// If you're fine with just getting a prefix of the `Vec<T>`,
+    /// you can call [`.truncate(N)`](Vec::truncate) first.
+    /// ```
+    /// let mut v = String::from("hello world").into_bytes();
+    /// v.sort();
+    /// v.truncate(2);
+    /// let [a, b]: [_; 2] = v.try_into().unwrap();
+    /// assert_eq!(a, b' ');
+    /// assert_eq!(b, b'd');
+    /// ```
+    fn try_from(mut vec: Vec<T, A>) -> Result<[T; N], Vec<T, A>> {
+        if vec.len() != N {
+            return Err(vec);
+        }
+
+        // SAFETY: `.set_len(0)` is always sound.
+        unsafe { vec.set_len(0) };
+
+        // SAFETY: A `Vec`'s pointer is always aligned properly, and
+        // the alignment the array needs is the same as the items.
+        // We checked earlier that we have sufficient items.
+        // The items will not double-drop as the `set_len`
+        // tells the `Vec` not to also drop them.
+        let array = unsafe { ptr::read(vec.as_ptr() as *const [T; N]) };
+        Ok(array)
+    }
+}
diff --git a/rust/alloc/vec/partial_eq.rs b/rust/alloc/vec/partial_eq.rs
new file mode 100644
index 000000000000..b0cf72577a1b
--- /dev/null
+++ b/rust/alloc/vec/partial_eq.rs
@@ -0,0 +1,47 @@
+use crate::alloc::Allocator;
+#[cfg(not(no_global_oom_handling))]
+use crate::borrow::Cow;
+
+use super::Vec;
+
+macro_rules! __impl_slice_eq1 {
+    ([$($vars:tt)*] $lhs:ty, $rhs:ty $(where $ty:ty: $bound:ident)?, #[$stability:meta]) => {
+        #[$stability]
+        impl<T, U, $($vars)*> PartialEq<$rhs> for $lhs
+        where
+            T: PartialEq<U>,
+            $($ty: $bound)?
+        {
+            #[inline]
+            fn eq(&self, other: &$rhs) -> bool { self[..] == other[..] }
+            #[inline]
+            fn ne(&self, other: &$rhs) -> bool { self[..] != other[..] }
+        }
+    }
+}
+
+__impl_slice_eq1! { [A1: Allocator, A2: Allocator] Vec<T, A1>, Vec<U, A2>, #[stable(feature = "rust1", since = "1.0.0")] }
+__impl_slice_eq1! { [A: Allocator] Vec<T, A>, &[U], #[stable(feature = "rust1", since = "1.0.0")] }
+__impl_slice_eq1! { [A: Allocator] Vec<T, A>, &mut [U], #[stable(feature = "rust1", since = "1.0.0")] }
+__impl_slice_eq1! { [A: Allocator] &[T], Vec<U, A>, #[stable(feature = "partialeq_vec_for_ref_slice", since = "1.46.0")] }
+__impl_slice_eq1! { [A: Allocator] &mut [T], Vec<U, A>, #[stable(feature = "partialeq_vec_for_ref_slice", since = "1.46.0")] }
+__impl_slice_eq1! { [A: Allocator] Vec<T, A>, [U], #[stable(feature = "partialeq_vec_for_slice", since = "1.48.0")]  }
+__impl_slice_eq1! { [A: Allocator] [T], Vec<U, A>, #[stable(feature = "partialeq_vec_for_slice", since = "1.48.0")]  }
+#[cfg(not(no_global_oom_handling))]
+__impl_slice_eq1! { [A: Allocator] Cow<'_, [T]>, Vec<U, A> where T: Clone, #[stable(feature = "rust1", since = "1.0.0")] }
+#[cfg(not(no_global_oom_handling))]
+__impl_slice_eq1! { [] Cow<'_, [T]>, &[U] where T: Clone, #[stable(feature = "rust1", since = "1.0.0")] }
+#[cfg(not(no_global_oom_handling))]
+__impl_slice_eq1! { [] Cow<'_, [T]>, &mut [U] where T: Clone, #[stable(feature = "rust1", since = "1.0.0")] }
+__impl_slice_eq1! { [A: Allocator, const N: usize] Vec<T, A>, [U; N], #[stable(feature = "rust1", since = "1.0.0")] }
+__impl_slice_eq1! { [A: Allocator, const N: usize] Vec<T, A>, &[U; N], #[stable(feature = "rust1", since = "1.0.0")] }
+
+// NOTE: some less important impls are omitted to reduce code bloat
+// FIXME(Centril): Reconsider this?
+//__impl_slice_eq1! { [const N: usize] Vec<A>, &mut [B; N], }
+//__impl_slice_eq1! { [const N: usize] [A; N], Vec<B>, }
+//__impl_slice_eq1! { [const N: usize] &[A; N], Vec<B>, }
+//__impl_slice_eq1! { [const N: usize] &mut [A; N], Vec<B>, }
+//__impl_slice_eq1! { [const N: usize] Cow<'a, [A]>, [B; N], }
+//__impl_slice_eq1! { [const N: usize] Cow<'a, [A]>, &[B; N], }
+//__impl_slice_eq1! { [const N: usize] Cow<'a, [A]>, &mut [B; N], }
-- 
cgit v1.2.3


From 057b8d2571071da05d06810ca70f26c6316f6ea7 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 17:02:21 +0200
Subject: rust: adapt `alloc` crate to the kernel

This customizes the subset of the Rust standard library `alloc` that
was just imported as-is, mainly by:

  - Adding SPDX license identifiers.

  - Skipping modules (e.g. `rc` and `sync`) via new `cfg`s.

  - Adding fallible (`try_*`) versions of existing infallible methods
    (i.e. returning a `Result` instead of panicking).

    Since the standard library requires stable/unstable attributes,
    these additions are annotated with:

        #[stable(feature = "kernel", since = "1.0.0")]

    Using "kernel" as the feature allows to have the additions
    clearly marked. The "1.0.0" version is just a placeholder.

    (At the moment, only one is needed, but in the future more
    fallible methods will be added).

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Matthew Bakhtiari <dev@mtbk.me>
Signed-off-by: Matthew Bakhtiari <dev@mtbk.me>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/alloc/README.md           | 33 +++++++++++++++++++++++++++++++++
 rust/alloc/alloc.rs            |  2 ++
 rust/alloc/borrow.rs           |  4 +++-
 rust/alloc/boxed.rs            |  4 ++++
 rust/alloc/collections/mod.rs  |  2 ++
 rust/alloc/lib.rs              |  8 ++++++++
 rust/alloc/raw_vec.rs          |  9 +++++++++
 rust/alloc/slice.rs            |  2 ++
 rust/alloc/vec/drain.rs        |  2 ++
 rust/alloc/vec/drain_filter.rs |  2 ++
 rust/alloc/vec/into_iter.rs    |  4 ++++
 rust/alloc/vec/is_zero.rs      |  2 ++
 rust/alloc/vec/mod.rs          | 25 +++++++++++++++++++++++++
 rust/alloc/vec/partial_eq.rs   |  2 ++
 14 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 rust/alloc/README.md

diff --git a/rust/alloc/README.md b/rust/alloc/README.md
new file mode 100644
index 000000000000..c89c753720b5
--- /dev/null
+++ b/rust/alloc/README.md
@@ -0,0 +1,33 @@
+# `alloc`
+
+These source files come from the Rust standard library, hosted in
+the <https://github.com/rust-lang/rust> repository, licensed under
+"Apache-2.0 OR MIT" and adapted for kernel use. For copyright details,
+see <https://github.com/rust-lang/rust/blob/master/COPYRIGHT>.
+
+Please note that these files should be kept as close as possible to
+upstream. In general, only additions should be performed (e.g. new
+methods). Eventually, changes should make it into upstream so that,
+at some point, this fork can be dropped from the kernel tree.
+
+
+## Rationale
+
+On one hand, kernel folks wanted to keep `alloc` in-tree to have more
+freedom in both workflow and actual features if actually needed
+(e.g. receiver types if we ended up using them), which is reasonable.
+
+On the other hand, Rust folks wanted to keep `alloc` as close as
+upstream as possible and avoid as much divergence as possible, which
+is also reasonable.
+
+We agreed on a middle-ground: we would keep a subset of `alloc`
+in-tree that would be as small and as close as possible to upstream.
+Then, upstream can start adding the functions that we add to `alloc`
+etc., until we reach a point where the kernel already knows exactly
+what it needs in `alloc` and all the new methods are merged into
+upstream, so that we can drop `alloc` from the kernel tree and go back
+to using the upstream one.
+
+By doing this, the kernel can go a bit faster now, and Rust can
+slowly incorporate and discuss the changes as needed.
diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs
index 6162b5c6d4c9..ca224a541770 100644
--- a/rust/alloc/alloc.rs
+++ b/rust/alloc/alloc.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! Memory allocation APIs
 
 #![stable(feature = "alloc_module", since = "1.28.0")]
diff --git a/rust/alloc/borrow.rs b/rust/alloc/borrow.rs
index cb4e438f8bea..dde4957200d4 100644
--- a/rust/alloc/borrow.rs
+++ b/rust/alloc/borrow.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! A module for working with borrowed data.
 
 #![stable(feature = "rust1", since = "1.0.0")]
@@ -11,7 +13,7 @@ use core::ops::{Add, AddAssign};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::borrow::{Borrow, BorrowMut};
 
-use crate::fmt;
+use core::fmt;
 #[cfg(not(no_global_oom_handling))]
 use crate::string::String;
 
diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs
index c07536f0d0ce..dcfe87b14f3a 100644
--- a/rust/alloc/boxed.rs
+++ b/rust/alloc/boxed.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! A pointer type for heap allocation.
 //!
 //! [`Box<T>`], casually referred to as a 'box', provides the simplest form of
@@ -163,9 +165,11 @@ use crate::str::from_boxed_utf8_unchecked;
 #[cfg(not(no_global_oom_handling))]
 use crate::vec::Vec;
 
+#[cfg(not(no_thin))]
 #[unstable(feature = "thin_box", issue = "92791")]
 pub use thin::ThinBox;
 
+#[cfg(not(no_thin))]
 mod thin;
 
 /// A pointer type for heap allocation.
diff --git a/rust/alloc/collections/mod.rs b/rust/alloc/collections/mod.rs
index 628a5b155673..1eec265b28f8 100644
--- a/rust/alloc/collections/mod.rs
+++ b/rust/alloc/collections/mod.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! Collection types.
 
 #![stable(feature = "rust1", since = "1.0.0")]
diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs
index fd21b3671182..233bcd5e4654 100644
--- a/rust/alloc/lib.rs
+++ b/rust/alloc/lib.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! # The Rust core allocation and collections library
 //!
 //! This library provides smart pointers and collections for managing
@@ -192,6 +194,7 @@ extern crate std;
 extern crate test;
 
 // Module with internal macros used by other modules (needs to be included before other modules).
+#[cfg(not(no_macros))]
 #[macro_use]
 mod macros;
 
@@ -216,11 +219,16 @@ pub mod borrow;
 pub mod collections;
 #[cfg(not(no_global_oom_handling))]
 pub mod ffi;
+#[cfg(not(no_fmt))]
 pub mod fmt;
+#[cfg(not(no_rc))]
 pub mod rc;
 pub mod slice;
+#[cfg(not(no_str))]
 pub mod str;
+#[cfg(not(no_string))]
 pub mod string;
+#[cfg(not(no_sync))]
 #[cfg(target_has_atomic = "ptr")]
 pub mod sync;
 #[cfg(all(not(no_global_oom_handling), target_has_atomic = "ptr"))]
diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs
index 4be5f6cf9ca5..daf5f2da7168 100644
--- a/rust/alloc/raw_vec.rs
+++ b/rust/alloc/raw_vec.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 #![unstable(feature = "raw_vec_internals", reason = "unstable const warnings", issue = "none")]
 
 use core::alloc::LayoutError;
@@ -307,6 +309,12 @@ impl<T, A: Allocator> RawVec<T, A> {
         }
     }
 
+    /// The same as `reserve_for_push`, but returns on errors instead of panicking or aborting.
+    #[inline(never)]
+    pub fn try_reserve_for_push(&mut self, len: usize) -> Result<(), TryReserveError> {
+        self.grow_amortized(len, 1)
+    }
+
     /// Ensures that the buffer contains at least enough space to hold `len +
     /// additional` elements. If it doesn't already, will reallocate the
     /// minimum possible amount of memory necessary. Generally this will be
@@ -421,6 +429,7 @@ impl<T, A: Allocator> RawVec<T, A> {
         Ok(())
     }
 
+    #[allow(dead_code)]
     fn shrink(&mut self, cap: usize) -> Result<(), TryReserveError> {
         assert!(cap <= self.capacity(), "Tried to shrink to a larger capacity");
 
diff --git a/rust/alloc/slice.rs b/rust/alloc/slice.rs
index 199b3c9d0290..e444e97fa145 100644
--- a/rust/alloc/slice.rs
+++ b/rust/alloc/slice.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! A dynamically-sized view into a contiguous sequence, `[T]`.
 //!
 //! *[See also the slice primitive type](slice).*
diff --git a/rust/alloc/vec/drain.rs b/rust/alloc/vec/drain.rs
index 5cdee0bd4da4..b6a5f98e4fcd 100644
--- a/rust/alloc/vec/drain.rs
+++ b/rust/alloc/vec/drain.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 use crate::alloc::{Allocator, Global};
 use core::fmt;
 use core::iter::{FusedIterator, TrustedLen};
diff --git a/rust/alloc/vec/drain_filter.rs b/rust/alloc/vec/drain_filter.rs
index 3c37c92ae44b..b04fce041622 100644
--- a/rust/alloc/vec/drain_filter.rs
+++ b/rust/alloc/vec/drain_filter.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 use crate::alloc::{Allocator, Global};
 use core::ptr::{self};
 use core::slice::{self};
diff --git a/rust/alloc/vec/into_iter.rs b/rust/alloc/vec/into_iter.rs
index 9b84a1d9b4b6..f7a50e76691e 100644
--- a/rust/alloc/vec/into_iter.rs
+++ b/rust/alloc/vec/into_iter.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 #[cfg(not(no_global_oom_handling))]
 use super::AsVecIntoIter;
 use crate::alloc::{Allocator, Global};
@@ -9,6 +11,7 @@ use core::iter::{
 };
 use core::marker::PhantomData;
 use core::mem::{self, ManuallyDrop};
+#[cfg(not(no_global_oom_handling))]
 use core::ops::Deref;
 use core::ptr::{self, NonNull};
 use core::slice::{self};
@@ -123,6 +126,7 @@ impl<T, A: Allocator> IntoIter<T, A> {
     }
 
     /// Forgets to Drop the remaining elements while still allowing the backing allocation to be freed.
+    #[allow(dead_code)]
     pub(crate) fn forget_remaining_elements(&mut self) {
         self.ptr = self.end;
     }
diff --git a/rust/alloc/vec/is_zero.rs b/rust/alloc/vec/is_zero.rs
index edf270db81d4..377f3d172777 100644
--- a/rust/alloc/vec/is_zero.rs
+++ b/rust/alloc/vec/is_zero.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 use crate::boxed::Box;
 
 #[rustc_specialization_trait]
diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs
index 3dc8a4fbba86..540787804cc2 100644
--- a/rust/alloc/vec/mod.rs
+++ b/rust/alloc/vec/mod.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 //! A contiguous growable array type with heap-allocated contents, written
 //! `Vec<T>`.
 //!
@@ -1739,6 +1741,29 @@ impl<T, A: Allocator> Vec<T, A> {
         }
     }
 
+    /// Tries to append an element to the back of a collection.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2];
+    /// vec.try_push(3).unwrap();
+    /// assert_eq!(vec, [1, 2, 3]);
+    /// ```
+    #[inline]
+    #[stable(feature = "kernel", since = "1.0.0")]
+    pub fn try_push(&mut self, value: T) -> Result<(), TryReserveError> {
+        if self.len == self.buf.capacity() {
+            self.buf.try_reserve_for_push(self.len)?;
+        }
+        unsafe {
+            let end = self.as_mut_ptr().add(self.len);
+            ptr::write(end, value);
+            self.len += 1;
+        }
+        Ok(())
+    }
+
     /// Removes the last element from a vector and returns it, or [`None`] if it
     /// is empty.
     ///
diff --git a/rust/alloc/vec/partial_eq.rs b/rust/alloc/vec/partial_eq.rs
index b0cf72577a1b..10ad4e492287 100644
--- a/rust/alloc/vec/partial_eq.rs
+++ b/rust/alloc/vec/partial_eq.rs
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
 use crate::alloc::Allocator;
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::Cow;
-- 
cgit v1.2.3


From db958dcf88e512c55595a30944467cb5bf3647fb Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 16:54:11 +0200
Subject: rust: add `compiler_builtins` crate

Rust provides `compiler_builtins` as a port of LLVM's `compiler-rt`.
Since we do not need the vast majority of them, we avoid the
dependency by providing our own crate.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Sven Van Asbroeck <thesven73@gmail.com>
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/compiler_builtins.rs | 63 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 rust/compiler_builtins.rs

diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs
new file mode 100644
index 000000000000..f8f39a3e6855
--- /dev/null
+++ b/rust/compiler_builtins.rs
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Our own `compiler_builtins`.
+//!
+//! Rust provides [`compiler_builtins`] as a port of LLVM's [`compiler-rt`].
+//! Since we do not need the vast majority of them, we avoid the dependency
+//! by providing this file.
+//!
+//! At the moment, some builtins are required that should not be. For instance,
+//! [`core`] has 128-bit integers functionality which we should not be compiling
+//! in. We will work with upstream [`core`] to provide feature flags to disable
+//! the parts we do not need. For the moment, we define them to [`panic!`] at
+//! runtime for simplicity to catch mistakes, instead of performing surgery
+//! on `core.o`.
+//!
+//! In any case, all these symbols are weakened to ensure we do not override
+//! those that may be provided by the rest of the kernel.
+//!
+//! [`compiler_builtins`]: https://github.com/rust-lang/compiler-builtins
+//! [`compiler-rt`]: https://compiler-rt.llvm.org/
+
+#![feature(compiler_builtins)]
+#![compiler_builtins]
+#![no_builtins]
+#![no_std]
+
+macro_rules! define_panicking_intrinsics(
+    ($reason: tt, { $($ident: ident, )* }) => {
+        $(
+            #[doc(hidden)]
+            #[no_mangle]
+            pub extern "C" fn $ident() {
+                panic!($reason);
+            }
+        )*
+    }
+);
+
+define_panicking_intrinsics!("`f32` should not be used", {
+    __eqsf2,
+    __gesf2,
+    __lesf2,
+    __nesf2,
+    __unordsf2,
+});
+
+define_panicking_intrinsics!("`f64` should not be used", {
+    __unorddf2,
+});
+
+define_panicking_intrinsics!("`i128` should not be used", {
+    __ashrti3,
+    __muloti4,
+    __multi3,
+});
+
+define_panicking_intrinsics!("`u128` should not be used", {
+    __ashlti3,
+    __lshrti3,
+    __udivmodti4,
+    __udivti3,
+    __umodti3,
+});
-- 
cgit v1.2.3


From 1fbde52bde73e5a4c90577bce935e966a1ba1387 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 16:56:17 +0200
Subject: rust: add `macros` crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This crate contains all the procedural macros ("proc macros")
shared by all the kernel.

Procedural macros allow to create syntax extensions. They run at
compile-time and can consume as well as produce Rust syntax.

For instance, the `module!` macro that is used by Rust modules
is implemented here. It allows to easily declare the equivalent
information to the `MODULE_*` macros in C modules, e.g.:

    module! {
        type: RustMinimal,
        name: b"rust_minimal",
        author: b"Rust for Linux Contributors",
        description: b"Rust minimal sample",
        license: b"GPL",
    }

Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Signed-off-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Sumera Priyadarsini <sylphrenadin@gmail.com>
Signed-off-by: Sumera Priyadarsini <sylphrenadin@gmail.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Matthew Bakhtiari <dev@mtbk.me>
Signed-off-by: Matthew Bakhtiari <dev@mtbk.me>
Co-developed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/macros/helpers.rs |  51 +++++++++
 rust/macros/lib.rs     |  72 +++++++++++++
 rust/macros/module.rs  | 282 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 rust/macros/helpers.rs
 create mode 100644 rust/macros/lib.rs
 create mode 100644 rust/macros/module.rs

diff --git a/rust/macros/helpers.rs b/rust/macros/helpers.rs
new file mode 100644
index 000000000000..cdc7dc6135d2
--- /dev/null
+++ b/rust/macros/helpers.rs
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use proc_macro::{token_stream, TokenTree};
+
+pub(crate) fn try_ident(it: &mut token_stream::IntoIter) -> Option<String> {
+    if let Some(TokenTree::Ident(ident)) = it.next() {
+        Some(ident.to_string())
+    } else {
+        None
+    }
+}
+
+pub(crate) fn try_literal(it: &mut token_stream::IntoIter) -> Option<String> {
+    if let Some(TokenTree::Literal(literal)) = it.next() {
+        Some(literal.to_string())
+    } else {
+        None
+    }
+}
+
+pub(crate) fn try_byte_string(it: &mut token_stream::IntoIter) -> Option<String> {
+    try_literal(it).and_then(|byte_string| {
+        if byte_string.starts_with("b\"") && byte_string.ends_with('\"') {
+            Some(byte_string[2..byte_string.len() - 1].to_string())
+        } else {
+            None
+        }
+    })
+}
+
+pub(crate) fn expect_ident(it: &mut token_stream::IntoIter) -> String {
+    try_ident(it).expect("Expected Ident")
+}
+
+pub(crate) fn expect_punct(it: &mut token_stream::IntoIter) -> char {
+    if let TokenTree::Punct(punct) = it.next().expect("Reached end of token stream for Punct") {
+        punct.as_char()
+    } else {
+        panic!("Expected Punct");
+    }
+}
+
+pub(crate) fn expect_byte_string(it: &mut token_stream::IntoIter) -> String {
+    try_byte_string(it).expect("Expected byte string")
+}
+
+pub(crate) fn expect_end(it: &mut token_stream::IntoIter) {
+    if it.next().is_some() {
+        panic!("Expected end");
+    }
+}
diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs
new file mode 100644
index 000000000000..91764bfb1f89
--- /dev/null
+++ b/rust/macros/lib.rs
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Crate for all kernel procedural macros.
+
+mod helpers;
+mod module;
+
+use proc_macro::TokenStream;
+
+/// Declares a kernel module.
+///
+/// The `type` argument should be a type which implements the [`Module`]
+/// trait. Also accepts various forms of kernel metadata.
+///
+/// C header: [`include/linux/moduleparam.h`](../../../include/linux/moduleparam.h)
+///
+/// [`Module`]: ../kernel/trait.Module.html
+///
+/// # Examples
+///
+/// ```ignore
+/// use kernel::prelude::*;
+///
+/// module!{
+///     type: MyModule,
+///     name: b"my_kernel_module",
+///     author: b"Rust for Linux Contributors",
+///     description: b"My very own kernel module!",
+///     license: b"GPL",
+///     params: {
+///        my_i32: i32 {
+///            default: 42,
+///            permissions: 0o000,
+///            description: b"Example of i32",
+///        },
+///        writeable_i32: i32 {
+///            default: 42,
+///            permissions: 0o644,
+///            description: b"Example of i32",
+///        },
+///    },
+/// }
+///
+/// struct MyModule;
+///
+/// impl kernel::Module for MyModule {
+///     fn init() -> Result<Self> {
+///         // If the parameter is writeable, then the kparam lock must be
+///         // taken to read the parameter:
+///         {
+///             let lock = THIS_MODULE.kernel_param_lock();
+///             pr_info!("i32 param is:  {}\n", writeable_i32.read(&lock));
+///         }
+///         // If the parameter is read only, it can be read without locking
+///         // the kernel parameters:
+///         pr_info!("i32 param is:  {}\n", my_i32.read());
+///         Ok(Self)
+///     }
+/// }
+/// ```
+///
+/// # Supported argument types
+///   - `type`: type which implements the [`Module`] trait (required).
+///   - `name`: byte array of the name of the kernel module (required).
+///   - `author`: byte array of the author of the kernel module.
+///   - `description`: byte array of the description of the kernel module.
+///   - `license`: byte array of the license of the kernel module (required).
+///   - `alias`: byte array of alias name of the kernel module.
+#[proc_macro]
+pub fn module(ts: TokenStream) -> TokenStream {
+    module::module(ts)
+}
diff --git a/rust/macros/module.rs b/rust/macros/module.rs
new file mode 100644
index 000000000000..186a5b8be23c
--- /dev/null
+++ b/rust/macros/module.rs
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0
+
+use crate::helpers::*;
+use proc_macro::{token_stream, Literal, TokenStream, TokenTree};
+use std::fmt::Write;
+
+struct ModInfoBuilder<'a> {
+    module: &'a str,
+    counter: usize,
+    buffer: String,
+}
+
+impl<'a> ModInfoBuilder<'a> {
+    fn new(module: &'a str) -> Self {
+        ModInfoBuilder {
+            module,
+            counter: 0,
+            buffer: String::new(),
+        }
+    }
+
+    fn emit_base(&mut self, field: &str, content: &str, builtin: bool) {
+        let string = if builtin {
+            // Built-in modules prefix their modinfo strings by `module.`.
+            format!(
+                "{module}.{field}={content}\0",
+                module = self.module,
+                field = field,
+                content = content
+            )
+        } else {
+            // Loadable modules' modinfo strings go as-is.
+            format!("{field}={content}\0", field = field, content = content)
+        };
+
+        write!(
+            &mut self.buffer,
+            "
+                {cfg}
+                #[doc(hidden)]
+                #[link_section = \".modinfo\"]
+                #[used]
+                pub static __{module}_{counter}: [u8; {length}] = *{string};
+            ",
+            cfg = if builtin {
+                "#[cfg(not(MODULE))]"
+            } else {
+                "#[cfg(MODULE)]"
+            },
+            module = self.module.to_uppercase(),
+            counter = self.counter,
+            length = string.len(),
+            string = Literal::byte_string(string.as_bytes()),
+        )
+        .unwrap();
+
+        self.counter += 1;
+    }
+
+    fn emit_only_builtin(&mut self, field: &str, content: &str) {
+        self.emit_base(field, content, true)
+    }
+
+    fn emit_only_loadable(&mut self, field: &str, content: &str) {
+        self.emit_base(field, content, false)
+    }
+
+    fn emit(&mut self, field: &str, content: &str) {
+        self.emit_only_builtin(field, content);
+        self.emit_only_loadable(field, content);
+    }
+}
+
+#[derive(Debug, Default)]
+struct ModuleInfo {
+    type_: String,
+    license: String,
+    name: String,
+    author: Option<String>,
+    description: Option<String>,
+    alias: Option<String>,
+}
+
+impl ModuleInfo {
+    fn parse(it: &mut token_stream::IntoIter) -> Self {
+        let mut info = ModuleInfo::default();
+
+        const EXPECTED_KEYS: &[&str] =
+            &["type", "name", "author", "description", "license", "alias"];
+        const REQUIRED_KEYS: &[&str] = &["type", "name", "license"];
+        let mut seen_keys = Vec::new();
+
+        loop {
+            let key = match it.next() {
+                Some(TokenTree::Ident(ident)) => ident.to_string(),
+                Some(_) => panic!("Expected Ident or end"),
+                None => break,
+            };
+
+            if seen_keys.contains(&key) {
+                panic!(
+                    "Duplicated key \"{}\". Keys can only be specified once.",
+                    key
+                );
+            }
+
+            assert_eq!(expect_punct(it), ':');
+
+            match key.as_str() {
+                "type" => info.type_ = expect_ident(it),
+                "name" => info.name = expect_byte_string(it),
+                "author" => info.author = Some(expect_byte_string(it)),
+                "description" => info.description = Some(expect_byte_string(it)),
+                "license" => info.license = expect_byte_string(it),
+                "alias" => info.alias = Some(expect_byte_string(it)),
+                _ => panic!(
+                    "Unknown key \"{}\". Valid keys are: {:?}.",
+                    key, EXPECTED_KEYS
+                ),
+            }
+
+            assert_eq!(expect_punct(it), ',');
+
+            seen_keys.push(key);
+        }
+
+        expect_end(it);
+
+        for key in REQUIRED_KEYS {
+            if !seen_keys.iter().any(|e| e == key) {
+                panic!("Missing required key \"{}\".", key);
+            }
+        }
+
+        let mut ordered_keys: Vec<&str> = Vec::new();
+        for key in EXPECTED_KEYS {
+            if seen_keys.iter().any(|e| e == key) {
+                ordered_keys.push(key);
+            }
+        }
+
+        if seen_keys != ordered_keys {
+            panic!(
+                "Keys are not ordered as expected. Order them like: {:?}.",
+                ordered_keys
+            );
+        }
+
+        info
+    }
+}
+
+pub(crate) fn module(ts: TokenStream) -> TokenStream {
+    let mut it = ts.into_iter();
+
+    let info = ModuleInfo::parse(&mut it);
+
+    let mut modinfo = ModInfoBuilder::new(info.name.as_ref());
+    if let Some(author) = info.author {
+        modinfo.emit("author", &author);
+    }
+    if let Some(description) = info.description {
+        modinfo.emit("description", &description);
+    }
+    modinfo.emit("license", &info.license);
+    if let Some(alias) = info.alias {
+        modinfo.emit("alias", &alias);
+    }
+
+    // Built-in modules also export the `file` modinfo string.
+    let file =
+        std::env::var("RUST_MODFILE").expect("Unable to fetch RUST_MODFILE environmental variable");
+    modinfo.emit_only_builtin("file", &file);
+
+    format!(
+        "
+            /// The module name.
+            ///
+            /// Used by the printing macros, e.g. [`info!`].
+            const __LOG_PREFIX: &[u8] = b\"{name}\\0\";
+
+            /// The \"Rust loadable module\" mark, for `scripts/is_rust_module.sh`.
+            //
+            // This may be best done another way later on, e.g. as a new modinfo
+            // key or a new section. For the moment, keep it simple.
+            #[cfg(MODULE)]
+            #[doc(hidden)]
+            #[used]
+            static __IS_RUST_MODULE: () = ();
+
+            static mut __MOD: Option<{type_}> = None;
+
+            // SAFETY: `__this_module` is constructed by the kernel at load time and will not be
+            // freed until the module is unloaded.
+            #[cfg(MODULE)]
+            static THIS_MODULE: kernel::ThisModule = unsafe {{
+                kernel::ThisModule::from_ptr(&kernel::bindings::__this_module as *const _ as *mut _)
+            }};
+            #[cfg(not(MODULE))]
+            static THIS_MODULE: kernel::ThisModule = unsafe {{
+                kernel::ThisModule::from_ptr(core::ptr::null_mut())
+            }};
+
+            // Loadable modules need to export the `{{init,cleanup}}_module` identifiers.
+            #[cfg(MODULE)]
+            #[doc(hidden)]
+            #[no_mangle]
+            pub extern \"C\" fn init_module() -> core::ffi::c_int {{
+                __init()
+            }}
+
+            #[cfg(MODULE)]
+            #[doc(hidden)]
+            #[no_mangle]
+            pub extern \"C\" fn cleanup_module() {{
+                __exit()
+            }}
+
+            // Built-in modules are initialized through an initcall pointer
+            // and the identifiers need to be unique.
+            #[cfg(not(MODULE))]
+            #[cfg(not(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS))]
+            #[doc(hidden)]
+            #[link_section = \"{initcall_section}\"]
+            #[used]
+            pub static __{name}_initcall: extern \"C\" fn() -> core::ffi::c_int = __{name}_init;
+
+            #[cfg(not(MODULE))]
+            #[cfg(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)]
+            core::arch::global_asm!(
+                r#\".section \"{initcall_section}\", \"a\"
+                __{name}_initcall:
+                    .long   __{name}_init - .
+                    .previous
+                \"#
+            );
+
+            #[cfg(not(MODULE))]
+            #[doc(hidden)]
+            #[no_mangle]
+            pub extern \"C\" fn __{name}_init() -> core::ffi::c_int {{
+                __init()
+            }}
+
+            #[cfg(not(MODULE))]
+            #[doc(hidden)]
+            #[no_mangle]
+            pub extern \"C\" fn __{name}_exit() {{
+                __exit()
+            }}
+
+            fn __init() -> core::ffi::c_int {{
+                match <{type_} as kernel::Module>::init(&THIS_MODULE) {{
+                    Ok(m) => {{
+                        unsafe {{
+                            __MOD = Some(m);
+                        }}
+                        return 0;
+                    }}
+                    Err(e) => {{
+                        return e.to_kernel_errno();
+                    }}
+                }}
+            }}
+
+            fn __exit() {{
+                unsafe {{
+                    // Invokes `drop()` on `__MOD`, which should be used for cleanup.
+                    __MOD = None;
+                }}
+            }}
+
+            {modinfo}
+        ",
+        type_ = info.type_,
+        name = info.name,
+        modinfo = modinfo.buffer,
+        initcall_section = ".initcall6.init"
+    )
+    .parse()
+    .expect("Error parsing formatted string into token stream.")
+}
-- 
cgit v1.2.3


From 8326ac05ee8841e625a6a54d225242eec579fa13 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Mon, 25 Jul 2022 23:46:47 +0200
Subject: rust: add `bindings` crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This crate contains the bindings to the C side of the kernel.

Calling C (in general, FFI) is assumed to be unsafe in Rust
and, in many cases, this is accurate. For instance, virtually
all C functions that take a pointer are unsafe since, typically,
it will be dereferenced at some point (and in most cases there
is no way for the callee to check its validity beforehand).

Since one of the goals of using Rust in the kernel is precisely
to avoid unsafe code in "leaf" kernel modules (e.g. drivers),
these bindings should not be used directly by them.

Instead, these bindings need to be wrapped into safe abstractions.
These abstractions provide a safe API that kernel modules can use.
In this way, unsafe code in kernel modules is minimized.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Sven Van Asbroeck <thesven73@gmail.com>
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Maciej Falkowski <m.falkowski@samsung.com>
Signed-off-by: Maciej Falkowski <m.falkowski@samsung.com>
Co-developed-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Co-developed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/bindings/bindings_helper.h | 13 ++++++++++
 rust/bindings/lib.rs            | 53 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 rust/bindings/bindings_helper.h
 create mode 100644 rust/bindings/lib.rs

diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h
new file mode 100644
index 000000000000..c48bc284214a
--- /dev/null
+++ b/rust/bindings/bindings_helper.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Header that contains the code (mostly headers) for which Rust bindings
+ * will be automatically generated by `bindgen`.
+ *
+ * Sorted alphabetically.
+ */
+
+#include <linux/slab.h>
+
+/* `bindgen` gets confused at certain things. */
+const gfp_t BINDINGS_GFP_KERNEL = GFP_KERNEL;
+const gfp_t BINDINGS___GFP_ZERO = __GFP_ZERO;
diff --git a/rust/bindings/lib.rs b/rust/bindings/lib.rs
new file mode 100644
index 000000000000..6c50ee62c56b
--- /dev/null
+++ b/rust/bindings/lib.rs
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Bindings.
+//!
+//! Imports the generated bindings by `bindgen`.
+//!
+//! This crate may not be directly used. If you need a kernel C API that is
+//! not ported or wrapped in the `kernel` crate, then do so first instead of
+//! using this crate.
+
+#![no_std]
+#![feature(core_ffi_c)]
+// See <https://github.com/rust-lang/rust-bindgen/issues/1651>.
+#![cfg_attr(test, allow(deref_nullptr))]
+#![cfg_attr(test, allow(unaligned_references))]
+#![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))]
+#![allow(
+    clippy::all,
+    missing_docs,
+    non_camel_case_types,
+    non_upper_case_globals,
+    non_snake_case,
+    improper_ctypes,
+    unreachable_pub,
+    unsafe_op_in_unsafe_fn
+)]
+
+mod bindings_raw {
+    // Use glob import here to expose all helpers.
+    // Symbols defined within the module will take precedence to the glob import.
+    pub use super::bindings_helper::*;
+    include!(concat!(
+        env!("OBJTREE"),
+        "/rust/bindings/bindings_generated.rs"
+    ));
+}
+
+// When both a directly exposed symbol and a helper exists for the same function,
+// the directly exposed symbol is preferred and the helper becomes dead code, so
+// ignore the warning here.
+#[allow(dead_code)]
+mod bindings_helper {
+    // Import the generated bindings for types.
+    include!(concat!(
+        env!("OBJTREE"),
+        "/rust/bindings/bindings_helpers_generated.rs"
+    ));
+}
+
+pub use bindings_raw::*;
+
+pub const GFP_KERNEL: gfp_t = BINDINGS_GFP_KERNEL;
+pub const __GFP_ZERO: gfp_t = BINDINGS___GFP_ZERO;
-- 
cgit v1.2.3


From 247b365dc8dc442412c88fe02346cd2bddac7bd6 Mon Sep 17 00:00:00 2001
From: Wedson Almeida Filho <wedsonaf@google.com>
Date: Fri, 11 Feb 2022 20:25:34 +0100
Subject: rust: add `kernel` crate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `kernel` crate currently includes all the abstractions that wrap
kernel features written in C.

These abstractions call the C side of the kernel via the generated
bindings with the `bindgen` tool. Modules developed in Rust should
never call the bindings themselves.

In the future, as the abstractions grow in number, we may need
to split this crate into several, possibly following a similar
subdivision in subsystems as the kernel itself and/or moving
the code to the actual subsystems.

Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Geoffrey Thomas <geofft@ldpreload.com>
Signed-off-by: Geoffrey Thomas <geofft@ldpreload.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Signed-off-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Co-developed-by: Sven Van Asbroeck <thesven73@gmail.com>
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Signed-off-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Fox Chen <foxhlchen@gmail.com>
Signed-off-by: Fox Chen <foxhlchen@gmail.com>
Co-developed-by: Viktor Garske <viktor@v-gar.de>
Signed-off-by: Viktor Garske <viktor@v-gar.de>
Co-developed-by: Dariusz Sosnowski <dsosnowski@dsosnowski.pl>
Signed-off-by: Dariusz Sosnowski <dsosnowski@dsosnowski.pl>
Co-developed-by: Léo Lanteri Thauvin <leseulartichaut@gmail.com>
Signed-off-by: Léo Lanteri Thauvin <leseulartichaut@gmail.com>
Co-developed-by: Niklas Mohrin <dev@niklasmohrin.de>
Signed-off-by: Niklas Mohrin <dev@niklasmohrin.de>
Co-developed-by: Milan Landaverde <milan@mdaverde.com>
Signed-off-by: Milan Landaverde <milan@mdaverde.com>
Co-developed-by: Morgan Bartlett <mjmouse9999@gmail.com>
Signed-off-by: Morgan Bartlett <mjmouse9999@gmail.com>
Co-developed-by: Maciej Falkowski <m.falkowski@samsung.com>
Signed-off-by: Maciej Falkowski <m.falkowski@samsung.com>
Co-developed-by: Nándor István Krácser <bonifaido@gmail.com>
Signed-off-by: Nándor István Krácser <bonifaido@gmail.com>
Co-developed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Co-developed-by: John Baublitz <john.m.baublitz@gmail.com>
Signed-off-by: John Baublitz <john.m.baublitz@gmail.com>
Co-developed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/kernel/allocator.rs |  64 +++++++++++++++
 rust/kernel/error.rs     |  59 ++++++++++++++
 rust/kernel/lib.rs       |  78 +++++++++++++++++++
 rust/kernel/prelude.rs   |  20 +++++
 rust/kernel/print.rs     | 198 +++++++++++++++++++++++++++++++++++++++++++++++
 rust/kernel/str.rs       |  72 +++++++++++++++++
 6 files changed, 491 insertions(+)
 create mode 100644 rust/kernel/allocator.rs
 create mode 100644 rust/kernel/error.rs
 create mode 100644 rust/kernel/lib.rs
 create mode 100644 rust/kernel/prelude.rs
 create mode 100644 rust/kernel/print.rs
 create mode 100644 rust/kernel/str.rs

diff --git a/rust/kernel/allocator.rs b/rust/kernel/allocator.rs
new file mode 100644
index 000000000000..397a3dd57a9b
--- /dev/null
+++ b/rust/kernel/allocator.rs
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Allocator support.
+
+use core::alloc::{GlobalAlloc, Layout};
+use core::ptr;
+
+use crate::bindings;
+
+struct KernelAllocator;
+
+unsafe impl GlobalAlloc for KernelAllocator {
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        // `krealloc()` is used instead of `kmalloc()` because the latter is
+        // an inline function and cannot be bound to as a result.
+        unsafe { bindings::krealloc(ptr::null(), layout.size(), bindings::GFP_KERNEL) as *mut u8 }
+    }
+
+    unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) {
+        unsafe {
+            bindings::kfree(ptr as *const core::ffi::c_void);
+        }
+    }
+}
+
+#[global_allocator]
+static ALLOCATOR: KernelAllocator = KernelAllocator;
+
+// `rustc` only generates these for some crate types. Even then, we would need
+// to extract the object file that has them from the archive. For the moment,
+// let's generate them ourselves instead.
+//
+// Note that `#[no_mangle]` implies exported too, nowadays.
+#[no_mangle]
+fn __rust_alloc(size: usize, _align: usize) -> *mut u8 {
+    unsafe { bindings::krealloc(core::ptr::null(), size, bindings::GFP_KERNEL) as *mut u8 }
+}
+
+#[no_mangle]
+fn __rust_dealloc(ptr: *mut u8, _size: usize, _align: usize) {
+    unsafe { bindings::kfree(ptr as *const core::ffi::c_void) };
+}
+
+#[no_mangle]
+fn __rust_realloc(ptr: *mut u8, _old_size: usize, _align: usize, new_size: usize) -> *mut u8 {
+    unsafe {
+        bindings::krealloc(
+            ptr as *const core::ffi::c_void,
+            new_size,
+            bindings::GFP_KERNEL,
+        ) as *mut u8
+    }
+}
+
+#[no_mangle]
+fn __rust_alloc_zeroed(size: usize, _align: usize) -> *mut u8 {
+    unsafe {
+        bindings::krealloc(
+            core::ptr::null(),
+            size,
+            bindings::GFP_KERNEL | bindings::__GFP_ZERO,
+        ) as *mut u8
+    }
+}
diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs
new file mode 100644
index 000000000000..466b2a8fe569
--- /dev/null
+++ b/rust/kernel/error.rs
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Kernel errors.
+//!
+//! C header: [`include/uapi/asm-generic/errno-base.h`](../../../include/uapi/asm-generic/errno-base.h)
+
+use alloc::collections::TryReserveError;
+
+/// Contains the C-compatible error codes.
+pub mod code {
+    /// Out of memory.
+    pub const ENOMEM: super::Error = super::Error(-(crate::bindings::ENOMEM as i32));
+}
+
+/// Generic integer kernel error.
+///
+/// The kernel defines a set of integer generic error codes based on C and
+/// POSIX ones. These codes may have a more specific meaning in some contexts.
+///
+/// # Invariants
+///
+/// The value is a valid `errno` (i.e. `>= -MAX_ERRNO && < 0`).
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub struct Error(core::ffi::c_int);
+
+impl Error {
+    /// Returns the kernel error code.
+    pub fn to_kernel_errno(self) -> core::ffi::c_int {
+        self.0
+    }
+}
+
+impl From<TryReserveError> for Error {
+    fn from(_: TryReserveError) -> Error {
+        code::ENOMEM
+    }
+}
+
+/// A [`Result`] with an [`Error`] error type.
+///
+/// To be used as the return type for functions that may fail.
+///
+/// # Error codes in C and Rust
+///
+/// In C, it is common that functions indicate success or failure through
+/// their return value; modifying or returning extra data through non-`const`
+/// pointer parameters. In particular, in the kernel, functions that may fail
+/// typically return an `int` that represents a generic error code. We model
+/// those as [`Error`].
+///
+/// In Rust, it is idiomatic to model functions that may fail as returning
+/// a [`Result`]. Since in the kernel many functions return an error code,
+/// [`Result`] is a type alias for a [`core::result::Result`] that uses
+/// [`Error`] as its error type.
+///
+/// Note that even if a function does not return anything when it succeeds,
+/// it should still be modeled as returning a `Result` rather than
+/// just an [`Error`].
+pub type Result<T = ()> = core::result::Result<T, Error>;
diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
new file mode 100644
index 000000000000..abd46261d385
--- /dev/null
+++ b/rust/kernel/lib.rs
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! The `kernel` crate.
+//!
+//! This crate contains the kernel APIs that have been ported or wrapped for
+//! usage by Rust code in the kernel and is shared by all of them.
+//!
+//! In other words, all the rest of the Rust code in the kernel (e.g. kernel
+//! modules written in Rust) depends on [`core`], [`alloc`] and this crate.
+//!
+//! If you need a kernel C API that is not ported or wrapped yet here, then
+//! do so first instead of bypassing this crate.
+
+#![no_std]
+#![feature(core_ffi_c)]
+
+// Ensure conditional compilation based on the kernel configuration works;
+// otherwise we may silently break things like initcall handling.
+#[cfg(not(CONFIG_RUST))]
+compile_error!("Missing kernel configuration for conditional compilation");
+
+#[cfg(not(test))]
+#[cfg(not(testlib))]
+mod allocator;
+pub mod error;
+pub mod prelude;
+pub mod print;
+pub mod str;
+
+#[doc(hidden)]
+pub use bindings;
+pub use macros;
+
+/// Prefix to appear before log messages printed from within the `kernel` crate.
+const __LOG_PREFIX: &[u8] = b"rust_kernel\0";
+
+/// The top level entrypoint to implementing a kernel module.
+///
+/// For any teardown or cleanup operations, your type may implement [`Drop`].
+pub trait Module: Sized + Sync {
+    /// Called at module initialization time.
+    ///
+    /// Use this method to perform whatever setup or registration your module
+    /// should do.
+    ///
+    /// Equivalent to the `module_init` macro in the C API.
+    fn init(module: &'static ThisModule) -> error::Result<Self>;
+}
+
+/// Equivalent to `THIS_MODULE` in the C API.
+///
+/// C header: `include/linux/export.h`
+pub struct ThisModule(*mut bindings::module);
+
+// SAFETY: `THIS_MODULE` may be used from all threads within a module.
+unsafe impl Sync for ThisModule {}
+
+impl ThisModule {
+    /// Creates a [`ThisModule`] given the `THIS_MODULE` pointer.
+    ///
+    /// # Safety
+    ///
+    /// The pointer must be equal to the right `THIS_MODULE`.
+    pub const unsafe fn from_ptr(ptr: *mut bindings::module) -> ThisModule {
+        ThisModule(ptr)
+    }
+}
+
+#[cfg(not(any(testlib, test)))]
+#[panic_handler]
+fn panic(info: &core::panic::PanicInfo<'_>) -> ! {
+    pr_emerg!("{}\n", info);
+    // SAFETY: FFI call.
+    unsafe { bindings::BUG() };
+    // Bindgen currently does not recognize `__noreturn` so `BUG` returns `()`
+    // instead of `!`. See <https://github.com/rust-lang/rust-bindgen/issues/2094>.
+    loop {}
+}
diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs
new file mode 100644
index 000000000000..495e22250726
--- /dev/null
+++ b/rust/kernel/prelude.rs
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! The `kernel` prelude.
+//!
+//! These are the most common items used by Rust code in the kernel,
+//! intended to be imported by all Rust code, for convenience.
+//!
+//! # Examples
+//!
+//! ```
+//! use kernel::prelude::*;
+//! ```
+
+pub use super::{
+    error::{Error, Result},
+    pr_emerg, pr_info, ThisModule,
+};
+pub use alloc::{boxed::Box, vec::Vec};
+pub use core::pin::Pin;
+pub use macros::module;
diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs
new file mode 100644
index 000000000000..55db5a1ba752
--- /dev/null
+++ b/rust/kernel/print.rs
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Printing facilities.
+//!
+//! C header: [`include/linux/printk.h`](../../../../include/linux/printk.h)
+//!
+//! Reference: <https://www.kernel.org/doc/html/latest/core-api/printk-basics.html>
+
+use core::{
+    ffi::{c_char, c_void},
+    fmt,
+};
+
+use crate::str::RawFormatter;
+
+#[cfg(CONFIG_PRINTK)]
+use crate::bindings;
+
+// Called from `vsprintf` with format specifier `%pA`.
+#[no_mangle]
+unsafe fn rust_fmt_argument(buf: *mut c_char, end: *mut c_char, ptr: *const c_void) -> *mut c_char {
+    use fmt::Write;
+    // SAFETY: The C contract guarantees that `buf` is valid if it's less than `end`.
+    let mut w = unsafe { RawFormatter::from_ptrs(buf.cast(), end.cast()) };
+    let _ = w.write_fmt(unsafe { *(ptr as *const fmt::Arguments<'_>) });
+    w.pos().cast()
+}
+
+/// Format strings.
+///
+/// Public but hidden since it should only be used from public macros.
+#[doc(hidden)]
+pub mod format_strings {
+    use crate::bindings;
+
+    /// The length we copy from the `KERN_*` kernel prefixes.
+    const LENGTH_PREFIX: usize = 2;
+
+    /// The length of the fixed format strings.
+    pub const LENGTH: usize = 10;
+
+    /// Generates a fixed format string for the kernel's [`_printk`].
+    ///
+    /// The format string is always the same for a given level, i.e. for a
+    /// given `prefix`, which are the kernel's `KERN_*` constants.
+    ///
+    /// [`_printk`]: ../../../../include/linux/printk.h
+    const fn generate(is_cont: bool, prefix: &[u8; 3]) -> [u8; LENGTH] {
+        // Ensure the `KERN_*` macros are what we expect.
+        assert!(prefix[0] == b'\x01');
+        if is_cont {
+            assert!(prefix[1] == b'c');
+        } else {
+            assert!(prefix[1] >= b'0' && prefix[1] <= b'7');
+        }
+        assert!(prefix[2] == b'\x00');
+
+        let suffix: &[u8; LENGTH - LENGTH_PREFIX] = if is_cont {
+            b"%pA\0\0\0\0\0"
+        } else {
+            b"%s: %pA\0"
+        };
+
+        [
+            prefix[0], prefix[1], suffix[0], suffix[1], suffix[2], suffix[3], suffix[4], suffix[5],
+            suffix[6], suffix[7],
+        ]
+    }
+
+    // Generate the format strings at compile-time.
+    //
+    // This avoids the compiler generating the contents on the fly in the stack.
+    //
+    // Furthermore, `static` instead of `const` is used to share the strings
+    // for all the kernel.
+    pub static EMERG: [u8; LENGTH] = generate(false, bindings::KERN_EMERG);
+    pub static INFO: [u8; LENGTH] = generate(false, bindings::KERN_INFO);
+}
+
+/// Prints a message via the kernel's [`_printk`].
+///
+/// Public but hidden since it should only be used from public macros.
+///
+/// # Safety
+///
+/// The format string must be one of the ones in [`format_strings`], and
+/// the module name must be null-terminated.
+///
+/// [`_printk`]: ../../../../include/linux/_printk.h
+#[doc(hidden)]
+#[cfg_attr(not(CONFIG_PRINTK), allow(unused_variables))]
+pub unsafe fn call_printk(
+    format_string: &[u8; format_strings::LENGTH],
+    module_name: &[u8],
+    args: fmt::Arguments<'_>,
+) {
+    // `_printk` does not seem to fail in any path.
+    #[cfg(CONFIG_PRINTK)]
+    unsafe {
+        bindings::_printk(
+            format_string.as_ptr() as _,
+            module_name.as_ptr(),
+            &args as *const _ as *const c_void,
+        );
+    }
+}
+
+/// Performs formatting and forwards the string to [`call_printk`].
+///
+/// Public but hidden since it should only be used from public macros.
+#[doc(hidden)]
+#[cfg(not(testlib))]
+#[macro_export]
+#[allow(clippy::crate_in_macro_def)]
+macro_rules! print_macro (
+    // The non-continuation cases (most of them, e.g. `INFO`).
+    ($format_string:path, $($arg:tt)+) => (
+        // SAFETY: This hidden macro should only be called by the documented
+        // printing macros which ensure the format string is one of the fixed
+        // ones. All `__LOG_PREFIX`s are null-terminated as they are generated
+        // by the `module!` proc macro or fixed values defined in a kernel
+        // crate.
+        unsafe {
+            $crate::print::call_printk(
+                &$format_string,
+                crate::__LOG_PREFIX,
+                format_args!($($arg)+),
+            );
+        }
+    );
+);
+
+/// Stub for doctests
+#[cfg(testlib)]
+#[macro_export]
+macro_rules! print_macro (
+    ($format_string:path, $e:expr, $($arg:tt)+) => (
+        ()
+    );
+);
+
+// We could use a macro to generate these macros. However, doing so ends
+// up being a bit ugly: it requires the dollar token trick to escape `$` as
+// well as playing with the `doc` attribute. Furthermore, they cannot be easily
+// imported in the prelude due to [1]. So, for the moment, we just write them
+// manually, like in the C side; while keeping most of the logic in another
+// macro, i.e. [`print_macro`].
+//
+// [1]: https://github.com/rust-lang/rust/issues/52234
+
+/// Prints an emergency-level message (level 0).
+///
+/// Use this level if the system is unusable.
+///
+/// Equivalent to the kernel's [`pr_emerg`] macro.
+///
+/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and
+/// `alloc::format!` for information about the formatting syntax.
+///
+/// [`pr_emerg`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_emerg
+/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html
+///
+/// # Examples
+///
+/// ```
+/// pr_emerg!("hello {}\n", "there");
+/// ```
+#[macro_export]
+macro_rules! pr_emerg (
+    ($($arg:tt)*) => (
+        $crate::print_macro!($crate::print::format_strings::EMERG, $($arg)*)
+    )
+);
+
+/// Prints an info-level message (level 6).
+///
+/// Use this level for informational messages.
+///
+/// Equivalent to the kernel's [`pr_info`] macro.
+///
+/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and
+/// `alloc::format!` for information about the formatting syntax.
+///
+/// [`pr_info`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_info
+/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html
+///
+/// # Examples
+///
+/// ```
+/// pr_info!("hello {}\n", "there");
+/// ```
+#[macro_export]
+#[doc(alias = "print")]
+macro_rules! pr_info (
+    ($($arg:tt)*) => (
+        $crate::print_macro!($crate::print::format_strings::INFO, $($arg)*)
+    )
+);
diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs
new file mode 100644
index 000000000000..e45ff220ae50
--- /dev/null
+++ b/rust/kernel/str.rs
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! String representations.
+
+use core::fmt;
+
+/// Allows formatting of [`fmt::Arguments`] into a raw buffer.
+///
+/// It does not fail if callers write past the end of the buffer so that they can calculate the
+/// size required to fit everything.
+///
+/// # Invariants
+///
+/// The memory region between `pos` (inclusive) and `end` (exclusive) is valid for writes if `pos`
+/// is less than `end`.
+pub(crate) struct RawFormatter {
+    // Use `usize` to use `saturating_*` functions.
+    #[allow(dead_code)]
+    beg: usize,
+    pos: usize,
+    end: usize,
+}
+
+impl RawFormatter {
+    /// Creates a new instance of [`RawFormatter`] with the given buffer pointers.
+    ///
+    /// # Safety
+    ///
+    /// If `pos` is less than `end`, then the region between `pos` (inclusive) and `end`
+    /// (exclusive) must be valid for writes for the lifetime of the returned [`RawFormatter`].
+    pub(crate) unsafe fn from_ptrs(pos: *mut u8, end: *mut u8) -> Self {
+        // INVARIANT: The safety requierments guarantee the type invariants.
+        Self {
+            beg: pos as _,
+            pos: pos as _,
+            end: end as _,
+        }
+    }
+
+    /// Returns the current insert position.
+    ///
+    /// N.B. It may point to invalid memory.
+    pub(crate) fn pos(&self) -> *mut u8 {
+        self.pos as _
+    }
+}
+
+impl fmt::Write for RawFormatter {
+    fn write_str(&mut self, s: &str) -> fmt::Result {
+        // `pos` value after writing `len` bytes. This does not have to be bounded by `end`, but we
+        // don't want it to wrap around to 0.
+        let pos_new = self.pos.saturating_add(s.len());
+
+        // Amount that we can copy. `saturating_sub` ensures we get 0 if `pos` goes past `end`.
+        let len_to_copy = core::cmp::min(pos_new, self.end).saturating_sub(self.pos);
+
+        if len_to_copy > 0 {
+            // SAFETY: If `len_to_copy` is non-zero, then we know `pos` has not gone past `end`
+            // yet, so it is valid for write per the type invariants.
+            unsafe {
+                core::ptr::copy_nonoverlapping(
+                    s.as_bytes().as_ptr(),
+                    self.pos as *mut u8,
+                    len_to_copy,
+                )
+            };
+        }
+
+        self.pos = pos_new;
+        Ok(())
+    }
+}
-- 
cgit v1.2.3


From 8fcbf024920b96cd8c59f85d57d97af7237237b4 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 17:18:23 +0200
Subject: rust: export generated symbols
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

All symbols are reexported reusing the `EXPORT_SYMBOL_GPL` macro
from C. The lists of symbols are generated on the fly.

There are three main sets of symbols to distinguish:

  - The ones from the `core` and `alloc` crates (from the Rust
    standard library). The code is licensed as Apache/MIT.

  - The ones from our abstractions in the `kernel` crate.

  - The helpers (already exported since they are not generated).

We export everything as GPL. This ensures we do not mistakenly
expose GPL kernel symbols/features as non-GPL, even indirectly.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 rust/exports.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 rust/exports.c

diff --git a/rust/exports.c b/rust/exports.c
new file mode 100644
index 000000000000..bb7cc64cecd0
--- /dev/null
+++ b/rust/exports.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A hack to export Rust symbols for loadable modules without having to redo
+ * the entire `include/linux/export.h` logic in Rust.
+ *
+ * This requires the Rust's new/future `v0` mangling scheme because the default
+ * one ("legacy") uses invalid characters for C identifiers (thus we cannot use
+ * the `EXPORT_SYMBOL_*` macros).
+ *
+ * All symbols are exported as GPL-only to guarantee no GPL-only feature is
+ * accidentally exposed.
+ */
+
+#include <linux/module.h>
+
+#define EXPORT_SYMBOL_RUST_GPL(sym) extern int sym; EXPORT_SYMBOL_GPL(sym)
+
+#include "exports_core_generated.h"
+#include "exports_alloc_generated.h"
+#include "exports_bindings_generated.h"
+#include "exports_kernel_generated.h"
-- 
cgit v1.2.3


From 787983da77185d355564b0436f7b4eaa40b8904b Mon Sep 17 00:00:00 2001
From: Gary Guo <gary@garyguo.net>
Date: Sat, 3 Jul 2021 17:38:57 +0200
Subject: vsprintf: add new `%pA` format specifier

This patch adds a format specifier `%pA` to `vsprintf` which formats
a pointer as `core::fmt::Arguments`. Doing so allows us to directly
format to the internal buffer of `printf`, so we do not have to use
a temporary buffer on the stack to pre-assemble the message on
the Rust side.

This specifier is intended only to be used from Rust and not for C, so
`checkpatch.pl` is intentionally unchanged to catch any misuse.

Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/core-api/printk-formats.rst | 10 ++++++++++
 lib/vsprintf.c                            | 13 +++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 5e89497ba314..dbe1aacc79d0 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -625,6 +625,16 @@ Examples::
 	%p4cc	Y10  little-endian (0x20303159)
 	%p4cc	NV12 big-endian (0xb231564e)
 
+Rust
+----
+
+::
+
+	%pA
+
+Only intended to be used from Rust code to format ``core::fmt::Arguments``.
+Do *not* use it from C.
+
 Thanks
 ======
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3c1853a9d1c0..c414a8d9f1ea 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2246,6 +2246,9 @@ int __init no_hash_pointers_enable(char *str)
 }
 early_param("no_hash_pointers", no_hash_pointers_enable);
 
+/* Used for Rust formatting ('%pA'). */
+char *rust_fmt_argument(char *buf, char *end, void *ptr);
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -2372,6 +2375,10 @@ early_param("no_hash_pointers", no_hash_pointers_enable);
  *
  * Note: The default behaviour (unadorned %p) is to hash the address,
  * rendering it useful as a unique identifier.
+ *
+ * There is also a '%pA' format specifier, but it is only intended to be used
+ * from Rust code to format core::fmt::Arguments. Do *not* use it from C.
+ * See rust/kernel/print.rs for details.
  */
 static noinline_for_stack
 char *pointer(const char *fmt, char *buf, char *end, void *ptr,
@@ -2444,6 +2451,12 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		return device_node_string(buf, end, ptr, spec, fmt + 1);
 	case 'f':
 		return fwnode_string(buf, end, ptr, spec, fmt + 1);
+	case 'A':
+		if (!IS_ENABLED(CONFIG_RUST)) {
+			WARN_ONCE(1, "Please remove %%pA from non-Rust code\n");
+			return error_string(buf, end, "(%pA?)", spec);
+		}
+		return rust_fmt_argument(buf, end, ptr);
 	case 'x':
 		return pointer_string(buf, end, ptr, spec);
 	case 'e':
-- 
cgit v1.2.3


From de48fa1a01e7752135c960a20d6c3b26544a8120 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sun, 22 May 2022 03:11:08 +0200
Subject: scripts: checkpatch: diagnose uses of `%pA` in the C side as errors

The `%pA` format specifier is only intended to be used from Rust.

`checkpatch.pl` already gives a warning for invalid specificers:

    WARNING: Invalid vsprintf pointer extension '%pA'

This makes it an error and introduces an explanatory message:

    ERROR: Invalid vsprintf pointer extension '%pA' - '%pA' is only intended to be used from Rust code

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Joe Perches <joe@perches.com>
Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/checkpatch.pl | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 79e759aac543..74a769310adf 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6783,15 +6783,19 @@ sub process {
 				}
 				if ($bad_specifier ne "") {
 					my $stat_real = get_stat_real($linenr, $lc);
+					my $msg_level = \&WARN;
 					my $ext_type = "Invalid";
 					my $use = "";
 					if ($bad_specifier =~ /p[Ff]/) {
 						$use = " - use %pS instead";
 						$use =~ s/pS/ps/ if ($bad_specifier =~ /pf/);
+					} elsif ($bad_specifier =~ /pA/) {
+						$use =  " - '%pA' is only intended to be used from Rust code";
+						$msg_level = \&ERROR;
 					}
 
-					WARN("VSPRINTF_POINTER_EXTENSION",
-					     "$ext_type vsprintf pointer extension '$bad_specifier'$use\n" . "$here\n$stat_real\n");
+					&{$msg_level}("VSPRINTF_POINTER_EXTENSION",
+						      "$ext_type vsprintf pointer extension '$bad_specifier'$use\n" . "$here\n$stat_real\n");
 				}
 			}
 		}
-- 
cgit v1.2.3


From d1d84b5f73888ccb9fc148dfc3cb3e15d3604d65 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sun, 22 May 2022 17:22:58 +0200
Subject: scripts: checkpatch: enable language-independent checks for Rust

Include Rust in the "source code files" category, so that
the language-independent tests are checked for Rust too,
and teach `checkpatch` about the comment style for Rust files.

This enables the malformed SPDX check, the misplaced SPDX license
tag check, the long line checks, the lines without a newline check
and the embedded filename check.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/checkpatch.pl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 74a769310adf..b5ed31d631fa 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3616,7 +3616,7 @@ sub process {
 				my $comment = "";
 				if ($realfile =~ /\.(h|s|S)$/) {
 					$comment = '/*';
-				} elsif ($realfile =~ /\.(c|dts|dtsi)$/) {
+				} elsif ($realfile =~ /\.(c|rs|dts|dtsi)$/) {
 					$comment = '//';
 				} elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc|yaml)$/) {
 					$comment = '#';
@@ -3664,7 +3664,7 @@ sub process {
 		}
 
 # check we are in a valid source file if not then ignore this hunk
-		next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/);
+		next if ($realfile !~ /\.(h|c|rs|s|S|sh|dtsi|dts)$/);
 
 # check for using SPDX-License-Identifier on the wrong line number
 		if ($realline != $checklicenseline &&
-- 
cgit v1.2.3


From 99115db4ecc87af73415939439ec604ea0531e6f Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sun, 5 Dec 2021 19:00:43 +0100
Subject: scripts: decode_stacktrace: demangle Rust symbols

Recent versions of both Binutils (`c++filt`) and LLVM (`llvm-cxxfilt`)
provide Rust v0 mangling support.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/decode_stacktrace.sh | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 7075e26ab2c4..564c5632e1a2 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -8,6 +8,14 @@ usage() {
 	echo "	$0 -r <release> | <vmlinux> [<base path>|auto] [<modules path>]"
 }
 
+# Try to find a Rust demangler
+if type llvm-cxxfilt >/dev/null 2>&1 ; then
+	cppfilt=llvm-cxxfilt
+elif type c++filt >/dev/null 2>&1 ; then
+	cppfilt=c++filt
+	cppfilt_opts=-i
+fi
+
 if [[ $1 == "-r" ]] ; then
 	vmlinux=""
 	basepath="auto"
@@ -180,6 +188,12 @@ parse_symbol() {
 	# In the case of inlines, move everything to same line
 	code=${code//$'\n'/' '}
 
+	# Demangle if the name looks like a Rust symbol and if
+	# we got a Rust demangler
+	if [[ $name =~ ^_R && $cppfilt != "" ]] ; then
+		name=$("$cppfilt" "$cppfilt_opts" "$name")
+	fi
+
 	# Replace old address with pretty line numbers
 	symbol="$segment$name ($code)"
 }
-- 
cgit v1.2.3


From 8c4555ccc55cf90e1e3eb2507be3c354f3d15839 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 17:26:15 +0200
Subject: scripts: add `generate_rust_analyzer.py`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The `generate_rust_analyzer.py` script generates the configuration
file (`rust-project.json`) for rust-analyzer.

rust-analyzer is a modular compiler frontend for the Rust language.
It provides an LSP server which can be used in editors such as
VS Code, Emacs or Vim.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Signed-off-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Co-developed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 .gitignore                        |   3 +
 scripts/generate_rust_analyzer.py | 135 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100755 scripts/generate_rust_analyzer.py

diff --git a/.gitignore b/.gitignore
index 265959544978..80989914c97d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,6 @@ x509.genkey
 
 # Documentation toolchain
 sphinx_*/
+
+# Rust analyzer configuration
+/rust-project.json
diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py
new file mode 100755
index 000000000000..75bb611bd751
--- /dev/null
+++ b/scripts/generate_rust_analyzer.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+"""generate_rust_analyzer - Generates the `rust-project.json` file for `rust-analyzer`.
+"""
+
+import argparse
+import json
+import logging
+import pathlib
+import sys
+
+def generate_crates(srctree, objtree, sysroot_src):
+    # Generate the configuration list.
+    cfg = []
+    with open(objtree / "include" / "generated" / "rustc_cfg") as fd:
+        for line in fd:
+            line = line.replace("--cfg=", "")
+            line = line.replace("\n", "")
+            cfg.append(line)
+
+    # Now fill the crates list -- dependencies need to come first.
+    #
+    # Avoid O(n^2) iterations by keeping a map of indexes.
+    crates = []
+    crates_indexes = {}
+
+    def append_crate(display_name, root_module, deps, cfg=[], is_workspace_member=True, is_proc_macro=False):
+        crates_indexes[display_name] = len(crates)
+        crates.append({
+            "display_name": display_name,
+            "root_module": str(root_module),
+            "is_workspace_member": is_workspace_member,
+            "is_proc_macro": is_proc_macro,
+            "deps": [{"crate": crates_indexes[dep], "name": dep} for dep in deps],
+            "cfg": cfg,
+            "edition": "2021",
+            "env": {
+                "RUST_MODFILE": "This is only for rust-analyzer"
+            }
+        })
+
+    # First, the ones in `rust/` since they are a bit special.
+    append_crate(
+        "core",
+        sysroot_src / "core" / "src" / "lib.rs",
+        [],
+        is_workspace_member=False,
+    )
+
+    append_crate(
+        "compiler_builtins",
+        srctree / "rust" / "compiler_builtins.rs",
+        [],
+    )
+
+    append_crate(
+        "alloc",
+        srctree / "rust" / "alloc" / "lib.rs",
+        ["core", "compiler_builtins"],
+    )
+
+    append_crate(
+        "macros",
+        srctree / "rust" / "macros" / "lib.rs",
+        [],
+        is_proc_macro=True,
+    )
+    crates[-1]["proc_macro_dylib_path"] = "rust/libmacros.so"
+
+    append_crate(
+        "bindings",
+        srctree / "rust"/ "bindings" / "lib.rs",
+        ["core"],
+        cfg=cfg,
+    )
+    crates[-1]["env"]["OBJTREE"] = str(objtree.resolve(True))
+
+    append_crate(
+        "kernel",
+        srctree / "rust" / "kernel" / "lib.rs",
+        ["core", "alloc", "macros", "bindings"],
+        cfg=cfg,
+    )
+    crates[-1]["source"] = {
+        "include_dirs": [
+            str(srctree / "rust" / "kernel"),
+            str(objtree / "rust")
+        ],
+        "exclude_dirs": [],
+    }
+
+    # Then, the rest outside of `rust/`.
+    #
+    # We explicitly mention the top-level folders we want to cover.
+    for folder in ("samples", "drivers"):
+        for path in (srctree / folder).rglob("*.rs"):
+            logging.info("Checking %s", path)
+            name = path.name.replace(".rs", "")
+
+            # Skip those that are not crate roots.
+            if f"{name}.o" not in open(path.parent / "Makefile").read():
+                continue
+
+            logging.info("Adding %s", name)
+            append_crate(
+                name,
+                path,
+                ["core", "alloc", "kernel"],
+                cfg=cfg,
+            )
+
+    return crates
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--verbose', '-v', action='store_true')
+    parser.add_argument("srctree", type=pathlib.Path)
+    parser.add_argument("objtree", type=pathlib.Path)
+    parser.add_argument("sysroot_src", type=pathlib.Path)
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        format="[%(asctime)s] [%(levelname)s] %(message)s",
+        level=logging.INFO if args.verbose else logging.WARNING
+    )
+
+    rust_project = {
+        "crates": generate_crates(args.srctree, args.objtree, args.sysroot_src),
+        "sysroot_src": str(args.sysroot_src),
+    }
+
+    json.dump(rust_project, sys.stdout, sort_keys=True, indent=4)
+
+if __name__ == "__main__":
+    main()
-- 
cgit v1.2.3


From 9a8ff24ce584ad9895f9416fe8fad6f8842f758d Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 4 Aug 2022 12:20:37 +0200
Subject: scripts: add `generate_rust_target.rs`

This script takes care of generating the custom target specification
file for `rustc`, based on the kernel configuration.

It also serves as an example of a Rust host program.

A dummy architecture is kept in this patch so that a later patch
adds x86 support on top with as few changes as possible.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/.gitignore              |   1 +
 scripts/generate_rust_target.rs | 171 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 172 insertions(+)
 create mode 100644 scripts/generate_rust_target.rs

diff --git a/scripts/.gitignore b/scripts/.gitignore
index eed308bef604..b7aec8eb1bd4 100644
--- a/scripts/.gitignore
+++ b/scripts/.gitignore
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /asn1_compiler
 /bin2c
+/generate_rust_target
 /insert-sys-cert
 /kallsyms
 /module.lds
diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs
new file mode 100644
index 000000000000..7256c9606cf0
--- /dev/null
+++ b/scripts/generate_rust_target.rs
@@ -0,0 +1,171 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! The custom target specification file generator for `rustc`.
+//!
+//! To configure a target from scratch, a JSON-encoded file has to be passed
+//! to `rustc` (introduced in [RFC 131]). These options and the file itself are
+//! unstable. Eventually, `rustc` should provide a way to do this in a stable
+//! manner. For instance, via command-line arguments. Therefore, this file
+//! should avoid using keys which can be set via `-C` or `-Z` options.
+//!
+//! [RFC 131]: https://rust-lang.github.io/rfcs/0131-target-specification.html
+
+use std::{
+    collections::HashMap,
+    fmt::{Display, Formatter, Result},
+    io::BufRead,
+};
+
+enum Value {
+    Boolean(bool),
+    Number(i32),
+    String(String),
+    Object(Object),
+}
+
+type Object = Vec<(String, Value)>;
+
+/// Minimal "almost JSON" generator (e.g. no `null`s, no arrays, no escaping),
+/// enough for this purpose.
+impl Display for Value {
+    fn fmt(&self, formatter: &mut Formatter<'_>) -> Result {
+        match self {
+            Value::Boolean(boolean) => write!(formatter, "{}", boolean),
+            Value::Number(number) => write!(formatter, "{}", number),
+            Value::String(string) => write!(formatter, "\"{}\"", string),
+            Value::Object(object) => {
+                formatter.write_str("{")?;
+                if let [ref rest @ .., ref last] = object[..] {
+                    for (key, value) in rest {
+                        write!(formatter, "\"{}\": {},", key, value)?;
+                    }
+                    write!(formatter, "\"{}\": {}", last.0, last.1)?;
+                }
+                formatter.write_str("}")
+            }
+        }
+    }
+}
+
+struct TargetSpec(Object);
+
+impl TargetSpec {
+    fn new() -> TargetSpec {
+        TargetSpec(Vec::new())
+    }
+}
+
+trait Push<T> {
+    fn push(&mut self, key: &str, value: T);
+}
+
+impl Push<bool> for TargetSpec {
+    fn push(&mut self, key: &str, value: bool) {
+        self.0.push((key.to_string(), Value::Boolean(value)));
+    }
+}
+
+impl Push<i32> for TargetSpec {
+    fn push(&mut self, key: &str, value: i32) {
+        self.0.push((key.to_string(), Value::Number(value)));
+    }
+}
+
+impl Push<String> for TargetSpec {
+    fn push(&mut self, key: &str, value: String) {
+        self.0.push((key.to_string(), Value::String(value)));
+    }
+}
+
+impl Push<&str> for TargetSpec {
+    fn push(&mut self, key: &str, value: &str) {
+        self.push(key, value.to_string());
+    }
+}
+
+impl Push<Object> for TargetSpec {
+    fn push(&mut self, key: &str, value: Object) {
+        self.0.push((key.to_string(), Value::Object(value)));
+    }
+}
+
+impl Display for TargetSpec {
+    fn fmt(&self, formatter: &mut Formatter<'_>) -> Result {
+        // We add some newlines for clarity.
+        formatter.write_str("{\n")?;
+        if let [ref rest @ .., ref last] = self.0[..] {
+            for (key, value) in rest {
+                write!(formatter, "    \"{}\": {},\n", key, value)?;
+            }
+            write!(formatter, "    \"{}\": {}\n", last.0, last.1)?;
+        }
+        formatter.write_str("}")
+    }
+}
+
+struct KernelConfig(HashMap<String, String>);
+
+impl KernelConfig {
+    /// Parses `include/config/auto.conf` from `stdin`.
+    fn from_stdin() -> KernelConfig {
+        let mut result = HashMap::new();
+
+        let stdin = std::io::stdin();
+        let mut handle = stdin.lock();
+        let mut line = String::new();
+
+        loop {
+            line.clear();
+
+            if handle.read_line(&mut line).unwrap() == 0 {
+                break;
+            }
+
+            if line.starts_with('#') {
+                continue;
+            }
+
+            let (key, value) = line.split_once('=').expect("Missing `=` in line.");
+            result.insert(key.to_string(), value.trim_end_matches('\n').to_string());
+        }
+
+        KernelConfig(result)
+    }
+
+    /// Does the option exist in the configuration (any value)?
+    ///
+    /// The argument must be passed without the `CONFIG_` prefix.
+    /// This avoids repetition and it also avoids `fixdep` making us
+    /// depend on it.
+    fn has(&self, option: &str) -> bool {
+        let option = "CONFIG_".to_owned() + option;
+        self.0.contains_key(&option)
+    }
+}
+
+fn main() {
+    let cfg = KernelConfig::from_stdin();
+    let mut ts = TargetSpec::new();
+
+    // `llvm-target`s are taken from `scripts/Makefile.clang`.
+    if cfg.has("DUMMY_ARCH") {
+        ts.push("arch", "dummy_arch");
+    } else {
+        panic!("Unsupported architecture");
+    }
+
+    ts.push("emit-debug-gdb-scripts", false);
+    ts.push("frame-pointer", "may-omit");
+    ts.push(
+        "stack-probes",
+        vec![("kind".to_string(), Value::String("none".to_string()))],
+    );
+
+    // Everything else is LE, whether `CPU_LITTLE_ENDIAN` is declared or not
+    // (e.g. x86). It is also `rustc`'s default.
+    if cfg.has("CPU_BIG_ENDIAN") {
+        ts.push("target-endian", "big");
+    }
+
+    println!("{}", ts);
+}
-- 
cgit v1.2.3


From 78521f3399abce9bb9db16d848044be873e117ac Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 4 Aug 2022 12:25:53 +0200
Subject: scripts: add `rust_is_available.sh`

This script tests whether the Rust toolchain requirements are in place
to enable Rust support. It uses `min-tool-version.sh` to fetch
the version numbers.

The build system will call it to set `CONFIG_RUST_IS_AVAILABLE` in
a later patch.

It also has an option (`-v`) to explain what is missing, which is
useful to set up the development environment. This is used via
the `make rustavailable` target added in a later patch.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Miguel Cano <macanroj@gmail.com>
Signed-off-by: Miguel Cano <macanroj@gmail.com>
Co-developed-by: Tiago Lam <tiagolam@gmail.com>
Signed-off-by: Tiago Lam <tiagolam@gmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/min-tool-version.sh                  |   6 +
 scripts/rust_is_available.sh                 | 160 +++++++++++++++++++++++++++
 scripts/rust_is_available_bindgen_libclang.h |   2 +
 3 files changed, 168 insertions(+)
 create mode 100755 scripts/rust_is_available.sh
 create mode 100644 scripts/rust_is_available_bindgen_libclang.h

diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh
index 250925aab101..b6593eac5003 100755
--- a/scripts/min-tool-version.sh
+++ b/scripts/min-tool-version.sh
@@ -30,6 +30,12 @@ llvm)
 		echo 11.0.0
 	fi
 	;;
+rustc)
+	echo 1.62.0
+	;;
+bindgen)
+	echo 0.56.0
+	;;
 *)
 	echo "$1: unknown tool" >&2
 	exit 1
diff --git a/scripts/rust_is_available.sh b/scripts/rust_is_available.sh
new file mode 100755
index 000000000000..aebbf1913970
--- /dev/null
+++ b/scripts/rust_is_available.sh
@@ -0,0 +1,160 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Tests whether a suitable Rust toolchain is available.
+#
+# Pass `-v` for human output and more checks (as warnings).
+
+set -e
+
+min_tool_version=$(dirname $0)/min-tool-version.sh
+
+# Convert the version string x.y.z to a canonical up-to-7-digits form.
+#
+# Note that this function uses one more digit (compared to other
+# instances in other version scripts) to give a bit more space to
+# `rustc` since it will reach 1.100.0 in late 2026.
+get_canonical_version()
+{
+	IFS=.
+	set -- $1
+	echo $((100000 * $1 + 100 * $2 + $3))
+}
+
+# Check that the Rust compiler exists.
+if ! command -v "$RUSTC" >/dev/null; then
+	if [ "$1" = -v ]; then
+		echo >&2 "***"
+		echo >&2 "*** Rust compiler '$RUSTC' could not be found."
+		echo >&2 "***"
+	fi
+	exit 1
+fi
+
+# Check that the Rust bindings generator exists.
+if ! command -v "$BINDGEN" >/dev/null; then
+	if [ "$1" = -v ]; then
+		echo >&2 "***"
+		echo >&2 "*** Rust bindings generator '$BINDGEN' could not be found."
+		echo >&2 "***"
+	fi
+	exit 1
+fi
+
+# Check that the Rust compiler version is suitable.
+#
+# Non-stable and distributions' versions may have a version suffix, e.g. `-dev`.
+rust_compiler_version=$( \
+	LC_ALL=C "$RUSTC" --version 2>/dev/null \
+		| head -n 1 \
+		| grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \
+)
+rust_compiler_min_version=$($min_tool_version rustc)
+rust_compiler_cversion=$(get_canonical_version $rust_compiler_version)
+rust_compiler_min_cversion=$(get_canonical_version $rust_compiler_min_version)
+if [ "$rust_compiler_cversion" -lt "$rust_compiler_min_cversion" ]; then
+	if [ "$1" = -v ]; then
+		echo >&2 "***"
+		echo >&2 "*** Rust compiler '$RUSTC' is too old."
+		echo >&2 "***   Your version:    $rust_compiler_version"
+		echo >&2 "***   Minimum version: $rust_compiler_min_version"
+		echo >&2 "***"
+	fi
+	exit 1
+fi
+if [ "$1" = -v ] && [ "$rust_compiler_cversion" -gt "$rust_compiler_min_cversion" ]; then
+	echo >&2 "***"
+	echo >&2 "*** Rust compiler '$RUSTC' is too new. This may or may not work."
+	echo >&2 "***   Your version:     $rust_compiler_version"
+	echo >&2 "***   Expected version: $rust_compiler_min_version"
+	echo >&2 "***"
+fi
+
+# Check that the Rust bindings generator is suitable.
+#
+# Non-stable and distributions' versions may have a version suffix, e.g. `-dev`.
+rust_bindings_generator_version=$( \
+	LC_ALL=C "$BINDGEN" --version 2>/dev/null \
+		| head -n 1 \
+		| grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \
+)
+rust_bindings_generator_min_version=$($min_tool_version bindgen)
+rust_bindings_generator_cversion=$(get_canonical_version $rust_bindings_generator_version)
+rust_bindings_generator_min_cversion=$(get_canonical_version $rust_bindings_generator_min_version)
+if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cversion" ]; then
+	if [ "$1" = -v ]; then
+		echo >&2 "***"
+		echo >&2 "*** Rust bindings generator '$BINDGEN' is too old."
+		echo >&2 "***   Your version:    $rust_bindings_generator_version"
+		echo >&2 "***   Minimum version: $rust_bindings_generator_min_version"
+		echo >&2 "***"
+	fi
+	exit 1
+fi
+if [ "$1" = -v ] && [ "$rust_bindings_generator_cversion" -gt "$rust_bindings_generator_min_cversion" ]; then
+	echo >&2 "***"
+	echo >&2 "*** Rust bindings generator '$BINDGEN' is too new. This may or may not work."
+	echo >&2 "***   Your version:     $rust_bindings_generator_version"
+	echo >&2 "***   Expected version: $rust_bindings_generator_min_version"
+	echo >&2 "***"
+fi
+
+# Check that the `libclang` used by the Rust bindings generator is suitable.
+bindgen_libclang_version=$( \
+	LC_ALL=C "$BINDGEN" $(dirname $0)/rust_is_available_bindgen_libclang.h 2>&1 >/dev/null \
+		| grep -F 'clang version ' \
+		| grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \
+		| head -n 1 \
+)
+bindgen_libclang_min_version=$($min_tool_version llvm)
+bindgen_libclang_cversion=$(get_canonical_version $bindgen_libclang_version)
+bindgen_libclang_min_cversion=$(get_canonical_version $bindgen_libclang_min_version)
+if [ "$bindgen_libclang_cversion" -lt "$bindgen_libclang_min_cversion" ]; then
+	if [ "$1" = -v ]; then
+		echo >&2 "***"
+		echo >&2 "*** libclang (used by the Rust bindings generator '$BINDGEN') is too old."
+		echo >&2 "***   Your version:    $bindgen_libclang_version"
+		echo >&2 "***   Minimum version: $bindgen_libclang_min_version"
+		echo >&2 "***"
+	fi
+	exit 1
+fi
+
+# If the C compiler is Clang, then we can also check whether its version
+# matches the `libclang` version used by the Rust bindings generator.
+#
+# In the future, we might be able to perform a full version check, see
+# https://github.com/rust-lang/rust-bindgen/issues/2138.
+if [ "$1" = -v ]; then
+	cc_name=$($(dirname $0)/cc-version.sh "$CC" | cut -f1 -d' ')
+	if [ "$cc_name" = Clang ]; then
+		clang_version=$( \
+			LC_ALL=C "$CC" --version 2>/dev/null \
+				| sed -nE '1s:.*version ([0-9]+\.[0-9]+\.[0-9]+).*:\1:p'
+		)
+		if [ "$clang_version" != "$bindgen_libclang_version" ]; then
+			echo >&2 "***"
+			echo >&2 "*** libclang (used by the Rust bindings generator '$BINDGEN')"
+			echo >&2 "*** version does not match Clang's. This may be a problem."
+			echo >&2 "***   libclang version: $bindgen_libclang_version"
+			echo >&2 "***   Clang version:    $clang_version"
+			echo >&2 "***"
+		fi
+	fi
+fi
+
+# Check that the source code for the `core` standard library exists.
+#
+# `$KRUSTFLAGS` is passed in case the user added `--sysroot`.
+rustc_sysroot=$("$RUSTC" $KRUSTFLAGS --print sysroot)
+rustc_src=${RUST_LIB_SRC:-"$rustc_sysroot/lib/rustlib/src/rust/library"}
+rustc_src_core="$rustc_src/core/src/lib.rs"
+if [ ! -e "$rustc_src_core" ]; then
+	if [ "$1" = -v ]; then
+		echo >&2 "***"
+		echo >&2 "*** Source code for the 'core' standard library could not be found"
+		echo >&2 "*** at '$rustc_src_core'."
+		echo >&2 "***"
+	fi
+	exit 1
+fi
diff --git a/scripts/rust_is_available_bindgen_libclang.h b/scripts/rust_is_available_bindgen_libclang.h
new file mode 100644
index 000000000000..0ef6db10d674
--- /dev/null
+++ b/scripts/rust_is_available_bindgen_libclang.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma message("clang version " __clang_version__)
-- 
cgit v1.2.3


From e4b69cb9a99a567d1611f9cced92f475ae224cdb Mon Sep 17 00:00:00 2001
From: Daniel Xu <dxu@dxuuu.xyz>
Date: Thu, 4 Aug 2022 12:55:07 +0200
Subject: scripts: add `is_rust_module.sh`

This script is used to detect whether a kernel module is written
in Rust.

It will later be used to disable BTF generation on Rust modules as
BTF does not yet support Rust.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Co-developed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 scripts/is_rust_module.sh | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100755 scripts/is_rust_module.sh

diff --git a/scripts/is_rust_module.sh b/scripts/is_rust_module.sh
new file mode 100755
index 000000000000..28b3831a7593
--- /dev/null
+++ b/scripts/is_rust_module.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# is_rust_module.sh module.ko
+#
+# Returns `0` if `module.ko` is a Rust module, `1` otherwise.
+
+set -e
+
+# Using the `16_` prefix ensures other symbols with the same substring
+# are not picked up (even if it would be unlikely). The last part is
+# used just in case LLVM decides to use the `.` suffix.
+#
+# In the future, checking for the `.comment` section may be another
+# option, see https://github.com/rust-lang/rust/pull/97550.
+${NM} "$*" | grep -qE '^[0-9a-fA-F]+ r _R[^[:space:]]+16___IS_RUST_MODULE[^[:space:]]*$'
-- 
cgit v1.2.3


From 80db40bac8f42f23132b2898b0490f8f76868a57 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 4 Aug 2022 12:54:09 +0200
Subject: rust: add `.rustfmt.toml`

This is the configuration file for the `rustfmt` tool.

`rustfmt` is a tool for formatting Rust code according to style guidelines.
It is very commonly used across Rust projects.

The default configuration options are used.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 .gitignore    |  1 +
 .rustfmt.toml | 12 ++++++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 .rustfmt.toml

diff --git a/.gitignore b/.gitignore
index 80989914c97d..97e085d613a2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,6 +97,7 @@ modules.order
 !.gitattributes
 !.gitignore
 !.mailmap
+!.rustfmt.toml
 
 #
 # Generated include files
diff --git a/.rustfmt.toml b/.rustfmt.toml
new file mode 100644
index 000000000000..3de5cc497465
--- /dev/null
+++ b/.rustfmt.toml
@@ -0,0 +1,12 @@
+edition = "2021"
+newline_style = "Unix"
+
+# Unstable options that help catching some mistakes in formatting and that we may want to enable
+# when they become stable.
+#
+# They are kept here since they are useful to run from time to time.
+#format_code_in_doc_comments = true
+#reorder_impl_items = true
+#comment_width = 100
+#wrap_comments = true
+#normalize_comments = true
-- 
cgit v1.2.3


From 2f7ab1267dc9b2d1f29695aff3211c87483480f3 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 16:42:57 +0200
Subject: Kbuild: add Rust support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Having most of the new files in place, we now enable Rust support
in the build system, including `Kconfig` entries related to Rust,
the Rust configuration printer and a few other bits.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Signed-off-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Co-developed-by: Sven Van Asbroeck <thesven73@gmail.com>
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Signed-off-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Co-developed-by: Douglas Su <d0u9.su@outlook.com>
Signed-off-by: Douglas Su <d0u9.su@outlook.com>
Co-developed-by: Dariusz Sosnowski <dsosnowski@dsosnowski.pl>
Signed-off-by: Dariusz Sosnowski <dsosnowski@dsosnowski.pl>
Co-developed-by: Antonio Terceiro <antonio.terceiro@linaro.org>
Signed-off-by: Antonio Terceiro <antonio.terceiro@linaro.org>
Co-developed-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Co-developed-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Signed-off-by: Björn Roy Baron <bjorn3_gh@protonmail.com>
Co-developed-by: Martin Rodriguez Reboredo <yakoyoku@gmail.com>
Signed-off-by: Martin Rodriguez Reboredo <yakoyoku@gmail.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 .gitignore                     |   2 +
 Makefile                       | 172 ++++++++++++++++++-
 arch/Kconfig                   |   6 +
 include/linux/compiler_types.h |   6 +-
 init/Kconfig                   |  46 ++++-
 kernel/configs/rust.config     |   1 +
 lib/Kconfig.debug              |  34 ++++
 rust/.gitignore                |   8 +
 rust/Makefile                  | 381 +++++++++++++++++++++++++++++++++++++++++
 rust/bindgen_parameters        |  21 +++
 scripts/Kconfig.include        |   6 +-
 scripts/Makefile               |   3 +
 scripts/Makefile.build         |  60 +++++++
 scripts/Makefile.debug         |   8 +
 scripts/Makefile.host          |  34 +++-
 scripts/Makefile.lib           |  12 ++
 scripts/Makefile.modfinal      |   8 +-
 scripts/cc-version.sh          |  12 +-
 scripts/kconfig/confdata.c     |  75 ++++++++
 19 files changed, 869 insertions(+), 26 deletions(-)
 create mode 100644 kernel/configs/rust.config
 create mode 100644 rust/.gitignore
 create mode 100644 rust/Makefile
 create mode 100644 rust/bindgen_parameters

diff --git a/.gitignore b/.gitignore
index 97e085d613a2..5da004814678 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,8 @@
 *.o
 *.o.*
 *.patch
+*.rmeta
+*.rsi
 *.s
 *.so
 *.so.dbg
diff --git a/Makefile b/Makefile
index 647a42a1f800..c759ee315254 100644
--- a/Makefile
+++ b/Makefile
@@ -120,6 +120,15 @@ endif
 
 export KBUILD_CHECKSRC
 
+# Enable "clippy" (a linter) as part of the Rust compilation.
+#
+# Use 'make CLIPPY=1' to enable it.
+ifeq ("$(origin CLIPPY)", "command line")
+  KBUILD_CLIPPY := $(CLIPPY)
+endif
+
+export KBUILD_CLIPPY
+
 # Use make M=dir or set the environment variable KBUILD_EXTMOD to specify the
 # directory of external module to build. Setting M= takes precedence.
 ifeq ("$(origin M)", "command line")
@@ -270,14 +279,14 @@ no-dot-config-targets := $(clean-targets) \
 			 cscope gtags TAGS tags help% %docs check% coccicheck \
 			 $(version_h) headers headers_% archheaders archscripts \
 			 %asm-generic kernelversion %src-pkg dt_binding_check \
-			 outputmakefile
+			 outputmakefile rustavailable rustfmt rustfmtcheck
 # Installation targets should not require compiler. Unfortunately, vdso_install
 # is an exception where build artifacts may be updated. This must be fixed.
 no-compiler-targets := $(no-dot-config-targets) install dtbs_install \
 			headers_install modules_install kernelrelease image_name
 no-sync-config-targets := $(no-dot-config-targets) %install kernelrelease \
 			  image_name
-single-targets := %.a %.i %.ko %.lds %.ll %.lst %.mod %.o %.s %.symtypes %/
+single-targets := %.a %.i %.rsi %.ko %.lds %.ll %.lst %.mod %.o %.s %.symtypes %/
 
 config-build	:=
 mixed-build	:=
@@ -439,6 +448,7 @@ else
 HOSTCC	= gcc
 HOSTCXX	= g++
 endif
+HOSTRUSTC = rustc
 HOSTPKG_CONFIG	= pkg-config
 
 KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
@@ -447,8 +457,26 @@ KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
 KBUILD_USERCFLAGS  := $(KBUILD_USERHOSTCFLAGS) $(USERCFLAGS)
 KBUILD_USERLDFLAGS := $(USERLDFLAGS)
 
+# These flags apply to all Rust code in the tree, including the kernel and
+# host programs.
+export rust_common_flags := --edition=2021 \
+			    -Zbinary_dep_depinfo=y \
+			    -Dunsafe_op_in_unsafe_fn -Drust_2018_idioms \
+			    -Dunreachable_pub -Dnon_ascii_idents \
+			    -Wmissing_docs \
+			    -Drustdoc::missing_crate_level_docs \
+			    -Dclippy::correctness -Dclippy::style \
+			    -Dclippy::suspicious -Dclippy::complexity \
+			    -Dclippy::perf \
+			    -Dclippy::let_unit_value -Dclippy::mut_mut \
+			    -Dclippy::needless_bitwise_bool \
+			    -Dclippy::needless_continue \
+			    -Wclippy::dbg_macro
+
 KBUILD_HOSTCFLAGS   := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
 KBUILD_HOSTCXXFLAGS := -Wall -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS)
+KBUILD_HOSTRUSTFLAGS := $(rust_common_flags) -O -Cstrip=debuginfo \
+			-Zallow-features= $(HOSTRUSTFLAGS)
 KBUILD_HOSTLDFLAGS  := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS)
 KBUILD_HOSTLDLIBS   := $(HOST_LFS_LIBS) $(HOSTLDLIBS)
 
@@ -473,6 +501,12 @@ OBJDUMP		= $(CROSS_COMPILE)objdump
 READELF		= $(CROSS_COMPILE)readelf
 STRIP		= $(CROSS_COMPILE)strip
 endif
+RUSTC		= rustc
+RUSTDOC		= rustdoc
+RUSTFMT		= rustfmt
+CLIPPY_DRIVER	= clippy-driver
+BINDGEN		= bindgen
+CARGO		= cargo
 PAHOLE		= pahole
 RESOLVE_BTFIDS	= $(objtree)/tools/bpf/resolve_btfids/resolve_btfids
 LEX		= flex
@@ -498,9 +532,11 @@ CHECKFLAGS     := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \
 		  -Wbitwise -Wno-return-void -Wno-unknown-attribute $(CF)
 NOSTDINC_FLAGS :=
 CFLAGS_MODULE   =
+RUSTFLAGS_MODULE =
 AFLAGS_MODULE   =
 LDFLAGS_MODULE  =
 CFLAGS_KERNEL	=
+RUSTFLAGS_KERNEL =
 AFLAGS_KERNEL	=
 LDFLAGS_vmlinux =
 
@@ -529,15 +565,43 @@ KBUILD_CFLAGS   := -Wall -Wundef -Werror=strict-prototypes -Wno-trigraphs \
 		   -Werror=return-type -Wno-format-security \
 		   -std=gnu11
 KBUILD_CPPFLAGS := -D__KERNEL__
+KBUILD_RUSTFLAGS := $(rust_common_flags) \
+		    --target=$(objtree)/rust/target.json \
+		    -Cpanic=abort -Cembed-bitcode=n -Clto=n \
+		    -Cforce-unwind-tables=n -Ccodegen-units=1 \
+		    -Csymbol-mangling-version=v0 \
+		    -Crelocation-model=static \
+		    -Zfunction-sections=n \
+		    -Dclippy::float_arithmetic
+
 KBUILD_AFLAGS_KERNEL :=
 KBUILD_CFLAGS_KERNEL :=
+KBUILD_RUSTFLAGS_KERNEL :=
 KBUILD_AFLAGS_MODULE  := -DMODULE
 KBUILD_CFLAGS_MODULE  := -DMODULE
+KBUILD_RUSTFLAGS_MODULE := --cfg MODULE
 KBUILD_LDFLAGS_MODULE :=
 KBUILD_LDFLAGS :=
 CLANG_FLAGS :=
 
+ifeq ($(KBUILD_CLIPPY),1)
+	RUSTC_OR_CLIPPY_QUIET := CLIPPY
+	RUSTC_OR_CLIPPY = $(CLIPPY_DRIVER)
+else
+	RUSTC_OR_CLIPPY_QUIET := RUSTC
+	RUSTC_OR_CLIPPY = $(RUSTC)
+endif
+
+ifdef RUST_LIB_SRC
+	export RUST_LIB_SRC
+endif
+
+# Allows the usage of unstable features in stable compilers.
+export RUSTC_BOOTSTRAP := 1
+
 export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC HOSTPKG_CONFIG
+export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN CARGO
+export HOSTRUSTC KBUILD_HOSTRUSTFLAGS
 export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL
 export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX
 export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD
@@ -546,9 +610,10 @@ export KBUILD_USERCFLAGS KBUILD_USERLDFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS
 export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE
+export KBUILD_RUSTFLAGS RUSTFLAGS_KERNEL RUSTFLAGS_MODULE
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
-export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
-export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
+export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_RUSTFLAGS_MODULE KBUILD_LDFLAGS_MODULE
+export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL KBUILD_RUSTFLAGS_KERNEL
 export PAHOLE_FLAGS
 
 # Files to ignore in find ... statements
@@ -729,7 +794,7 @@ $(KCONFIG_CONFIG):
 #
 # Do not use $(call cmd,...) here. That would suppress prompts from syncconfig,
 # so you cannot notice that Kconfig is waiting for the user input.
-%/config/auto.conf %/config/auto.conf.cmd %/generated/autoconf.h: $(KCONFIG_CONFIG)
+%/config/auto.conf %/config/auto.conf.cmd %/generated/autoconf.h %/generated/rustc_cfg: $(KCONFIG_CONFIG)
 	$(Q)$(kecho) "  SYNC    $@"
 	$(Q)$(MAKE) -f $(srctree)/Makefile syncconfig
 else # !may-sync-config
@@ -758,10 +823,17 @@ KBUILD_CFLAGS	+= $(call cc-disable-warning, address-of-packed-member)
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 KBUILD_CFLAGS += -O2
+KBUILD_RUSTFLAGS += -Copt-level=2
 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS += -Os
+KBUILD_RUSTFLAGS += -Copt-level=s
 endif
 
+# Always set `debug-assertions` and `overflow-checks` because their default
+# depends on `opt-level` and `debug-assertions`, respectively.
+KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n)
+KBUILD_RUSTFLAGS += -Coverflow-checks=$(if $(CONFIG_RUST_OVERFLOW_CHECKS),y,n)
+
 # Tell gcc to never replace conditional load with a non-conditional one
 ifdef CONFIG_CC_IS_GCC
 # gcc-10 renamed --param=allow-store-data-races=0 to
@@ -792,6 +864,9 @@ KBUILD_CFLAGS-$(CONFIG_WERROR) += -Werror
 KBUILD_CFLAGS-$(CONFIG_CC_NO_ARRAY_BOUNDS) += -Wno-array-bounds
 KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH)
 
+KBUILD_RUSTFLAGS-$(CONFIG_WERROR) += -Dwarnings
+KBUILD_RUSTFLAGS += $(KBUILD_RUSTFLAGS-y)
+
 ifdef CONFIG_CC_IS_CLANG
 KBUILD_CPPFLAGS += -Qunused-arguments
 # The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable.
@@ -812,12 +887,15 @@ KBUILD_CFLAGS += $(call cc-disable-warning, dangling-pointer)
 
 ifdef CONFIG_FRAME_POINTER
 KBUILD_CFLAGS	+= -fno-omit-frame-pointer -fno-optimize-sibling-calls
+KBUILD_RUSTFLAGS += -Cforce-frame-pointers=y
 else
 # Some targets (ARM with Thumb2, for example), can't be built with frame
 # pointers.  For those, we don't have FUNCTION_TRACER automatically
 # select FRAME_POINTER.  However, FUNCTION_TRACER adds -pg, and this is
 # incompatible with -fomit-frame-pointer with current GCC, so we don't use
 # -fomit-frame-pointer with FUNCTION_TRACER.
+# In the Rust target specification, "frame-pointer" is set explicitly
+# to "may-omit".
 ifndef CONFIG_FUNCTION_TRACER
 KBUILD_CFLAGS	+= -fomit-frame-pointer
 endif
@@ -882,8 +960,10 @@ ifdef CONFIG_DEBUG_SECTION_MISMATCH
 KBUILD_CFLAGS += -fno-inline-functions-called-once
 endif
 
+# `rustc`'s `-Zfunction-sections` applies to data too (as of 1.59.0).
 ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION
 KBUILD_CFLAGS_KERNEL += -ffunction-sections -fdata-sections
+KBUILD_RUSTFLAGS_KERNEL += -Zfunction-sections=y
 LDFLAGS_vmlinux += --gc-sections
 endif
 
@@ -1026,10 +1106,11 @@ include $(addprefix $(srctree)/, $(include-y))
 # Do not add $(call cc-option,...) below this line. When you build the kernel
 # from the clean source tree, the GCC plugins do not exist at this point.
 
-# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
+# Add user supplied CPPFLAGS, AFLAGS, CFLAGS and RUSTFLAGS as the last assignments
 KBUILD_CPPFLAGS += $(KCPPFLAGS)
 KBUILD_AFLAGS   += $(KAFLAGS)
 KBUILD_CFLAGS   += $(KCFLAGS)
+KBUILD_RUSTFLAGS += $(KRUSTFLAGS)
 
 KBUILD_LDFLAGS_MODULE += --build-id=sha1
 LDFLAGS_vmlinux += --build-id=sha1
@@ -1104,6 +1185,7 @@ ifeq ($(KBUILD_EXTMOD),)
 core-y			+= kernel/ certs/ mm/ fs/ ipc/ security/ crypto/
 core-$(CONFIG_BLOCK)	+= block/
 core-$(CONFIG_IO_URING)	+= io_uring/
+core-$(CONFIG_RUST)	+= rust/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
@@ -1206,6 +1288,10 @@ prepare0: archprepare
 
 # All the preparing..
 prepare: prepare0
+ifdef CONFIG_RUST
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh -v
+	$(Q)$(MAKE) $(build)=rust
+endif
 
 PHONY += remove-stale-files
 remove-stale-files:
@@ -1499,7 +1585,7 @@ endif # CONFIG_MODULES
 # Directories & files removed with 'make clean'
 CLEAN_FILES += include/ksym vmlinux.symvers modules-only.symvers \
 	       modules.builtin modules.builtin.modinfo modules.nsdeps \
-	       compile_commands.json .thinlto-cache
+	       compile_commands.json .thinlto-cache rust/test rust/doc
 
 # Directories & files removed with 'make mrproper'
 MRPROPER_FILES += include/config include/generated          \
@@ -1510,7 +1596,8 @@ MRPROPER_FILES += include/config include/generated          \
 		  certs/signing_key.pem \
 		  certs/x509.genkey \
 		  vmlinux-gdb.py \
-		  *.spec
+		  *.spec \
+		  rust/target.json rust/libmacros.so
 
 # clean - Delete most, but leave enough to build external modules
 #
@@ -1535,6 +1622,9 @@ $(mrproper-dirs):
 
 mrproper: clean $(mrproper-dirs)
 	$(call cmd,rmfiles)
+	@find . $(RCS_FIND_IGNORE) \
+		\( -name '*.rmeta' \) \
+		-type f -print | xargs rm -f
 
 # distclean
 #
@@ -1622,6 +1712,24 @@ help:
 	@echo  '  kselftest-merge   - Merge all the config dependencies of'
 	@echo  '		      kselftest to existing .config.'
 	@echo  ''
+	@echo  'Rust targets:'
+	@echo  '  rustavailable   - Checks whether the Rust toolchain is'
+	@echo  '		    available and, if not, explains why.'
+	@echo  '  rustfmt	  - Reformat all the Rust code in the kernel'
+	@echo  '  rustfmtcheck	  - Checks if all the Rust code in the kernel'
+	@echo  '		    is formatted, printing a diff otherwise.'
+	@echo  '  rustdoc	  - Generate Rust documentation'
+	@echo  '		    (requires kernel .config)'
+	@echo  '  rusttest        - Runs the Rust tests'
+	@echo  '                    (requires kernel .config; downloads external repos)'
+	@echo  '  rust-analyzer	  - Generate rust-project.json rust-analyzer support file'
+	@echo  '		    (requires kernel .config)'
+	@echo  '  dir/file.[os]   - Build specified target only'
+	@echo  '  dir/file.rsi    - Build macro expanded source, similar to C preprocessing.'
+	@echo  '                    Run with RUSTFMT=n to skip reformatting if needed.'
+	@echo  '                    The output is not intended to be compilable.'
+	@echo  '  dir/file.ll     - Build the LLVM assembly file'
+	@echo  ''
 	@$(if $(dtstree), \
 		echo 'Devicetree:'; \
 		echo '* dtbs             - Build device tree blobs for enabled boards'; \
@@ -1694,6 +1802,52 @@ PHONY += $(DOC_TARGETS)
 $(DOC_TARGETS):
 	$(Q)$(MAKE) $(build)=Documentation $@
 
+
+# Rust targets
+# ---------------------------------------------------------------------------
+
+# "Is Rust available?" target
+PHONY += rustavailable
+rustavailable:
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust_is_available.sh -v && echo "Rust is available!"
+
+# Documentation target
+#
+# Using the singular to avoid running afoul of `no-dot-config-targets`.
+PHONY += rustdoc
+rustdoc: prepare
+	$(Q)$(MAKE) $(build)=rust $@
+
+# Testing target
+PHONY += rusttest
+rusttest: prepare
+	$(Q)$(MAKE) $(build)=rust $@
+
+# Formatting targets
+PHONY += rustfmt rustfmtcheck
+
+# We skip `rust/alloc` since we want to minimize the diff w.r.t. upstream.
+#
+# We match using absolute paths since `find` does not resolve them
+# when matching, which is a problem when e.g. `srctree` is `..`.
+# We `grep` afterwards in order to remove the directory entry itself.
+rustfmt:
+	$(Q)find $(abs_srctree) -type f -name '*.rs' \
+		-o -path $(abs_srctree)/rust/alloc -prune \
+		-o -path $(abs_objtree)/rust/test -prune \
+		| grep -Fv $(abs_srctree)/rust/alloc \
+		| grep -Fv $(abs_objtree)/rust/test \
+		| grep -Fv generated \
+		| xargs $(RUSTFMT) $(rustfmt_flags)
+
+rustfmtcheck: rustfmt_flags = --check
+rustfmtcheck: rustfmt
+
+# IDE support targets
+PHONY += rust-analyzer
+rust-analyzer:
+	$(Q)$(MAKE) $(build)=rust $@
+
 # Misc
 # ---------------------------------------------------------------------------
 
@@ -1861,7 +2015,7 @@ $(clean-dirs):
 clean: $(clean-dirs)
 	$(call cmd,rmfiles)
 	@find $(or $(KBUILD_EXTMOD), .) $(RCS_FIND_IGNORE) \
-		\( -name '*.[aios]' -o -name '*.ko' -o -name '.*.cmd' \
+		\( -name '*.[aios]' -o -name '*.rsi' -o -name '*.ko' -o -name '.*.cmd' \
 		-o -name '*.ko.*' \
 		-o -name '*.dtb' -o -name '*.dtbo' -o -name '*.dtb.S' -o -name '*.dt.yaml' \
 		-o -name '*.dwo' -o -name '*.lst' \
diff --git a/arch/Kconfig b/arch/Kconfig
index 8b311e400ec1..d9b4ae0fc805 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -355,6 +355,12 @@ config HAVE_RSEQ
 	  This symbol should be selected by an architecture if it
 	  supports an implementation of restartable sequences.
 
+config HAVE_RUST
+	bool
+	help
+	  This symbol should be selected by an architecture if it
+	  supports Rust.
+
 config HAVE_FUNCTION_ARG_ACCESS_API
 	bool
 	help
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 4f2a819fd60a..50b3f6b9502e 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -4,8 +4,12 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * Skipped when running bindgen due to a libclang issue;
+ * see https://github.com/rust-lang/rust-bindgen/issues/2244.
+ */
 #if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \
-	__has_attribute(btf_type_tag)
+	__has_attribute(btf_type_tag) && !defined(__BINDGEN__)
 # define BTF_TYPE_TAG(value) __attribute__((btf_type_tag(#value)))
 #else
 # define BTF_TYPE_TAG(value) /* nothing */
diff --git a/init/Kconfig b/init/Kconfig
index 532362fcfe31..a078cb026523 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -60,6 +60,17 @@ config LLD_VERSION
 	default $(ld-version) if LD_IS_LLD
 	default 0
 
+config RUST_IS_AVAILABLE
+	def_bool $(success,$(srctree)/scripts/rust_is_available.sh)
+	help
+	  This shows whether a suitable Rust toolchain is available (found).
+
+	  Please see Documentation/rust/quick-start.rst for instructions on how
+	  to satify the build requirements of Rust support.
+
+	  In particular, the Makefile target 'rustavailable' is useful to check
+	  why the Rust toolchain is not being detected.
+
 config CC_CAN_LINK
 	bool
 	default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag)) if 64BIT
@@ -147,7 +158,8 @@ config WERROR
 	default COMPILE_TEST
 	help
 	  A kernel build should not cause any compiler warnings, and this
-	  enables the '-Werror' flag to enforce that rule by default.
+	  enables the '-Werror' (for C) and '-Dwarnings' (for Rust) flags
+	  to enforce that rule by default.
 
 	  However, if you have a new (or very old) compiler with odd and
 	  unusual warnings, or you have some architecture with problems,
@@ -1899,6 +1911,38 @@ config PROFILING
 	  Say Y here to enable the extended profiling support mechanisms used
 	  by profilers.
 
+config RUST
+	bool "Rust support"
+	depends on HAVE_RUST
+	depends on RUST_IS_AVAILABLE
+	depends on !MODVERSIONS
+	depends on !GCC_PLUGINS
+	depends on !RANDSTRUCT
+	depends on !DEBUG_INFO_BTF
+	select CONSTRUCTORS
+	help
+	  Enables Rust support in the kernel.
+
+	  This allows other Rust-related options, like drivers written in Rust,
+	  to be selected.
+
+	  It is also required to be able to load external kernel modules
+	  written in Rust.
+
+	  See Documentation/rust/ for more information.
+
+	  If unsure, say N.
+
+config RUSTC_VERSION_TEXT
+	string
+	depends on RUST
+	default $(shell,command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n)
+
+config BINDGEN_VERSION_TEXT
+	string
+	depends on RUST
+	default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n)
+
 #
 # Place an empty function call at each tracepoint site. Can be
 # dynamically changed for a probe function.
diff --git a/kernel/configs/rust.config b/kernel/configs/rust.config
new file mode 100644
index 000000000000..38a7c5362c9c
--- /dev/null
+++ b/kernel/configs/rust.config
@@ -0,0 +1 @@
+CONFIG_RUST=y
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d3e5f36bb01e..e62271da937f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2710,6 +2710,40 @@ config HYPERV_TESTING
 
 endmenu # "Kernel Testing and Coverage"
 
+menu "Rust hacking"
+
+config RUST_DEBUG_ASSERTIONS
+	bool "Debug assertions"
+	depends on RUST
+	help
+	  Enables rustc's `-Cdebug-assertions` codegen option.
+
+	  This flag lets you turn `cfg(debug_assertions)` conditional
+	  compilation on or off. This can be used to enable extra debugging
+	  code in development but not in production. For example, it controls
+	  the behavior of the standard library's `debug_assert!` macro.
+
+	  Note that this will apply to all Rust code, including `core`.
+
+	  If unsure, say N.
+
+config RUST_OVERFLOW_CHECKS
+	bool "Overflow checks"
+	default y
+	depends on RUST
+	help
+	  Enables rustc's `-Coverflow-checks` codegen option.
+
+	  This flag allows you to control the behavior of runtime integer
+	  overflow. When overflow-checks are enabled, a Rust panic will occur
+	  on overflow.
+
+	  Note that this will apply to all Rust code, including `core`.
+
+	  If unsure, say Y.
+
+endmenu # "Rust"
+
 source "Documentation/Kconfig"
 
 endmenu # Kernel hacking
diff --git a/rust/.gitignore b/rust/.gitignore
new file mode 100644
index 000000000000..9bd1af8e05a1
--- /dev/null
+++ b/rust/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+target.json
+bindings_generated.rs
+bindings_helpers_generated.rs
+exports_*_generated.h
+doc/
+test/
diff --git a/rust/Makefile b/rust/Makefile
new file mode 100644
index 000000000000..7700d3853404
--- /dev/null
+++ b/rust/Makefile
@@ -0,0 +1,381 @@
+# SPDX-License-Identifier: GPL-2.0
+
+always-$(CONFIG_RUST) += target.json
+no-clean-files += target.json
+
+obj-$(CONFIG_RUST) += core.o compiler_builtins.o
+always-$(CONFIG_RUST) += exports_core_generated.h
+
+# Missing prototypes are expected in the helpers since these are exported
+# for Rust only, thus there is no header nor prototypes.
+obj-$(CONFIG_RUST) += helpers.o
+CFLAGS_REMOVE_helpers.o = -Wmissing-prototypes -Wmissing-declarations
+
+always-$(CONFIG_RUST) += libmacros.so
+no-clean-files += libmacros.so
+
+always-$(CONFIG_RUST) += bindings/bindings_generated.rs bindings/bindings_helpers_generated.rs
+obj-$(CONFIG_RUST) += alloc.o bindings.o kernel.o
+always-$(CONFIG_RUST) += exports_alloc_generated.h exports_bindings_generated.h \
+    exports_kernel_generated.h
+
+obj-$(CONFIG_RUST) += exports.o
+
+# Avoids running `$(RUSTC)` for the sysroot when it may not be available.
+ifdef CONFIG_RUST
+
+# `$(rust_flags)` is passed in case the user added `--sysroot`.
+rustc_sysroot := $(shell $(RUSTC) $(rust_flags) --print sysroot)
+rustc_host_target := $(shell $(RUSTC) --version --verbose | grep -F 'host: ' | cut -d' ' -f2)
+RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library
+
+ifeq ($(quiet),silent_)
+cargo_quiet=-q
+rust_test_quiet=-q
+rustdoc_test_quiet=--test-args -q
+else ifeq ($(quiet),quiet_)
+rust_test_quiet=-q
+rustdoc_test_quiet=--test-args -q
+else
+cargo_quiet=--verbose
+endif
+
+core-cfgs = \
+    --cfg no_fp_fmt_parse
+
+alloc-cfgs = \
+    --cfg no_fmt \
+    --cfg no_global_oom_handling \
+    --cfg no_macros \
+    --cfg no_rc \
+    --cfg no_str \
+    --cfg no_string \
+    --cfg no_sync \
+    --cfg no_thin
+
+quiet_cmd_rustdoc = RUSTDOC $(if $(rustdoc_host),H, ) $<
+      cmd_rustdoc = \
+	OBJTREE=$(abspath $(objtree)) \
+	$(RUSTDOC) $(if $(rustdoc_host),$(rust_common_flags),$(rust_flags)) \
+		$(rustc_target_flags) -L$(objtree)/$(obj) \
+		--output $(objtree)/$(obj)/doc \
+		--crate-name $(subst rustdoc-,,$@) \
+		@$(objtree)/include/generated/rustc_cfg $<
+
+# The `html_logo_url` and `html_favicon_url` forms of the `doc` attribute
+# can be used to specify a custom logo. However:
+#   - The given value is used as-is, thus it cannot be relative or a local file
+#     (unlike the non-custom case) since the generated docs have subfolders.
+#   - It requires adding it to every crate.
+#   - It requires changing `core` which comes from the sysroot.
+#
+# Using `-Zcrate-attr` would solve the last two points, but not the first.
+# The https://github.com/rust-lang/rfcs/pull/3226 RFC suggests two new
+# command-like flags to solve the issue. Meanwhile, we use the non-custom case
+# and then retouch the generated files.
+rustdoc: rustdoc-core rustdoc-macros rustdoc-compiler_builtins \
+    rustdoc-alloc rustdoc-kernel
+	$(Q)cp $(srctree)/Documentation/images/logo.svg $(objtree)/$(obj)/doc
+	$(Q)cp $(srctree)/Documentation/images/COPYING-logo $(objtree)/$(obj)/doc
+	$(Q)find $(objtree)/$(obj)/doc -name '*.html' -type f -print0 | xargs -0 sed -Ei \
+		-e 's:rust-logo\.svg:logo.svg:g' \
+		-e 's:rust-logo\.png:logo.svg:g' \
+		-e 's:favicon\.svg:logo.svg:g' \
+		-e 's:<link rel="alternate icon" type="image/png" href="[./]*favicon-(16x16|32x32)\.png">::g'
+	$(Q)echo '.logo-container > img { object-fit: contain; }' \
+		>> $(objtree)/$(obj)/doc/rustdoc.css
+
+rustdoc-macros: private rustdoc_host = yes
+rustdoc-macros: private rustc_target_flags = --crate-type proc-macro \
+    --extern proc_macro
+rustdoc-macros: $(src)/macros/lib.rs FORCE
+	$(call if_changed,rustdoc)
+
+rustdoc-core: private rustc_target_flags = $(core-cfgs)
+rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs FORCE
+	$(call if_changed,rustdoc)
+
+rustdoc-compiler_builtins: $(src)/compiler_builtins.rs rustdoc-core FORCE
+	$(call if_changed,rustdoc)
+
+# We need to allow `rustdoc::broken_intra_doc_links` because some
+# `no_global_oom_handling` functions refer to non-`no_global_oom_handling`
+# functions. Ideally `rustdoc` would have a way to distinguish broken links
+# due to things that are "configured out" vs. entirely non-existing ones.
+rustdoc-alloc: private rustc_target_flags = $(alloc-cfgs) \
+    -Arustdoc::broken_intra_doc_links
+rustdoc-alloc: $(src)/alloc/lib.rs rustdoc-core rustdoc-compiler_builtins FORCE
+	$(call if_changed,rustdoc)
+
+rustdoc-kernel: private rustc_target_flags = --extern alloc \
+    --extern macros=$(objtree)/$(obj)/libmacros.so \
+    --extern bindings
+rustdoc-kernel: $(src)/kernel/lib.rs rustdoc-core rustdoc-macros \
+    rustdoc-compiler_builtins rustdoc-alloc $(obj)/libmacros.so \
+    $(obj)/bindings.o FORCE
+	$(call if_changed,rustdoc)
+
+quiet_cmd_rustc_test_library = RUSTC TL $<
+      cmd_rustc_test_library = \
+	OBJTREE=$(abspath $(objtree)) \
+	$(RUSTC) $(rust_common_flags) \
+		@$(objtree)/include/generated/rustc_cfg $(rustc_target_flags) \
+		--crate-type $(if $(rustc_test_library_proc),proc-macro,rlib) \
+		--out-dir $(objtree)/$(obj)/test --cfg testlib \
+		--sysroot $(objtree)/$(obj)/test/sysroot \
+		-L$(objtree)/$(obj)/test \
+		--crate-name $(subst rusttest-,,$(subst rusttestlib-,,$@)) $<
+
+rusttestlib-macros: private rustc_target_flags = --extern proc_macro
+rusttestlib-macros: private rustc_test_library_proc = yes
+rusttestlib-macros: $(src)/macros/lib.rs rusttest-prepare FORCE
+	$(call if_changed,rustc_test_library)
+
+rusttestlib-bindings: $(src)/bindings/lib.rs rusttest-prepare FORCE
+	$(call if_changed,rustc_test_library)
+
+quiet_cmd_rustdoc_test = RUSTDOC T $<
+      cmd_rustdoc_test = \
+	OBJTREE=$(abspath $(objtree)) \
+	$(RUSTDOC) --test $(rust_common_flags) \
+		@$(objtree)/include/generated/rustc_cfg \
+		$(rustc_target_flags) $(rustdoc_test_target_flags) \
+		--sysroot $(objtree)/$(obj)/test/sysroot $(rustdoc_test_quiet) \
+		-L$(objtree)/$(obj)/test --output $(objtree)/$(obj)/doc \
+		--crate-name $(subst rusttest-,,$@) $<
+
+# We cannot use `-Zpanic-abort-tests` because some tests are dynamic,
+# so for the moment we skip `-Cpanic=abort`.
+quiet_cmd_rustc_test = RUSTC T  $<
+      cmd_rustc_test = \
+	OBJTREE=$(abspath $(objtree)) \
+	$(RUSTC) --test $(rust_common_flags) \
+		@$(objtree)/include/generated/rustc_cfg \
+		$(rustc_target_flags) --out-dir $(objtree)/$(obj)/test \
+		--sysroot $(objtree)/$(obj)/test/sysroot \
+		-L$(objtree)/$(obj)/test \
+		--crate-name $(subst rusttest-,,$@) $<; \
+	$(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \
+		$(rustc_test_run_flags)
+
+rusttest: rusttest-macros rusttest-kernel
+
+# This prepares a custom sysroot with our custom `alloc` instead of
+# the standard one.
+#
+# This requires several hacks:
+#   - Unlike `core` and `alloc`, `std` depends on more than a dozen crates,
+#     including third-party crates that need to be downloaded, plus custom
+#     `build.rs` steps. Thus hardcoding things here is not maintainable.
+#   - `cargo` knows how to build the standard library, but it is an unstable
+#     feature so far (`-Zbuild-std`).
+#   - `cargo` only considers the use case of building the standard library
+#     to use it in a given package. Thus we need to create a dummy package
+#     and pick the generated libraries from there.
+#   - Since we only keep a subset of upstream `alloc` in-tree, we need
+#     to recreate it on the fly by putting our sources on top.
+#   - The usual ways of modifying the dependency graph in `cargo` do not seem
+#     to apply for the `-Zbuild-std` steps, thus we have to mislead it
+#     by modifying the sources in the sysroot.
+#   - To avoid messing with the user's Rust installation, we create a clone
+#     of the sysroot. However, `cargo` ignores `RUSTFLAGS` in the `-Zbuild-std`
+#     steps, thus we use a wrapper binary passed via `RUSTC` to pass the flag.
+#
+# In the future, we hope to avoid the whole ordeal by either:
+#   - Making the `test` crate not depend on `std` (either improving upstream
+#     or having our own custom crate).
+#   - Making the tests run in kernel space (requires the previous point).
+#   - Making `std` and friends be more like a "normal" crate, so that
+#     `-Zbuild-std` and related hacks are not needed.
+quiet_cmd_rustsysroot = RUSTSYSROOT
+      cmd_rustsysroot = \
+	rm -rf $(objtree)/$(obj)/test; \
+	mkdir -p $(objtree)/$(obj)/test; \
+	cp -a $(rustc_sysroot) $(objtree)/$(obj)/test/sysroot; \
+	cp -r $(srctree)/$(src)/alloc/* \
+		$(objtree)/$(obj)/test/sysroot/lib/rustlib/src/rust/library/alloc/src; \
+	echo '\#!/bin/sh' > $(objtree)/$(obj)/test/rustc_sysroot; \
+	echo "$(RUSTC) --sysroot=$(abspath $(objtree)/$(obj)/test/sysroot) \"\$$@\"" \
+		>> $(objtree)/$(obj)/test/rustc_sysroot; \
+	chmod u+x $(objtree)/$(obj)/test/rustc_sysroot; \
+	$(CARGO) -q new $(objtree)/$(obj)/test/dummy; \
+	RUSTC=$(objtree)/$(obj)/test/rustc_sysroot $(CARGO) $(cargo_quiet) \
+		test -Zbuild-std --target $(rustc_host_target) \
+		--manifest-path $(objtree)/$(obj)/test/dummy/Cargo.toml; \
+	rm $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib/*; \
+	cp $(objtree)/$(obj)/test/dummy/target/$(rustc_host_target)/debug/deps/* \
+		$(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib
+
+rusttest-prepare: FORCE
+	$(call if_changed,rustsysroot)
+
+rusttest-macros: private rustc_target_flags = --extern proc_macro
+rusttest-macros: private rustdoc_test_target_flags = --crate-type proc-macro
+rusttest-macros: $(src)/macros/lib.rs rusttest-prepare FORCE
+	$(call if_changed,rustc_test)
+	$(call if_changed,rustdoc_test)
+
+rusttest-kernel: private rustc_target_flags = --extern alloc \
+    --extern macros --extern bindings
+rusttest-kernel: $(src)/kernel/lib.rs rusttest-prepare \
+    rusttestlib-macros rusttestlib-bindings FORCE
+	$(call if_changed,rustc_test)
+	$(call if_changed,rustc_test_library)
+
+filechk_rust_target = $(objtree)/scripts/generate_rust_target < $<
+
+$(obj)/target.json: $(objtree)/include/config/auto.conf FORCE
+	$(call filechk,rust_target)
+
+ifdef CONFIG_CC_IS_CLANG
+bindgen_c_flags = $(c_flags)
+else
+# bindgen relies on libclang to parse C. Ideally, bindgen would support a GCC
+# plugin backend and/or the Clang driver would be perfectly compatible with GCC.
+#
+# For the moment, here we are tweaking the flags on the fly. This is a hack,
+# and some kernel configurations may not work (e.g. `GCC_PLUGIN_RANDSTRUCT`
+# if we end up using one of those structs).
+bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \
+	-mskip-rax-setup -mgeneral-regs-only -msign-return-address=% \
+	-mindirect-branch=thunk-extern -mindirect-branch-register \
+	-mfunction-return=thunk-extern -mrecord-mcount -mabi=lp64 \
+	-mindirect-branch-cs-prefix -mstack-protector-guard% -mtraceback=no \
+	-mno-pointers-to-nested-functions -mno-string \
+	-mno-strict-align -mstrict-align \
+	-fconserve-stack -falign-jumps=% -falign-loops=% \
+	-femit-struct-debug-baseonly -fno-ipa-cp-clone -fno-ipa-sra \
+	-fno-partial-inlining -fplugin-arg-arm_ssp_per_task_plugin-% \
+	-fno-reorder-blocks -fno-allow-store-data-races -fasan-shadow-offset=% \
+	-fzero-call-used-regs=% -fno-stack-clash-protection \
+	-fno-inline-functions-called-once \
+	--param=% --param asan-%
+
+# Derived from `scripts/Makefile.clang`.
+BINDGEN_TARGET_x86	:= x86_64-linux-gnu
+BINDGEN_TARGET		:= $(BINDGEN_TARGET_$(SRCARCH))
+
+# All warnings are inhibited since GCC builds are very experimental,
+# many GCC warnings are not supported by Clang, they may only appear in
+# some configurations, with new GCC versions, etc.
+bindgen_extra_c_flags = -w --target=$(BINDGEN_TARGET)
+
+bindgen_c_flags = $(filter-out $(bindgen_skip_c_flags), $(c_flags)) \
+	$(bindgen_extra_c_flags)
+endif
+
+ifdef CONFIG_LTO
+bindgen_c_flags_lto = $(filter-out $(CC_FLAGS_LTO), $(bindgen_c_flags))
+else
+bindgen_c_flags_lto = $(bindgen_c_flags)
+endif
+
+bindgen_c_flags_final = $(bindgen_c_flags_lto) -D__BINDGEN__
+
+quiet_cmd_bindgen = BINDGEN $@
+      cmd_bindgen = \
+	$(BINDGEN) $< $(bindgen_target_flags) \
+		--use-core --with-derive-default --ctypes-prefix core::ffi --no-layout-tests \
+		--no-debug '.*' \
+		--size_t-is-usize -o $@ -- $(bindgen_c_flags_final) -DMODULE \
+		$(bindgen_target_cflags) $(bindgen_target_extra)
+
+$(obj)/bindings/bindings_generated.rs: private bindgen_target_flags = \
+    $(shell grep -v '^\#\|^$$' $(srctree)/$(src)/bindgen_parameters)
+$(obj)/bindings/bindings_generated.rs: $(src)/bindings/bindings_helper.h \
+    $(src)/bindgen_parameters FORCE
+	$(call if_changed_dep,bindgen)
+
+# See `CFLAGS_REMOVE_helpers.o` above. In addition, Clang on C does not warn
+# with `-Wmissing-declarations` (unlike GCC), so it is not strictly needed here
+# given it is `libclang`; but for consistency, future Clang changes and/or
+# a potential future GCC backend for `bindgen`, we disable it too.
+$(obj)/bindings/bindings_helpers_generated.rs: private bindgen_target_flags = \
+    --blacklist-type '.*' --whitelist-var '' \
+    --whitelist-function 'rust_helper_.*'
+$(obj)/bindings/bindings_helpers_generated.rs: private bindgen_target_cflags = \
+    -I$(objtree)/$(obj) -Wno-missing-prototypes -Wno-missing-declarations
+$(obj)/bindings/bindings_helpers_generated.rs: private bindgen_target_extra = ; \
+    sed -Ei 's/pub fn rust_helper_([a-zA-Z0-9_]*)/#[link_name="rust_helper_\1"]\n    pub fn \1/g' $@
+$(obj)/bindings/bindings_helpers_generated.rs: $(src)/helpers.c FORCE
+	$(call if_changed_dep,bindgen)
+
+quiet_cmd_exports = EXPORTS $@
+      cmd_exports = \
+	$(NM) -p --defined-only $< \
+		| grep -E ' (T|R|D) ' | cut -d ' ' -f 3 \
+		| xargs -Isymbol \
+		echo 'EXPORT_SYMBOL_RUST_GPL(symbol);' > $@
+
+$(obj)/exports_core_generated.h: $(obj)/core.o FORCE
+	$(call if_changed,exports)
+
+$(obj)/exports_alloc_generated.h: $(obj)/alloc.o FORCE
+	$(call if_changed,exports)
+
+$(obj)/exports_bindings_generated.h: $(obj)/bindings.o FORCE
+	$(call if_changed,exports)
+
+$(obj)/exports_kernel_generated.h: $(obj)/kernel.o FORCE
+	$(call if_changed,exports)
+
+quiet_cmd_rustc_procmacro = $(RUSTC_OR_CLIPPY_QUIET) P $@
+      cmd_rustc_procmacro = \
+	$(RUSTC_OR_CLIPPY) $(rust_common_flags) \
+		--emit=dep-info,link --extern proc_macro \
+		--crate-type proc-macro --out-dir $(objtree)/$(obj) \
+		--crate-name $(patsubst lib%.so,%,$(notdir $@)) $<; \
+	mv $(objtree)/$(obj)/$(patsubst lib%.so,%,$(notdir $@)).d $(depfile); \
+	sed -i '/^\#/d' $(depfile)
+
+# Procedural macros can only be used with the `rustc` that compiled it.
+# Therefore, to get `libmacros.so` automatically recompiled when the compiler
+# version changes, we add `core.o` as a dependency (even if it is not needed).
+$(obj)/libmacros.so: $(src)/macros/lib.rs $(obj)/core.o FORCE
+	$(call if_changed_dep,rustc_procmacro)
+
+quiet_cmd_rustc_library = $(if $(skip_clippy),RUSTC,$(RUSTC_OR_CLIPPY_QUIET)) L $@
+      cmd_rustc_library = \
+	OBJTREE=$(abspath $(objtree)) \
+	$(if $(skip_clippy),$(RUSTC),$(RUSTC_OR_CLIPPY)) \
+		$(filter-out $(skip_flags),$(rust_flags) $(rustc_target_flags)) \
+		--emit=dep-info,obj,metadata --crate-type rlib \
+		--out-dir $(objtree)/$(obj) -L$(objtree)/$(obj) \
+		--crate-name $(patsubst %.o,%,$(notdir $@)) $<; \
+	mv $(objtree)/$(obj)/$(patsubst %.o,%,$(notdir $@)).d $(depfile); \
+	sed -i '/^\#/d' $(depfile) \
+	$(if $(rustc_objcopy),;$(OBJCOPY) $(rustc_objcopy) $@)
+
+rust-analyzer:
+	$(Q)$(srctree)/scripts/generate_rust_analyzer.py $(srctree) $(objtree) \
+		$(RUST_LIB_SRC) > $(objtree)/rust-project.json
+
+$(obj)/core.o: private skip_clippy = 1
+$(obj)/core.o: private skip_flags = -Dunreachable_pub
+$(obj)/core.o: private rustc_target_flags = $(core-cfgs)
+$(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs $(obj)/target.json FORCE
+	$(call if_changed_dep,rustc_library)
+
+$(obj)/compiler_builtins.o: private rustc_objcopy = -w -W '__*'
+$(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE
+	$(call if_changed_dep,rustc_library)
+
+$(obj)/alloc.o: private skip_clippy = 1
+$(obj)/alloc.o: private skip_flags = -Dunreachable_pub
+$(obj)/alloc.o: private rustc_target_flags = $(alloc-cfgs)
+$(obj)/alloc.o: $(src)/alloc/lib.rs $(obj)/compiler_builtins.o FORCE
+	$(call if_changed_dep,rustc_library)
+
+$(obj)/bindings.o: $(src)/bindings/lib.rs \
+    $(obj)/compiler_builtins.o \
+    $(obj)/bindings/bindings_generated.rs \
+    $(obj)/bindings/bindings_helpers_generated.rs FORCE
+	$(call if_changed_dep,rustc_library)
+
+$(obj)/kernel.o: private rustc_target_flags = --extern alloc \
+    --extern macros --extern bindings
+$(obj)/kernel.o: $(src)/kernel/lib.rs $(obj)/alloc.o \
+    $(obj)/libmacros.so $(obj)/bindings.o FORCE
+	$(call if_changed_dep,rustc_library)
+
+endif # CONFIG_RUST
diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters
new file mode 100644
index 000000000000..be4963bf7203
--- /dev/null
+++ b/rust/bindgen_parameters
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+
+--opaque-type xregs_state
+--opaque-type desc_struct
+--opaque-type arch_lbr_state
+--opaque-type local_apic
+
+# Packed type cannot transitively contain a `#[repr(align)]` type.
+--opaque-type x86_msi_data
+--opaque-type x86_msi_addr_lo
+
+# `try` is a reserved keyword since Rust 2018; solved in `bindgen` v0.59.2,
+# commit 2aed6b021680 ("context: Escape the try keyword properly").
+--opaque-type kunit_try_catch
+
+# If SMP is disabled, `arch_spinlock_t` is defined as a ZST which triggers a Rust
+# warning. We don't need to peek into it anyway.
+--opaque-type spinlock
+
+# `seccomp`'s comment gets understood as a doctest
+--no-doc-comments
diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include
index a0ccceb22cf8..274125307ebd 100644
--- a/scripts/Kconfig.include
+++ b/scripts/Kconfig.include
@@ -36,12 +36,12 @@ ld-option = $(success,$(LD) -v $(1))
 as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler -o /dev/null -)
 
 # check if $(CC) and $(LD) exist
-$(error-if,$(failure,command -v $(CC)),compiler '$(CC)' not found)
+$(error-if,$(failure,command -v $(CC)),C compiler '$(CC)' not found)
 $(error-if,$(failure,command -v $(LD)),linker '$(LD)' not found)
 
-# Get the compiler name, version, and error out if it is not supported.
+# Get the C compiler name, version, and error out if it is not supported.
 cc-info := $(shell,$(srctree)/scripts/cc-version.sh $(CC))
-$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this compiler is not supported.)
+$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this C compiler is not supported.)
 cc-name := $(shell,set -- $(cc-info) && echo $1)
 cc-version := $(shell,set -- $(cc-info) && echo $2)
 
diff --git a/scripts/Makefile b/scripts/Makefile
index f084f08ed176..1575af84d557 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -10,6 +10,9 @@ hostprogs-always-$(CONFIG_BUILDTIME_TABLE_SORT)		+= sorttable
 hostprogs-always-$(CONFIG_ASN1)				+= asn1_compiler
 hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT)		+= sign-file
 hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE)	+= insert-sys-cert
+hostprogs-always-$(CONFIG_RUST)				+= generate_rust_target
+
+generate_rust_target-rust := y
 
 HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include
 HOSTLDLIBS_sorttable = -lpthread
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 784f46d41959..27be77c0d6d8 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -26,6 +26,7 @@ EXTRA_CPPFLAGS :=
 EXTRA_LDFLAGS  :=
 asflags-y  :=
 ccflags-y  :=
+rustflags-y :=
 cppflags-y :=
 ldflags-y  :=
 
@@ -271,6 +272,65 @@ quiet_cmd_cc_lst_c = MKLST   $@
 $(obj)/%.lst: $(src)/%.c FORCE
 	$(call if_changed_dep,cc_lst_c)
 
+# Compile Rust sources (.rs)
+# ---------------------------------------------------------------------------
+
+rust_allowed_features := core_ffi_c
+
+rust_common_cmd = \
+	RUST_MODFILE=$(modfile) $(RUSTC_OR_CLIPPY) $(rust_flags) \
+	-Zallow-features=$(rust_allowed_features) \
+	-Zcrate-attr=no_std \
+	-Zcrate-attr='feature($(rust_allowed_features))' \
+	--extern alloc --extern kernel \
+	--crate-type rlib --out-dir $(obj) -L $(objtree)/rust/ \
+	--crate-name $(basename $(notdir $@))
+
+rust_handle_depfile = \
+	mv $(obj)/$(basename $(notdir $@)).d $(depfile); \
+	sed -i '/^\#/d' $(depfile)
+
+# `--emit=obj`, `--emit=asm` and `--emit=llvm-ir` imply a single codegen unit
+# will be used. We explicitly request `-Ccodegen-units=1` in any case, and
+# the compiler shows a warning if it is not 1. However, if we ever stop
+# requesting it explicitly and we start using some other `--emit` that does not
+# imply it (and for which codegen is performed), then we would be out of sync,
+# i.e. the outputs we would get for the different single targets (e.g. `.ll`)
+# would not match each other.
+
+quiet_cmd_rustc_o_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@
+      cmd_rustc_o_rs = \
+	$(rust_common_cmd) --emit=dep-info,obj $<; \
+	$(rust_handle_depfile)
+
+$(obj)/%.o: $(src)/%.rs FORCE
+	$(call if_changed_dep,rustc_o_rs)
+
+quiet_cmd_rustc_rsi_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@
+      cmd_rustc_rsi_rs = \
+	$(rust_common_cmd) --emit=dep-info -Zunpretty=expanded $< >$@; \
+	command -v $(RUSTFMT) >/dev/null && $(RUSTFMT) $@; \
+	$(rust_handle_depfile)
+
+$(obj)/%.rsi: $(src)/%.rs FORCE
+	$(call if_changed_dep,rustc_rsi_rs)
+
+quiet_cmd_rustc_s_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@
+      cmd_rustc_s_rs = \
+	$(rust_common_cmd) --emit=dep-info,asm $<; \
+	$(rust_handle_depfile)
+
+$(obj)/%.s: $(src)/%.rs FORCE
+	$(call if_changed_dep,rustc_s_rs)
+
+quiet_cmd_rustc_ll_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@
+      cmd_rustc_ll_rs = \
+	$(rust_common_cmd) --emit=dep-info,llvm-ir $<; \
+	$(rust_handle_depfile)
+
+$(obj)/%.ll: $(src)/%.rs FORCE
+	$(call if_changed_dep,rustc_ll_rs)
+
 # Compile assembler sources (.S)
 # ---------------------------------------------------------------------------
 
diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug
index 8cf1cb22dd93..332c486f705f 100644
--- a/scripts/Makefile.debug
+++ b/scripts/Makefile.debug
@@ -1,4 +1,6 @@
 DEBUG_CFLAGS	:=
+DEBUG_RUSTFLAGS	:=
+
 debug-flags-y	:= -g
 
 ifdef CONFIG_DEBUG_INFO_SPLIT
@@ -17,9 +19,12 @@ KBUILD_AFLAGS	+= $(debug-flags-y)
 
 ifdef CONFIG_DEBUG_INFO_REDUCED
 DEBUG_CFLAGS	+= -fno-var-tracking
+DEBUG_RUSTFLAGS	+= -Cdebuginfo=1
 ifdef CONFIG_CC_IS_GCC
 DEBUG_CFLAGS	+= -femit-struct-debug-baseonly
 endif
+else
+DEBUG_RUSTFLAGS	+= -Cdebuginfo=2
 endif
 
 ifdef CONFIG_DEBUG_INFO_COMPRESSED
@@ -30,3 +35,6 @@ endif
 
 KBUILD_CFLAGS	+= $(DEBUG_CFLAGS)
 export DEBUG_CFLAGS
+
+KBUILD_RUSTFLAGS += $(DEBUG_RUSTFLAGS)
+export DEBUG_RUSTFLAGS
diff --git a/scripts/Makefile.host b/scripts/Makefile.host
index 278b4d6ac945..da133780b751 100644
--- a/scripts/Makefile.host
+++ b/scripts/Makefile.host
@@ -22,6 +22,8 @@ $(obj)/%.tab.c $(obj)/%.tab.h: $(src)/%.y FORCE
 # to preprocess a data file.
 #
 # Both C and C++ are supported, but preferred language is C for such utilities.
+# Rust is also supported, but it may only be used in scenarios where a Rust
+# toolchain is required to be available (e.g. when  `CONFIG_RUST` is enabled).
 #
 # Sample syntax (see Documentation/kbuild/makefiles.rst for reference)
 # hostprogs := bin2hex
@@ -37,15 +39,20 @@ $(obj)/%.tab.c $(obj)/%.tab.h: $(src)/%.y FORCE
 # qconf-objs      := menu.o
 # Will compile qconf as a C++ program, and menu as a C program.
 # They are linked as C++ code to the executable qconf
+#
+# hostprogs   := target
+# target-rust := y
+# Will compile `target` as a Rust program, using `target.rs` as the crate root.
+# The crate may consist of several source files.
 
 # C code
 # Executables compiled from a single .c file
 host-csingle	:= $(foreach m,$(hostprogs), \
-			$(if $($(m)-objs)$($(m)-cxxobjs),,$(m)))
+			$(if $($(m)-objs)$($(m)-cxxobjs)$($(m)-rust),,$(m)))
 
 # C executables linked based on several .o files
 host-cmulti	:= $(foreach m,$(hostprogs),\
-		   $(if $($(m)-cxxobjs),,$(if $($(m)-objs),$(m))))
+		   $(if $($(m)-cxxobjs)$($(m)-rust),,$(if $($(m)-objs),$(m))))
 
 # Object (.o) files compiled from .c files
 host-cobjs	:= $(sort $(foreach m,$(hostprogs),$($(m)-objs)))
@@ -58,11 +65,17 @@ host-cxxmulti	:= $(foreach m,$(hostprogs),$(if $($(m)-cxxobjs),$(m)))
 # C++ Object (.o) files compiled from .cc files
 host-cxxobjs	:= $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs)))
 
+# Rust code
+# Executables compiled from a single Rust crate (which may consist of
+# one or more .rs files)
+host-rust	:= $(foreach m,$(hostprogs),$(if $($(m)-rust),$(m)))
+
 host-csingle	:= $(addprefix $(obj)/,$(host-csingle))
 host-cmulti	:= $(addprefix $(obj)/,$(host-cmulti))
 host-cobjs	:= $(addprefix $(obj)/,$(host-cobjs))
 host-cxxmulti	:= $(addprefix $(obj)/,$(host-cxxmulti))
 host-cxxobjs	:= $(addprefix $(obj)/,$(host-cxxobjs))
+host-rust	:= $(addprefix $(obj)/,$(host-rust))
 
 #####
 # Handle options to gcc. Support building with separate output directory
@@ -71,6 +84,8 @@ _hostc_flags   = $(KBUILD_HOSTCFLAGS)   $(HOST_EXTRACFLAGS)   \
                  $(HOSTCFLAGS_$(target-stem).o)
 _hostcxx_flags = $(KBUILD_HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) \
                  $(HOSTCXXFLAGS_$(target-stem).o)
+_hostrust_flags = $(KBUILD_HOSTRUSTFLAGS) $(HOST_EXTRARUSTFLAGS) \
+                  $(HOSTRUSTFLAGS_$(target-stem))
 
 # $(objtree)/$(obj) for including generated headers from checkin source files
 ifeq ($(KBUILD_EXTMOD),)
@@ -82,6 +97,7 @@ endif
 
 hostc_flags    = -Wp,-MMD,$(depfile) $(_hostc_flags)
 hostcxx_flags  = -Wp,-MMD,$(depfile) $(_hostcxx_flags)
+hostrust_flags = $(_hostrust_flags)
 
 #####
 # Compile programs on the host
@@ -128,5 +144,17 @@ quiet_cmd_host-cxxobjs	= HOSTCXX $@
 $(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE
 	$(call if_changed_dep,host-cxxobjs)
 
+# Create executable from a single Rust crate (which may consist of
+# one or more `.rs` files)
+# host-rust -> Executable
+quiet_cmd_host-rust	= HOSTRUSTC $@
+      cmd_host-rust	= \
+	$(HOSTRUSTC) $(hostrust_flags) --emit=dep-info,link \
+		--out-dir=$(obj)/ $<; \
+	mv $(obj)/$(target-stem).d $(depfile); \
+	sed -i '/^\#/d' $(depfile)
+$(host-rust): $(obj)/%: $(src)/%.rs FORCE
+	$(call if_changed_dep,host-rust)
+
 targets += $(host-csingle) $(host-cmulti) $(host-cobjs) \
-	   $(host-cxxmulti) $(host-cxxobjs)
+	   $(host-cxxmulti) $(host-cxxobjs) $(host-rust)
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 3fb6a99e78c4..c88b98b5dc44 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -8,6 +8,7 @@ ldflags-y  += $(EXTRA_LDFLAGS)
 # flags that take effect in current and sub directories
 KBUILD_AFLAGS += $(subdir-asflags-y)
 KBUILD_CFLAGS += $(subdir-ccflags-y)
+KBUILD_RUSTFLAGS += $(subdir-rustflags-y)
 
 # Figure out what we need to build from the various variables
 # ===========================================================================
@@ -128,6 +129,10 @@ _c_flags       = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \
                      $(filter-out $(ccflags-remove-y), \
                          $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(ccflags-y)) \
                      $(CFLAGS_$(target-stem).o))
+_rust_flags    = $(filter-out $(RUSTFLAGS_REMOVE_$(target-stem).o), \
+                     $(filter-out $(rustflags-remove-y), \
+                         $(KBUILD_RUSTFLAGS) $(rustflags-y)) \
+                     $(RUSTFLAGS_$(target-stem).o))
 _a_flags       = $(filter-out $(AFLAGS_REMOVE_$(target-stem).o), \
                      $(filter-out $(asflags-remove-y), \
                          $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(asflags-y)) \
@@ -202,6 +207,11 @@ modkern_cflags =                                          \
 		$(KBUILD_CFLAGS_MODULE) $(CFLAGS_MODULE), \
 		$(KBUILD_CFLAGS_KERNEL) $(CFLAGS_KERNEL) $(modfile_flags))
 
+modkern_rustflags =                                              \
+	$(if $(part-of-module),                                   \
+		$(KBUILD_RUSTFLAGS_MODULE) $(RUSTFLAGS_MODULE), \
+		$(KBUILD_RUSTFLAGS_KERNEL) $(RUSTFLAGS_KERNEL))
+
 modkern_aflags = $(if $(part-of-module),				\
 			$(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE),	\
 			$(KBUILD_AFLAGS_KERNEL) $(AFLAGS_KERNEL))
@@ -211,6 +221,8 @@ c_flags        = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
 		 $(_c_flags) $(modkern_cflags)                           \
 		 $(basename_flags) $(modname_flags)
 
+rust_flags     = $(_rust_flags) $(modkern_rustflags) @$(objtree)/include/generated/rustc_cfg
+
 a_flags        = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE)     \
 		 $(_a_flags) $(modkern_aflags)
 
diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal
index 35100e981f4a..9a1fa6aa30fe 100644
--- a/scripts/Makefile.modfinal
+++ b/scripts/Makefile.modfinal
@@ -39,11 +39,13 @@ quiet_cmd_ld_ko_o = LD [M]  $@
 
 quiet_cmd_btf_ko = BTF [M] $@
       cmd_btf_ko = 							\
-	if [ -f vmlinux ]; then						\
+	if [ ! -f vmlinux ]; then					\
+		printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \
+	elif [ -n "$(CONFIG_RUST)" ] && $(srctree)/scripts/is_rust_module.sh $@; then 		\
+		printf "Skipping BTF generation for %s because it's a Rust module\n" $@ 1>&2; \
+	else								\
 		LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) --btf_base vmlinux $@; \
 		$(RESOLVE_BTFIDS) -b vmlinux $@; 			\
-	else								\
-		printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \
 	fi;
 
 # Same as newer-prereqs, but allows to exclude specified extra dependencies
diff --git a/scripts/cc-version.sh b/scripts/cc-version.sh
index f1952c522466..2401c86fcf53 100755
--- a/scripts/cc-version.sh
+++ b/scripts/cc-version.sh
@@ -1,13 +1,13 @@
 #!/bin/sh
 # SPDX-License-Identifier: GPL-2.0
 #
-# Print the compiler name and its version in a 5 or 6-digit form.
+# Print the C compiler name and its version in a 5 or 6-digit form.
 # Also, perform the minimum version check.
 
 set -e
 
-# Print the compiler name and some version components.
-get_compiler_info()
+# Print the C compiler name and some version components.
+get_c_compiler_info()
 {
 	cat <<- EOF | "$@" -E -P -x c - 2>/dev/null
 	#if defined(__clang__)
@@ -32,7 +32,7 @@ get_canonical_version()
 
 # $@ instead of $1 because multiple words might be given, e.g. CC="ccache gcc".
 orig_args="$@"
-set -- $(get_compiler_info "$@")
+set -- $(get_c_compiler_info "$@")
 
 name=$1
 
@@ -52,7 +52,7 @@ ICC)
 	min_version=$($min_tool_version icc)
 	;;
 *)
-	echo "$orig_args: unknown compiler" >&2
+	echo "$orig_args: unknown C compiler" >&2
 	exit 1
 	;;
 esac
@@ -62,7 +62,7 @@ min_cversion=$(get_canonical_version $min_version)
 
 if [ "$cversion" -lt "$min_cversion" ]; then
 	echo >&2 "***"
-	echo >&2 "*** Compiler is too old."
+	echo >&2 "*** C compiler is too old."
 	echo >&2 "***   Your $name version:    $version"
 	echo >&2 "***   Minimum $name version: $min_version"
 	echo >&2 "***"
diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c
index c4340c90e172..b7c9f1dd5e42 100644
--- a/scripts/kconfig/confdata.c
+++ b/scripts/kconfig/confdata.c
@@ -216,6 +216,13 @@ static const char *conf_get_autoheader_name(void)
 	return name ? name : "include/generated/autoconf.h";
 }
 
+static const char *conf_get_rustccfg_name(void)
+{
+	char *name = getenv("KCONFIG_RUSTCCFG");
+
+	return name ? name : "include/generated/rustc_cfg";
+}
+
 static int conf_set_sym_val(struct symbol *sym, int def, int def_flags, char *p)
 {
 	char *p2;
@@ -605,6 +612,9 @@ static const struct comment_style comment_style_c = {
 
 static void conf_write_heading(FILE *fp, const struct comment_style *cs)
 {
+	if (!cs)
+		return;
+
 	fprintf(fp, "%s\n", cs->prefix);
 
 	fprintf(fp, "%s Automatically generated file; DO NOT EDIT.\n",
@@ -745,6 +755,65 @@ static void print_symbol_for_c(FILE *fp, struct symbol *sym)
 	free(escaped);
 }
 
+static void print_symbol_for_rustccfg(FILE *fp, struct symbol *sym)
+{
+	const char *val;
+	const char *val_prefix = "";
+	char *val_prefixed = NULL;
+	size_t val_prefixed_len;
+	char *escaped = NULL;
+
+	if (sym->type == S_UNKNOWN)
+		return;
+
+	val = sym_get_string_value(sym);
+
+	switch (sym->type) {
+	case S_BOOLEAN:
+	case S_TRISTATE:
+		/*
+		 * We do not care about disabled ones, i.e. no need for
+		 * what otherwise are "comments" in other printers.
+		 */
+		if (*val == 'n')
+			return;
+
+		/*
+		 * To have similar functionality to the C macro `IS_ENABLED()`
+		 * we provide an empty `--cfg CONFIG_X` here in both `y`
+		 * and `m` cases.
+		 *
+		 * Then, the common `fprintf()` below will also give us
+		 * a `--cfg CONFIG_X="y"` or `--cfg CONFIG_X="m"`, which can
+		 * be used as the equivalent of `IS_BUILTIN()`/`IS_MODULE()`.
+		 */
+		fprintf(fp, "--cfg=%s%s\n", CONFIG_, sym->name);
+		break;
+	case S_HEX:
+		if (val[0] != '0' || (val[1] != 'x' && val[1] != 'X'))
+			val_prefix = "0x";
+		break;
+	default:
+		break;
+	}
+
+	if (strlen(val_prefix) > 0) {
+		val_prefixed_len = strlen(val) + strlen(val_prefix) + 1;
+		val_prefixed = xmalloc(val_prefixed_len);
+		snprintf(val_prefixed, val_prefixed_len, "%s%s", val_prefix, val);
+		val = val_prefixed;
+	}
+
+	/* All values get escaped: the `--cfg` option only takes strings */
+	escaped = escape_string_value(val);
+	val = escaped;
+
+	fprintf(fp, "--cfg=%s%s=%s\n", CONFIG_, sym->name, val);
+
+	free(escaped);
+	free(val_prefixed);
+}
+
 /*
  * Write out a minimal config.
  * All values that has default values are skipped as this is redundant.
@@ -1132,6 +1201,12 @@ int conf_write_autoconf(int overwrite)
 	if (ret)
 		return ret;
 
+	ret = __conf_write_autoconf(conf_get_rustccfg_name(),
+				    print_symbol_for_rustccfg,
+				    NULL);
+	if (ret)
+		return ret;
+
 	/*
 	 * Create include/config/auto.conf. This must be the last step because
 	 * Kbuild has a dependency on auto.conf and this marks the successful
-- 
cgit v1.2.3


From d07479b211b7a86c93883c74b8f9b1e33d06e262 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 17:23:16 +0200
Subject: docs: add Rust documentation

Most of the documentation for Rust is written within the source code
itself, as it is idiomatic for Rust projects. This applies to both
the shared infrastructure at `rust/` as well as any other Rust module
(e.g. drivers) written across the kernel.

However, these documents contain general information that does not
fit particularly well in the source code, like the Quick Start guide.

It also contains a few other small changes elsewhere in the
documentation folder.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Signed-off-by: Adam Bratschi-Kaye <ark.email@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Co-developed-by: Sven Van Asbroeck <thesven73@gmail.com>
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Co-developed-by: Wu XiangCheng <bobwxc@email.cn>
Signed-off-by: Wu XiangCheng <bobwxc@email.cn>
Co-developed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Gary Guo <gary@garyguo.net>
Co-developed-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Signed-off-by: Boris-Chengbiao Zhou <bobo1239@web.de>
Co-developed-by: Yuki Okushi <jtitor@2k36.org>
Signed-off-by: Yuki Okushi <jtitor@2k36.org>
Co-developed-by: Wei Liu <wei.liu@kernel.org>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
Co-developed-by: Daniel Xu <dxu@dxuuu.xyz>
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
Co-developed-by: Julian Merkle <me@jvmerkle.de>
Signed-off-by: Julian Merkle <me@jvmerkle.de>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/doc-guide/kernel-doc.rst     |   3 +
 Documentation/index.rst                    |   1 +
 Documentation/kbuild/kbuild.rst            |  17 +++
 Documentation/kbuild/makefiles.rst         |  50 ++++++-
 Documentation/process/changes.rst          |  41 +++++
 Documentation/rust/arch-support.rst        |  18 +++
 Documentation/rust/coding-guidelines.rst   | 216 +++++++++++++++++++++++++++
 Documentation/rust/general-information.rst |  79 ++++++++++
 Documentation/rust/index.rst               |  22 +++
 Documentation/rust/quick-start.rst         | 232 +++++++++++++++++++++++++++++
 10 files changed, 675 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/rust/arch-support.rst
 create mode 100644 Documentation/rust/coding-guidelines.rst
 create mode 100644 Documentation/rust/general-information.rst
 create mode 100644 Documentation/rust/index.rst
 create mode 100644 Documentation/rust/quick-start.rst

diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst
index 9c779bd7a751..1dcbd7332476 100644
--- a/Documentation/doc-guide/kernel-doc.rst
+++ b/Documentation/doc-guide/kernel-doc.rst
@@ -14,6 +14,9 @@ when it is embedded in source files.
    reasons. The kernel source contains tens of thousands of kernel-doc
    comments. Please stick to the style described here.
 
+.. note:: kernel-doc does not cover Rust code: please see
+   Documentation/rust/general-information.rst instead.
+
 The kernel-doc structure is extracted from the comments, and proper
 `Sphinx C Domain`_ function and type descriptions with anchors are
 generated from them. The descriptions are filtered for special kernel-doc
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 4737c18c97ff..00722aa20cd7 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -82,6 +82,7 @@ merged much easier.
    maintainer/index
    fault-injection/index
    livepatch/index
+   rust/index
 
 
 Kernel API documentation
diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst
index ef19b9c13523..08f575e6236c 100644
--- a/Documentation/kbuild/kbuild.rst
+++ b/Documentation/kbuild/kbuild.rst
@@ -48,6 +48,10 @@ KCFLAGS
 -------
 Additional options to the C compiler (for built-in and modules).
 
+KRUSTFLAGS
+----------
+Additional options to the Rust compiler (for built-in and modules).
+
 CFLAGS_KERNEL
 -------------
 Additional options for $(CC) when used to compile
@@ -57,6 +61,15 @@ CFLAGS_MODULE
 -------------
 Additional module specific options to use for $(CC).
 
+RUSTFLAGS_KERNEL
+----------------
+Additional options for $(RUSTC) when used to compile
+code that is compiled as built-in.
+
+RUSTFLAGS_MODULE
+----------------
+Additional module specific options to use for $(RUSTC).
+
 LDFLAGS_MODULE
 --------------
 Additional options used for $(LD) when linking modules.
@@ -69,6 +82,10 @@ HOSTCXXFLAGS
 ------------
 Additional flags to be passed to $(HOSTCXX) when building host programs.
 
+HOSTRUSTFLAGS
+-------------
+Additional flags to be passed to $(HOSTRUSTC) when building host programs.
+
 HOSTLDFLAGS
 -----------
 Additional flags to be passed when linking host programs.
diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
index 11a296e52d68..5ea1e72d89c8 100644
--- a/Documentation/kbuild/makefiles.rst
+++ b/Documentation/kbuild/makefiles.rst
@@ -29,8 +29,9 @@ This document describes the Linux kernel Makefiles.
 	   --- 4.1 Simple Host Program
 	   --- 4.2 Composite Host Programs
 	   --- 4.3 Using C++ for host programs
-	   --- 4.4 Controlling compiler options for host programs
-	   --- 4.5 When host programs are actually built
+	   --- 4.4 Using Rust for host programs
+	   --- 4.5 Controlling compiler options for host programs
+	   --- 4.6 When host programs are actually built
 
 	=== 5 Userspace Program support
 	   --- 5.1 Simple Userspace Program
@@ -835,7 +836,24 @@ Both possibilities are described in the following.
 		qconf-cxxobjs := qconf.o
 		qconf-objs    := check.o
 
-4.4 Controlling compiler options for host programs
+4.4 Using Rust for host programs
+--------------------------------
+
+	Kbuild offers support for host programs written in Rust. However,
+	since a Rust toolchain is not mandatory for kernel compilation,
+	it may only be used in scenarios where Rust is required to be
+	available (e.g. when  ``CONFIG_RUST`` is enabled).
+
+	Example::
+
+		hostprogs     := target
+		target-rust   := y
+
+	Kbuild will compile ``target`` using ``target.rs`` as the crate root,
+	located in the same directory as the ``Makefile``. The crate may
+	consist of several source files (see ``samples/rust/hostprogs``).
+
+4.5 Controlling compiler options for host programs
 --------------------------------------------------
 
 	When compiling host programs, it is possible to set specific flags.
@@ -867,7 +885,7 @@ Both possibilities are described in the following.
 	When linking qconf, it will be passed the extra option
 	"-L$(QTDIR)/lib".
 
-4.5 When host programs are actually built
+4.6 When host programs are actually built
 -----------------------------------------
 
 	Kbuild will only build host-programs when they are referenced
@@ -1181,6 +1199,17 @@ When kbuild executes, the following steps are followed (roughly):
 	The first example utilises the trick that a config option expands
 	to 'y' when selected.
 
+    KBUILD_RUSTFLAGS
+	$(RUSTC) compiler flags
+
+	Default value - see top level Makefile
+	Append or modify as required per architecture.
+
+	Often, the KBUILD_RUSTFLAGS variable depends on the configuration.
+
+	Note that target specification file generation (for ``--target``)
+	is handled in ``scripts/generate_rust_target.rs``.
+
     KBUILD_AFLAGS_KERNEL
 	Assembler options specific for built-in
 
@@ -1208,6 +1237,19 @@ When kbuild executes, the following steps are followed (roughly):
 	are used for $(CC).
 	From commandline CFLAGS_MODULE shall be used (see kbuild.rst).
 
+    KBUILD_RUSTFLAGS_KERNEL
+	$(RUSTC) options specific for built-in
+
+	$(KBUILD_RUSTFLAGS_KERNEL) contains extra Rust compiler flags used to
+	compile resident kernel code.
+
+    KBUILD_RUSTFLAGS_MODULE
+	Options for $(RUSTC) when building modules
+
+	$(KBUILD_RUSTFLAGS_MODULE) is used to add arch-specific options that
+	are used for $(RUSTC).
+	From commandline RUSTFLAGS_MODULE shall be used (see kbuild.rst).
+
     KBUILD_LDFLAGS_MODULE
 	Options for $(LD) when linking modules
 
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 19c286c23786..9a90197989dd 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -31,6 +31,8 @@ you probably needn't concern yourself with pcmciautils.
 ====================== ===============  ========================================
 GNU C                  5.1              gcc --version
 Clang/LLVM (optional)  11.0.0           clang --version
+Rust (optional)        1.62.0           rustc --version
+bindgen (optional)     0.56.0           bindgen --version
 GNU make               3.81             make --version
 bash                   4.2              bash --version
 binutils               2.23             ld -v
@@ -80,6 +82,29 @@ kernels. Older releases aren't guaranteed to work, and we may drop workarounds
 from the kernel that were used to support older versions. Please see additional
 docs on :ref:`Building Linux with Clang/LLVM <kbuild_llvm>`.
 
+Rust (optional)
+---------------
+
+A particular version of the Rust toolchain is required. Newer versions may or
+may not work because the kernel depends on some unstable Rust features, for
+the moment.
+
+Each Rust toolchain comes with several "components", some of which are required
+(like ``rustc``) and some that are optional. The ``rust-src`` component (which
+is optional) needs to be installed to build the kernel. Other components are
+useful for developing.
+
+Please see Documentation/rust/quick-start.rst for instructions on how to
+satisfy the build requirements of Rust support. In particular, the ``Makefile``
+target ``rustavailable`` is useful to check why the Rust toolchain may not
+be detected.
+
+bindgen (optional)
+------------------
+
+``bindgen`` is used to generate the Rust bindings to the C side of the kernel.
+It depends on ``libclang``.
+
 Make
 ----
 
@@ -348,6 +373,12 @@ Sphinx
 Please see :ref:`sphinx_install` in :ref:`Documentation/doc-guide/sphinx.rst <sphinxdoc>`
 for details about Sphinx requirements.
 
+rustdoc
+-------
+
+``rustdoc`` is used to generate the documentation for Rust code. Please see
+Documentation/rust/general-information.rst for more information.
+
 Getting updated software
 ========================
 
@@ -364,6 +395,16 @@ Clang/LLVM
 
 - :ref:`Getting LLVM <getting_llvm>`.
 
+Rust
+----
+
+- Documentation/rust/quick-start.rst.
+
+bindgen
+-------
+
+- Documentation/rust/quick-start.rst.
+
 Make
 ----
 
diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst
new file mode 100644
index 000000000000..1152e0fbdad0
--- /dev/null
+++ b/Documentation/rust/arch-support.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Arch Support
+============
+
+Currently, the Rust compiler (``rustc``) uses LLVM for code generation,
+which limits the supported architectures that can be targeted. In addition,
+support for building the kernel with LLVM/Clang varies (please see
+Documentation/kbuild/llvm.rst). This support is needed for ``bindgen``
+which uses ``libclang``.
+
+Below is a general summary of architectures that currently work. Level of
+support corresponds to ``S`` values in the ``MAINTAINERS`` file.
+
+============  ================  ==============================================
+Architecture  Level of support  Constraints
+============  ================  ==============================================
+============  ================  ==============================================
diff --git a/Documentation/rust/coding-guidelines.rst b/Documentation/rust/coding-guidelines.rst
new file mode 100644
index 000000000000..aa8ed082613e
--- /dev/null
+++ b/Documentation/rust/coding-guidelines.rst
@@ -0,0 +1,216 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Coding Guidelines
+=================
+
+This document describes how to write Rust code in the kernel.
+
+
+Style & formatting
+------------------
+
+The code should be formatted using ``rustfmt``. In this way, a person
+contributing from time to time to the kernel does not need to learn and
+remember one more style guide. More importantly, reviewers and maintainers
+do not need to spend time pointing out style issues anymore, and thus
+less patch roundtrips may be needed to land a change.
+
+.. note:: Conventions on comments and documentation are not checked by
+  ``rustfmt``. Thus those are still needed to be taken care of.
+
+The default settings of ``rustfmt`` are used. This means the idiomatic Rust
+style is followed. For instance, 4 spaces are used for indentation rather
+than tabs.
+
+It is convenient to instruct editors/IDEs to format while typing,
+when saving or at commit time. However, if for some reason reformatting
+the entire kernel Rust sources is needed at some point, the following can be
+run::
+
+	make LLVM=1 rustfmt
+
+It is also possible to check if everything is formatted (printing a diff
+otherwise), for instance for a CI, with::
+
+	make LLVM=1 rustfmtcheck
+
+Like ``clang-format`` for the rest of the kernel, ``rustfmt`` works on
+individual files, and does not require a kernel configuration. Sometimes it may
+even work with broken code.
+
+
+Comments
+--------
+
+"Normal" comments (i.e. ``//``, rather than code documentation which starts
+with ``///`` or ``//!``) are written in Markdown the same way as documentation
+comments are, even though they will not be rendered. This improves consistency,
+simplifies the rules and allows to move content between the two kinds of
+comments more easily. For instance:
+
+.. code-block:: rust
+
+	// `object` is ready to be handled now.
+	f(object);
+
+Furthermore, just like documentation, comments are capitalized at the beginning
+of a sentence and ended with a period (even if it is a single sentence). This
+includes ``// SAFETY:``, ``// TODO:`` and other "tagged" comments, e.g.:
+
+.. code-block:: rust
+
+	// FIXME: The error should be handled properly.
+
+Comments should not be used for documentation purposes: comments are intended
+for implementation details, not users. This distinction is useful even if the
+reader of the source file is both an implementor and a user of an API. In fact,
+sometimes it is useful to use both comments and documentation at the same time.
+For instance, for a ``TODO`` list or to comment on the documentation itself.
+For the latter case, comments can be inserted in the middle; that is, closer to
+the line of documentation to be commented. For any other case, comments are
+written after the documentation, e.g.:
+
+.. code-block:: rust
+
+	/// Returns a new [`Foo`].
+	///
+	/// # Examples
+	///
+	// TODO: Find a better example.
+	/// ```
+	/// let foo = f(42);
+	/// ```
+	// FIXME: Use fallible approach.
+	pub fn f(x: i32) -> Foo {
+	    // ...
+	}
+
+One special kind of comments are the ``// SAFETY:`` comments. These must appear
+before every ``unsafe`` block, and they explain why the code inside the block is
+correct/sound, i.e. why it cannot trigger undefined behavior in any case, e.g.:
+
+.. code-block:: rust
+
+	// SAFETY: `p` is valid by the safety requirements.
+	unsafe { *p = 0; }
+
+``// SAFETY:`` comments are not to be confused with the ``# Safety`` sections
+in code documentation. ``# Safety`` sections specify the contract that callers
+(for functions) or implementors (for traits) need to abide by. ``// SAFETY:``
+comments show why a call (for functions) or implementation (for traits) actually
+respects the preconditions stated in a ``# Safety`` section or the language
+reference.
+
+
+Code documentation
+------------------
+
+Rust kernel code is not documented like C kernel code (i.e. via kernel-doc).
+Instead, the usual system for documenting Rust code is used: the ``rustdoc``
+tool, which uses Markdown (a lightweight markup language).
+
+To learn Markdown, there are many guides available out there. For instance,
+the one at:
+
+	https://commonmark.org/help/
+
+This is how a well-documented Rust function may look like:
+
+.. code-block:: rust
+
+	/// Returns the contained [`Some`] value, consuming the `self` value,
+	/// without checking that the value is not [`None`].
+	///
+	/// # Safety
+	///
+	/// Calling this method on [`None`] is *[undefined behavior]*.
+	///
+	/// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html
+	///
+	/// # Examples
+	///
+	/// ```
+	/// let x = Some("air");
+	/// assert_eq!(unsafe { x.unwrap_unchecked() }, "air");
+	/// ```
+	pub unsafe fn unwrap_unchecked(self) -> T {
+	    match self {
+	        Some(val) => val,
+
+	        // SAFETY: The safety contract must be upheld by the caller.
+	        None => unsafe { hint::unreachable_unchecked() },
+	    }
+	}
+
+This example showcases a few ``rustdoc`` features and some conventions followed
+in the kernel:
+
+  - The first paragraph must be a single sentence briefly describing what
+    the documented item does. Further explanations must go in extra paragraphs.
+
+  - Unsafe functions must document their safety preconditions under
+    a ``# Safety`` section.
+
+  - While not shown here, if a function may panic, the conditions under which
+    that happens must be described under a ``# Panics`` section.
+
+    Please note that panicking should be very rare and used only with a good
+    reason. In almost all cases, a fallible approach should be used, typically
+    returning a ``Result``.
+
+  - If providing examples of usage would help readers, they must be written in
+    a section called ``# Examples``.
+
+  - Rust items (functions, types, constants...) must be linked appropriately
+    (``rustdoc`` will create a link automatically).
+
+  - Any ``unsafe`` block must be preceded by a ``// SAFETY:`` comment
+    describing why the code inside is sound.
+
+    While sometimes the reason might look trivial and therefore unneeded,
+    writing these comments is not just a good way of documenting what has been
+    taken into account, but most importantly, it provides a way to know that
+    there are no *extra* implicit constraints.
+
+To learn more about how to write documentation for Rust and extra features,
+please take a look at the ``rustdoc`` book at:
+
+	https://doc.rust-lang.org/rustdoc/how-to-write-documentation.html
+
+
+Naming
+------
+
+Rust kernel code follows the usual Rust naming conventions:
+
+	https://rust-lang.github.io/api-guidelines/naming.html
+
+When existing C concepts (e.g. macros, functions, objects...) are wrapped into
+a Rust abstraction, a name as close as reasonably possible to the C side should
+be used in order to avoid confusion and to improve readability when switching
+back and forth between the C and Rust sides. For instance, macros such as
+``pr_info`` from C are named the same in the Rust side.
+
+Having said that, casing should be adjusted to follow the Rust naming
+conventions, and namespacing introduced by modules and types should not be
+repeated in the item names. For instance, when wrapping constants like:
+
+.. code-block:: c
+
+	#define GPIO_LINE_DIRECTION_IN	0
+	#define GPIO_LINE_DIRECTION_OUT	1
+
+The equivalent in Rust may look like (ignoring documentation):
+
+.. code-block:: rust
+
+	pub mod gpio {
+	    pub enum LineDirection {
+	        In = bindings::GPIO_LINE_DIRECTION_IN as _,
+	        Out = bindings::GPIO_LINE_DIRECTION_OUT as _,
+	    }
+	}
+
+That is, the equivalent of ``GPIO_LINE_DIRECTION_IN`` would be referred to as
+``gpio::LineDirection::In``. In particular, it should not be named
+``gpio::gpio_line_direction::GPIO_LINE_DIRECTION_IN``.
diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst
new file mode 100644
index 000000000000..49029ee82e55
--- /dev/null
+++ b/Documentation/rust/general-information.rst
@@ -0,0 +1,79 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+General Information
+===================
+
+This document contains useful information to know when working with
+the Rust support in the kernel.
+
+
+Code documentation
+------------------
+
+Rust kernel code is documented using ``rustdoc``, its built-in documentation
+generator.
+
+The generated HTML docs include integrated search, linked items (e.g. types,
+functions, constants), source code, etc. They may be read at (TODO: link when
+in mainline and generated alongside the rest of the documentation):
+
+	http://kernel.org/
+
+The docs can also be easily generated and read locally. This is quite fast
+(same order as compiling the code itself) and no special tools or environment
+are needed. This has the added advantage that they will be tailored to
+the particular kernel configuration used. To generate them, use the ``rustdoc``
+target with the same invocation used for compilation, e.g.::
+
+	make LLVM=1 rustdoc
+
+To read the docs locally in your web browser, run e.g.::
+
+	xdg-open rust/doc/kernel/index.html
+
+To learn about how to write the documentation, please see coding-guidelines.rst.
+
+
+Extra lints
+-----------
+
+While ``rustc`` is a very helpful compiler, some extra lints and analyses are
+available via ``clippy``, a Rust linter. To enable it, pass ``CLIPPY=1`` to
+the same invocation used for compilation, e.g.::
+
+	make LLVM=1 CLIPPY=1
+
+Please note that Clippy may change code generation, thus it should not be
+enabled while building a production kernel.
+
+
+Abstractions vs. bindings
+-------------------------
+
+Abstractions are Rust code wrapping kernel functionality from the C side.
+
+In order to use functions and types from the C side, bindings are created.
+Bindings are the declarations for Rust of those functions and types from
+the C side.
+
+For instance, one may write a ``Mutex`` abstraction in Rust which wraps
+a ``struct mutex`` from the C side and calls its functions through the bindings.
+
+Abstractions are not available for all the kernel internal APIs and concepts,
+but it is intended that coverage is expanded as time goes on. "Leaf" modules
+(e.g. drivers) should not use the C bindings directly. Instead, subsystems
+should provide as-safe-as-possible abstractions as needed.
+
+
+Conditional compilation
+-----------------------
+
+Rust code has access to conditional compilation based on the kernel
+configuration:
+
+.. code-block:: rust
+
+	#[cfg(CONFIG_X)]       // Enabled               (`y` or `m`)
+	#[cfg(CONFIG_X="y")]   // Enabled as a built-in (`y`)
+	#[cfg(CONFIG_X="m")]   // Enabled as a module   (`m`)
+	#[cfg(not(CONFIG_X))]  // Disabled
diff --git a/Documentation/rust/index.rst b/Documentation/rust/index.rst
new file mode 100644
index 000000000000..4ae8c66b94fa
--- /dev/null
+++ b/Documentation/rust/index.rst
@@ -0,0 +1,22 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Rust
+====
+
+Documentation related to Rust within the kernel. To start using Rust
+in the kernel, please read the quick-start.rst guide.
+
+.. toctree::
+    :maxdepth: 1
+
+    quick-start
+    general-information
+    coding-guidelines
+    arch-support
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst
new file mode 100644
index 000000000000..13b7744b1e27
--- /dev/null
+++ b/Documentation/rust/quick-start.rst
@@ -0,0 +1,232 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Quick Start
+===========
+
+This document describes how to get started with kernel development in Rust.
+
+
+Requirements: Building
+----------------------
+
+This section explains how to fetch the tools needed for building.
+
+Some of these requirements might be available from Linux distributions
+under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However,
+at the time of writing, they are likely not to be recent enough unless
+the distribution tracks the latest releases.
+
+To easily check whether the requirements are met, the following target
+can be used::
+
+	make LLVM=1 rustavailable
+
+This triggers the same logic used by Kconfig to determine whether
+``RUST_IS_AVAILABLE`` should be enabled; but it also explains why not
+if that is the case.
+
+
+rustc
+*****
+
+A particular version of the Rust compiler is required. Newer versions may or
+may not work because, for the moment, the kernel depends on some unstable
+Rust features.
+
+If ``rustup`` is being used, enter the checked out source code directory
+and run::
+
+	rustup override set $(scripts/min-tool-version.sh rustc)
+
+Otherwise, fetch a standalone installer or install ``rustup`` from:
+
+	https://www.rust-lang.org
+
+
+Rust standard library source
+****************************
+
+The Rust standard library source is required because the build system will
+cross-compile ``core`` and ``alloc``.
+
+If ``rustup`` is being used, run::
+
+	rustup component add rust-src
+
+The components are installed per toolchain, thus upgrading the Rust compiler
+version later on requires re-adding the component.
+
+Otherwise, if a standalone installer is used, the Rust repository may be cloned
+into the installation folder of the toolchain::
+
+	git clone --recurse-submodules \
+		--branch $(scripts/min-tool-version.sh rustc) \
+		https://github.com/rust-lang/rust \
+		$(rustc --print sysroot)/lib/rustlib/src/rust
+
+In this case, upgrading the Rust compiler version later on requires manually
+updating this clone.
+
+
+libclang
+********
+
+``libclang`` (part of LLVM) is used by ``bindgen`` to understand the C code
+in the kernel, which means LLVM needs to be installed; like when the kernel
+is compiled with ``CC=clang`` or ``LLVM=1``.
+
+Linux distributions are likely to have a suitable one available, so it is
+best to check that first.
+
+There are also some binaries for several systems and architectures uploaded at:
+
+	https://releases.llvm.org/download.html
+
+Otherwise, building LLVM takes quite a while, but it is not a complex process:
+
+	https://llvm.org/docs/GettingStarted.html#getting-the-source-code-and-building-llvm
+
+Please see Documentation/kbuild/llvm.rst for more information and further ways
+to fetch pre-built releases and distribution packages.
+
+
+bindgen
+*******
+
+The bindings to the C side of the kernel are generated at build time using
+the ``bindgen`` tool. A particular version is required.
+
+Install it via (note that this will download and build the tool from source)::
+
+	cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen
+
+
+Requirements: Developing
+------------------------
+
+This section explains how to fetch the tools needed for developing. That is,
+they are not needed when just building the kernel.
+
+
+rustfmt
+*******
+
+The ``rustfmt`` tool is used to automatically format all the Rust kernel code,
+including the generated C bindings (for details, please see
+coding-guidelines.rst).
+
+If ``rustup`` is being used, its ``default`` profile already installs the tool,
+thus nothing needs to be done. If another profile is being used, the component
+can be installed manually::
+
+	rustup component add rustfmt
+
+The standalone installers also come with ``rustfmt``.
+
+
+clippy
+******
+
+``clippy`` is a Rust linter. Running it provides extra warnings for Rust code.
+It can be run by passing ``CLIPPY=1`` to ``make`` (for details, please see
+general-information.rst).
+
+If ``rustup`` is being used, its ``default`` profile already installs the tool,
+thus nothing needs to be done. If another profile is being used, the component
+can be installed manually::
+
+	rustup component add clippy
+
+The standalone installers also come with ``clippy``.
+
+
+cargo
+*****
+
+``cargo`` is the Rust native build system. It is currently required to run
+the tests since it is used to build a custom standard library that contains
+the facilities provided by the custom ``alloc`` in the kernel. The tests can
+be run using the ``rusttest`` Make target.
+
+If ``rustup`` is being used, all the profiles already install the tool,
+thus nothing needs to be done.
+
+The standalone installers also come with ``cargo``.
+
+
+rustdoc
+*******
+
+``rustdoc`` is the documentation tool for Rust. It generates pretty HTML
+documentation for Rust code (for details, please see
+general-information.rst).
+
+``rustdoc`` is also used to test the examples provided in documented Rust code
+(called doctests or documentation tests). The ``rusttest`` Make target uses
+this feature.
+
+If ``rustup`` is being used, all the profiles already install the tool,
+thus nothing needs to be done.
+
+The standalone installers also come with ``rustdoc``.
+
+
+rust-analyzer
+*************
+
+The `rust-analyzer <https://rust-analyzer.github.io/>`_ language server can
+be used with many editors to enable syntax highlighting, completion, go to
+definition, and other features.
+
+``rust-analyzer`` needs a configuration file, ``rust-project.json``, which
+can be generated by the ``rust-analyzer`` Make target.
+
+
+Configuration
+-------------
+
+``Rust support`` (``CONFIG_RUST``) needs to be enabled in the ``General setup``
+menu. The option is only shown if a suitable Rust toolchain is found (see
+above), as long as the other requirements are met. In turn, this will make
+visible the rest of options that depend on Rust.
+
+Afterwards, go to::
+
+	Kernel hacking
+	    -> Sample kernel code
+	        -> Rust samples
+
+And enable some sample modules either as built-in or as loadable.
+
+
+Building
+--------
+
+Building a kernel with a complete LLVM toolchain is the best supported setup
+at the moment. That is::
+
+	make LLVM=1
+
+For architectures that do not support a full LLVM toolchain, use::
+
+	make CC=clang
+
+Using GCC also works for some configurations, but it is very experimental at
+the moment.
+
+
+Hacking
+-------
+
+To dive deeper, take a look at the source code of the samples
+at ``samples/rust/``, the Rust support code under ``rust/`` and
+the ``Rust hacking`` menu under ``Kernel hacking``.
+
+If GDB/Binutils is used and Rust symbols are not getting demangled, the reason
+is the toolchain does not support Rust's new v0 mangling scheme yet.
+There are a few ways out:
+
+  - Install a newer release (GDB >= 10.2, Binutils >= 2.36).
+
+  - Some versions of GDB (e.g. vanilla GDB 10.1) are able to use
+    the pre-demangled names embedded in the debug info (``CONFIG_DEBUG_INFO``).
-- 
cgit v1.2.3


From 094981352ce27bc36018c009d07ddf974c9725f5 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Thu, 4 Aug 2022 12:16:44 +0200
Subject: x86: enable initial Rust support

Note that only x86_64 is covered and not all features nor mitigations
are handled, but it is enough as a starting point and showcases
the basics needed to add Rust support for a new architecture.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: David Gow <davidgow@google.com>
Signed-off-by: David Gow <davidgow@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 Documentation/rust/arch-support.rst |  1 +
 arch/x86/Kconfig                    |  1 +
 arch/x86/Makefile                   | 10 ++++++++++
 scripts/generate_rust_target.rs     | 15 +++++++++++++--
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst
index 1152e0fbdad0..6982b63775da 100644
--- a/Documentation/rust/arch-support.rst
+++ b/Documentation/rust/arch-support.rst
@@ -15,4 +15,5 @@ support corresponds to ``S`` values in the ``MAINTAINERS`` file.
 ============  ================  ==============================================
 Architecture  Level of support  Constraints
 ============  ================  ==============================================
+``x86``       Maintained        ``x86_64`` only.
 ============  ================  ==============================================
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..3ca198742b10 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -257,6 +257,7 @@ config X86
 	select HAVE_STATIC_CALL_INLINE		if HAVE_OBJTOOL
 	select HAVE_PREEMPT_DYNAMIC_CALL
 	select HAVE_RSEQ
+	select HAVE_RUST			if X86_64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UACCESS_VALIDATION		if HAVE_OBJTOOL
 	select HAVE_UNSTABLE_SCHED_CLOCK
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index bafbd905e6e7..2d7e640674c6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -68,6 +68,7 @@ export BITS
 #    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
 #
 KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
 
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
 #
@@ -155,8 +156,17 @@ else
         cflags-$(CONFIG_GENERIC_CPU)	+= -mtune=generic
         KBUILD_CFLAGS += $(cflags-y)
 
+        rustflags-$(CONFIG_MK8)		+= -Ctarget-cpu=k8
+        rustflags-$(CONFIG_MPSC)	+= -Ctarget-cpu=nocona
+        rustflags-$(CONFIG_MCORE2)	+= -Ctarget-cpu=core2
+        rustflags-$(CONFIG_MATOM)	+= -Ctarget-cpu=atom
+        rustflags-$(CONFIG_GENERIC_CPU)	+= -Ztune-cpu=generic
+        KBUILD_RUSTFLAGS += $(rustflags-y)
+
         KBUILD_CFLAGS += -mno-red-zone
         KBUILD_CFLAGS += -mcmodel=kernel
+        KBUILD_RUSTFLAGS += -Cno-redzone=y
+        KBUILD_RUSTFLAGS += -Ccode-model=kernel
 endif
 
 #
diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs
index 7256c9606cf0..3c6cbe2b278d 100644
--- a/scripts/generate_rust_target.rs
+++ b/scripts/generate_rust_target.rs
@@ -148,8 +148,19 @@ fn main() {
     let mut ts = TargetSpec::new();
 
     // `llvm-target`s are taken from `scripts/Makefile.clang`.
-    if cfg.has("DUMMY_ARCH") {
-        ts.push("arch", "dummy_arch");
+    if cfg.has("X86_64") {
+        ts.push("arch", "x86_64");
+        ts.push(
+            "data-layout",
+            "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
+        );
+        let mut features = "-3dnow,-3dnowa,-mmx,+soft-float".to_string();
+        if cfg.has("RETPOLINE") {
+            features += ",+retpoline-external-thunk";
+        }
+        ts.push("features", features);
+        ts.push("llvm-target", "x86_64-linux-gnu");
+        ts.push("target-pointer-width", "64");
     } else {
         panic!("Unsupported architecture");
     }
-- 
cgit v1.2.3


From e4fc6580b0796bcba8ca12c2c4b0352d280c91e5 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 17:21:12 +0200
Subject: samples: add first Rust examples

The beginning of a set of Rust modules that showcase how Rust
modules look like and how to use the abstracted kernel features.

It also includes an example of a Rust host program with
several modules.

These samples also double as tests in the CI.

Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Finn Behrens <me@kloenk.de>
Signed-off-by: Finn Behrens <me@kloenk.de>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Co-developed-by: Milan Landaverde <milan@mdaverde.com>
Signed-off-by: Milan Landaverde <milan@mdaverde.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 samples/Kconfig                   |  2 ++
 samples/Makefile                  |  1 +
 samples/rust/Kconfig              | 30 ++++++++++++++++++++++++++++++
 samples/rust/Makefile             |  5 +++++
 samples/rust/hostprogs/.gitignore |  3 +++
 samples/rust/hostprogs/Makefile   |  5 +++++
 samples/rust/hostprogs/a.rs       |  7 +++++++
 samples/rust/hostprogs/b.rs       |  5 +++++
 samples/rust/hostprogs/single.rs  | 12 ++++++++++++
 samples/rust/rust_minimal.rs      | 38 ++++++++++++++++++++++++++++++++++++++
 10 files changed, 108 insertions(+)
 create mode 100644 samples/rust/Kconfig
 create mode 100644 samples/rust/Makefile
 create mode 100644 samples/rust/hostprogs/.gitignore
 create mode 100644 samples/rust/hostprogs/Makefile
 create mode 100644 samples/rust/hostprogs/a.rs
 create mode 100644 samples/rust/hostprogs/b.rs
 create mode 100644 samples/rust/hostprogs/single.rs
 create mode 100644 samples/rust/rust_minimal.rs

diff --git a/samples/Kconfig b/samples/Kconfig
index 470ee3baf2e1..0d81c00289ee 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -263,6 +263,8 @@ config SAMPLE_CORESIGHT_SYSCFG
 	  This demonstrates how a user may create their own CoreSight
 	  configurations and easily load them into the system at runtime.
 
+source "samples/rust/Kconfig"
+
 endif # SAMPLES
 
 config HAVE_SAMPLE_FTRACE_DIRECT
diff --git a/samples/Makefile b/samples/Makefile
index 701e912ab5af..9832ef3f8fcb 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -35,3 +35,4 @@ subdir-$(CONFIG_SAMPLE_WATCH_QUEUE)	+= watch_queue
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST)	+= kmemleak/
 obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG)	+= coresight/
 obj-$(CONFIG_SAMPLE_FPROBE)		+= fprobe/
+obj-$(CONFIG_SAMPLES_RUST)		+= rust/
diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig
new file mode 100644
index 000000000000..841e0906e943
--- /dev/null
+++ b/samples/rust/Kconfig
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menuconfig SAMPLES_RUST
+	bool "Rust samples"
+	depends on RUST
+	help
+	  You can build sample Rust kernel code here.
+
+	  If unsure, say N.
+
+if SAMPLES_RUST
+
+config SAMPLE_RUST_MINIMAL
+	tristate "Minimal"
+	help
+	  This option builds the Rust minimal module sample.
+
+	  To compile this as a module, choose M here:
+	  the module will be called rust_minimal.
+
+	  If unsure, say N.
+
+config SAMPLE_RUST_HOSTPROGS
+	bool "Host programs"
+	help
+	  This option builds the Rust host program samples.
+
+	  If unsure, say N.
+
+endif # SAMPLES_RUST
diff --git a/samples/rust/Makefile b/samples/rust/Makefile
new file mode 100644
index 000000000000..1daba5f8658a
--- /dev/null
+++ b/samples/rust/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_SAMPLE_RUST_MINIMAL)		+= rust_minimal.o
+
+subdir-$(CONFIG_SAMPLE_RUST_HOSTPROGS)		+= hostprogs
diff --git a/samples/rust/hostprogs/.gitignore b/samples/rust/hostprogs/.gitignore
new file mode 100644
index 000000000000..a6c173da5048
--- /dev/null
+++ b/samples/rust/hostprogs/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+single
diff --git a/samples/rust/hostprogs/Makefile b/samples/rust/hostprogs/Makefile
new file mode 100644
index 000000000000..8ddcbd7416db
--- /dev/null
+++ b/samples/rust/hostprogs/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+hostprogs-always-y := single
+
+single-rust := y
diff --git a/samples/rust/hostprogs/a.rs b/samples/rust/hostprogs/a.rs
new file mode 100644
index 000000000000..f7a4a3d0f4e0
--- /dev/null
+++ b/samples/rust/hostprogs/a.rs
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust single host program sample: module `a`.
+
+pub(crate) fn f(x: i32) {
+    println!("The number is {}.", x);
+}
diff --git a/samples/rust/hostprogs/b.rs b/samples/rust/hostprogs/b.rs
new file mode 100644
index 000000000000..c1675890648f
--- /dev/null
+++ b/samples/rust/hostprogs/b.rs
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust single host program sample: module `b`.
+
+pub(crate) const CONSTANT: i32 = 42;
diff --git a/samples/rust/hostprogs/single.rs b/samples/rust/hostprogs/single.rs
new file mode 100644
index 000000000000..8c48a119339a
--- /dev/null
+++ b/samples/rust/hostprogs/single.rs
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust single host program sample.
+
+mod a;
+mod b;
+
+fn main() {
+    println!("Hello world!");
+
+    a::f(b::CONSTANT);
+}
diff --git a/samples/rust/rust_minimal.rs b/samples/rust/rust_minimal.rs
new file mode 100644
index 000000000000..54ad17685742
--- /dev/null
+++ b/samples/rust/rust_minimal.rs
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Rust minimal sample.
+
+use kernel::prelude::*;
+
+module! {
+    type: RustMinimal,
+    name: b"rust_minimal",
+    author: b"Rust for Linux Contributors",
+    description: b"Rust minimal sample",
+    license: b"GPL",
+}
+
+struct RustMinimal {
+    numbers: Vec<i32>,
+}
+
+impl kernel::Module for RustMinimal {
+    fn init(_module: &'static ThisModule) -> Result<Self> {
+        pr_info!("Rust minimal sample (init)\n");
+        pr_info!("Am I built-in? {}\n", !cfg!(MODULE));
+
+        let mut numbers = Vec::new();
+        numbers.try_push(72)?;
+        numbers.try_push(108)?;
+        numbers.try_push(200)?;
+
+        Ok(RustMinimal { numbers })
+    }
+}
+
+impl Drop for RustMinimal {
+    fn drop(&mut self) {
+        pr_info!("My numbers are {:?}\n", self.numbers);
+        pr_info!("Rust minimal sample (exit)\n");
+    }
+}
-- 
cgit v1.2.3


From 4f4920669d21e1060b7243e5118dc3b71ced1276 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Sat, 24 Sep 2022 16:01:57 +0800
Subject: xfrm: Reinject transport-mode packets through workqueue

The following warning is displayed when the tcp6-multi-diffip11 stress
test case of the LTP test suite is tested:

watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [ns-tcpserver:48198]
CPU: 0 PID: 48198 Comm: ns-tcpserver Kdump: loaded Not tainted 6.0.0-rc6+ #39
Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
pc : des3_ede_encrypt+0x27c/0x460 [libdes]
lr : 0x3f
sp : ffff80000ceaa1b0
x29: ffff80000ceaa1b0 x28: ffff0000df056100 x27: ffff0000e51e5280
x26: ffff80004df75030 x25: ffff0000e51e4600 x24: 000000000000003b
x23: 0000000000802080 x22: 000000000000003d x21: 0000000000000038
x20: 0000000080000020 x19: 000000000000000a x18: 0000000000000033
x17: ffff0000e51e4780 x16: ffff80004e2d1448 x15: ffff80004e2d1248
x14: ffff0000e51e4680 x13: ffff80004e2d1348 x12: ffff80004e2d1548
x11: ffff80004e2d1848 x10: ffff80004e2d1648 x9 : ffff80004e2d1748
x8 : ffff80004e2d1948 x7 : 000000000bcaf83d x6 : 000000000000001b
x5 : ffff80004e2d1048 x4 : 00000000761bf3bf x3 : 000000007f1dd0a3
x2 : ffff0000e51e4780 x1 : ffff0000e3b9a2f8 x0 : 00000000db44e872
Call trace:
 des3_ede_encrypt+0x27c/0x460 [libdes]
 crypto_des3_ede_encrypt+0x1c/0x30 [des_generic]
 crypto_cbc_encrypt+0x148/0x190
 crypto_skcipher_encrypt+0x2c/0x40
 crypto_authenc_encrypt+0xc8/0xfc [authenc]
 crypto_aead_encrypt+0x2c/0x40
 echainiv_encrypt+0x144/0x1a0 [echainiv]
 crypto_aead_encrypt+0x2c/0x40
 esp6_output_tail+0x1c8/0x5d0 [esp6]
 esp6_output+0x120/0x278 [esp6]
 xfrm_output_one+0x458/0x4ec
 xfrm_output_resume+0x6c/0x1f0
 xfrm_output+0xac/0x4ac
 __xfrm6_output+0x130/0x270
 xfrm6_output+0x60/0xec
 ip6_xmit+0x2ec/0x5bc
 inet6_csk_xmit+0xbc/0x10c
 __tcp_transmit_skb+0x460/0x8c0
 tcp_write_xmit+0x348/0x890
 __tcp_push_pending_frames+0x44/0x110
 tcp_rcv_established+0x3c8/0x720
 tcp_v6_do_rcv+0xdc/0x4a0
 tcp_v6_rcv+0xc24/0xcb0
 ip6_protocol_deliver_rcu+0xf0/0x574
 ip6_input_finish+0x48/0x7c
 ip6_input+0x48/0xc0
 ip6_rcv_finish+0x80/0x9c
 xfrm_trans_reinject+0xb0/0xf4
 tasklet_action_common.constprop.0+0xf8/0x134
 tasklet_action+0x30/0x3c
 __do_softirq+0x128/0x368
 do_softirq+0xb4/0xc0
 __local_bh_enable_ip+0xb0/0xb4
 put_cpu_fpsimd_context+0x40/0x70
 kernel_neon_end+0x20/0x40
 sha1_base_do_update.constprop.0.isra.0+0x11c/0x140 [sha1_ce]
 sha1_ce_finup+0x94/0x110 [sha1_ce]
 crypto_shash_finup+0x34/0xc0
 hmac_finup+0x48/0xe0
 crypto_shash_finup+0x34/0xc0
 shash_digest_unaligned+0x74/0x90
 crypto_shash_digest+0x4c/0x9c
 shash_ahash_digest+0xc8/0xf0
 shash_async_digest+0x28/0x34
 crypto_ahash_digest+0x48/0xcc
 crypto_authenc_genicv+0x88/0xcc [authenc]
 crypto_authenc_encrypt+0xd8/0xfc [authenc]
 crypto_aead_encrypt+0x2c/0x40
 echainiv_encrypt+0x144/0x1a0 [echainiv]
 crypto_aead_encrypt+0x2c/0x40
 esp6_output_tail+0x1c8/0x5d0 [esp6]
 esp6_output+0x120/0x278 [esp6]
 xfrm_output_one+0x458/0x4ec
 xfrm_output_resume+0x6c/0x1f0
 xfrm_output+0xac/0x4ac
 __xfrm6_output+0x130/0x270
 xfrm6_output+0x60/0xec
 ip6_xmit+0x2ec/0x5bc
 inet6_csk_xmit+0xbc/0x10c
 __tcp_transmit_skb+0x460/0x8c0
 tcp_write_xmit+0x348/0x890
 __tcp_push_pending_frames+0x44/0x110
 tcp_push+0xb4/0x14c
 tcp_sendmsg_locked+0x71c/0xb64
 tcp_sendmsg+0x40/0x6c
 inet6_sendmsg+0x4c/0x80
 sock_sendmsg+0x5c/0x6c
 __sys_sendto+0x128/0x15c
 __arm64_sys_sendto+0x30/0x40
 invoke_syscall+0x50/0x120
 el0_svc_common.constprop.0+0x170/0x194
 do_el0_svc+0x38/0x4c
 el0_svc+0x28/0xe0
 el0t_64_sync_handler+0xbc/0x13c
 el0t_64_sync+0x180/0x184

Get softirq info by bcc tool:
./softirqs -NT 10
Tracing soft irq event time... Hit Ctrl-C to end.

15:34:34
SOFTIRQ          TOTAL_nsecs
block                 158990
timer               20030920
sched               46577080
net_rx             676746820
tasklet           9906067650

15:34:45
SOFTIRQ          TOTAL_nsecs
block                  86100
sched               38849790
net_rx             676532470
timer             1163848790
tasklet           9409019620

15:34:55
SOFTIRQ          TOTAL_nsecs
sched               58078450
net_rx             475156720
timer              533832410
tasklet           9431333300

The tasklet software interrupt takes too much time. Therefore, the
xfrm_trans_reinject executor is changed from tasklet to workqueue. Add add
spin lock to protect the queue. This reduces the processing flow of the
tcp_sendmsg function in this scenario.

Fixes: acf568ee859f0 ("xfrm: Reinject transport-mode packets through tasklet")
Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_input.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index b2f4ec9c537f..aa5220565763 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -24,7 +24,8 @@
 #include "xfrm_inout.h"
 
 struct xfrm_trans_tasklet {
-	struct tasklet_struct tasklet;
+	struct work_struct work;
+	spinlock_t queue_lock;
 	struct sk_buff_head queue;
 };
 
@@ -760,18 +761,22 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr)
 }
 EXPORT_SYMBOL(xfrm_input_resume);
 
-static void xfrm_trans_reinject(struct tasklet_struct *t)
+static void xfrm_trans_reinject(struct work_struct *work)
 {
-	struct xfrm_trans_tasklet *trans = from_tasklet(trans, t, tasklet);
+	struct xfrm_trans_tasklet *trans = container_of(work, struct xfrm_trans_tasklet, work);
 	struct sk_buff_head queue;
 	struct sk_buff *skb;
 
 	__skb_queue_head_init(&queue);
+	spin_lock_bh(&trans->queue_lock);
 	skb_queue_splice_init(&trans->queue, &queue);
+	spin_unlock_bh(&trans->queue_lock);
 
+	local_bh_disable();
 	while ((skb = __skb_dequeue(&queue)))
 		XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net,
 					       NULL, skb);
+	local_bh_enable();
 }
 
 int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
@@ -789,8 +794,10 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb,
 
 	XFRM_TRANS_SKB_CB(skb)->finish = finish;
 	XFRM_TRANS_SKB_CB(skb)->net = net;
+	spin_lock_bh(&trans->queue_lock);
 	__skb_queue_tail(&trans->queue, skb);
-	tasklet_schedule(&trans->tasklet);
+	spin_unlock_bh(&trans->queue_lock);
+	schedule_work(&trans->work);
 	return 0;
 }
 EXPORT_SYMBOL(xfrm_trans_queue_net);
@@ -817,7 +824,8 @@ void __init xfrm_input_init(void)
 		struct xfrm_trans_tasklet *trans;
 
 		trans = &per_cpu(xfrm_trans_tasklet, i);
+		spin_lock_init(&trans->queue_lock);
 		__skb_queue_head_init(&trans->queue);
-		tasklet_setup(&trans->tasklet, xfrm_trans_reinject);
+		INIT_WORK(&trans->work, xfrm_trans_reinject);
 	}
 }
-- 
cgit v1.2.3


From 615131b8e9bcd88e2d3ef78a4954ff4abfbb1fb7 Mon Sep 17 00:00:00 2001
From: Miguel Ojeda <ojeda@kernel.org>
Date: Sat, 3 Jul 2021 17:24:21 +0200
Subject: MAINTAINERS: Rust
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Miguel, Alex and Wedson will be maintaining the Rust support.

Boqun, Gary and Björn will be reviewers.

Reviewed-by: Kees Cook <keescook@chromium.org>
Co-developed-by: Alex Gaynor <alex.gaynor@gmail.com>
Signed-off-by: Alex Gaynor <alex.gaynor@gmail.com>
Co-developed-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Wedson Almeida Filho <wedsonaf@google.com>
Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
---
 MAINTAINERS | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f5ca4aefd184..ac1db33b75b9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17758,6 +17758,24 @@ F:	include/rv/
 F:	kernel/trace/rv/
 F:	tools/verification/
 
+RUST
+M:	Miguel Ojeda <ojeda@kernel.org>
+M:	Alex Gaynor <alex.gaynor@gmail.com>
+M:	Wedson Almeida Filho <wedsonaf@gmail.com>
+R:	Boqun Feng <boqun.feng@gmail.com>
+R:	Gary Guo <gary@garyguo.net>
+R:	Björn Roy Baron <bjorn3_gh@protonmail.com>
+L:	rust-for-linux@vger.kernel.org
+S:	Supported
+W:	https://github.com/Rust-for-Linux/linux
+B:	https://github.com/Rust-for-Linux/linux/issues
+T:	git https://github.com/Rust-for-Linux/linux.git rust-next
+F:	Documentation/rust/
+F:	rust/
+F:	samples/rust/
+F:	scripts/*rust*
+K:	\b(?i:rust)\b
+
 RXRPC SOCKETS (AF_RXRPC)
 M:	David Howells <dhowells@redhat.com>
 M:	Marc Dionne <marc.dionne@auristor.com>
-- 
cgit v1.2.3


From 506442439317153c1aa646fcb35f8678d42efce9 Mon Sep 17 00:00:00 2001
From: Athira Rajeev <atrajeev@linux.vnet.ibm.com>
Date: Wed, 28 Sep 2022 10:22:18 +0530
Subject: perf tests vmlinux-kallsyms: Update is_ignored_symbol function to
 match the kernel ignored list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The testcase “vmlinux-kallsyms.c” fails in powerpc.

	vmlinux symtab matches kallsyms: FAILED!

This test look at the symbols in the vmlinux DSO and check if we find
all of them in the kallsyms dso.

But from the powerpc logs , observed that the failure happens for:

	ERR : 0xc0000000000fe9c8: .Lmfspr_table not on kallsyms
	ERR : 0xc0000000001009c8: .Lmtspr_table not on kallsyms

These are labels ( with .L) in the source code and has to be ignored.
Reference code with .Lmtspr_table: arch/powerpc/xmon/spr_access.S

The testcases invokes is_ignored_symbol() function to ignore hidden
symbols in the dso like local symbols. This function is adapted from
is_ignored_symbol() kernel function in code: scripts/kallsyms.c . The
kernel function got some updates which is not reflected in the testcase
function and the new updates also handles ignoring "labels".

Below is the changes that went in the kernel function.

	 /* Symbol names that begin with the following are ignored.*/
	 static const char * const ignored_prefixes[] = {
	 		"$",			/* local symbols for ARM, MIPS, etc. */
	-		".LASANPC",		/* s390 kasan local symbols */
	+		".L",			/* local labels, .LBB,.Ltmpxxx,.L__unnamed_xx,.LASANPC, etc. */
	 		"__crc_",		/* modversions */
	 		"__efistub_",		/* arm64 EFI stub namespace */
	-		"__kvm_nvhe_",		/* arm64 non-VHE KVM namespace */
	+		"__kvm_nvhe_$",		/* arm64 local symbols in non-VHE KVM namespace */
	+		"__kvm_nvhe_.L",	/* arm64 local symbols in non-VHE KVM namespace */
	 		"__AArch64ADRPThunk_",	/* arm64 lld */
	 		"__ARMV5PILongThunk_",	/* arm lld */
	 		"__ARMV7PILongThunk_",

This change is part of below commits and will handle the
symbols with “.L”

commit d4c858643263 ("kallsyms: ignore all local labels prefixed by '.L'")
commit 6ccf9cb557bd ("KVM: arm64: Symbolize the nVHE HYP addresses")

Update the testcase function to include the new changes.

Reported-by: Disha Goel <disgoel@linux.vnet.ibm.com>
Signed-off-by: Athira Jajeev <atrajeev@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kajol Jain <kjain@linux.ibm.com>
Cc: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nageswara R Sastry <rnsastry@linux.ibm.com>
Cc: linuxppc-dev@lists.ozlabs.org
Link: https://lore.kernel.org/r/20220928045218.37322-1-atrajeev@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/vmlinux-kallsyms.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/tests/vmlinux-kallsyms.c b/tools/perf/tests/vmlinux-kallsyms.c
index 4fd8d703ff19..8ab035b55875 100644
--- a/tools/perf/tests/vmlinux-kallsyms.c
+++ b/tools/perf/tests/vmlinux-kallsyms.c
@@ -43,10 +43,11 @@ static bool is_ignored_symbol(const char *name, char type)
 	/* Symbol names that begin with the following are ignored.*/
 	static const char * const ignored_prefixes[] = {
 		"$",			/* local symbols for ARM, MIPS, etc. */
-		".LASANPC",		/* s390 kasan local symbols */
+		".L",			/* local labels, .LBB,.Ltmpxxx,.L__unnamed_xx,.LASANPC, etc. */
 		"__crc_",		/* modversions */
 		"__efistub_",		/* arm64 EFI stub namespace */
-		"__kvm_nvhe_",		/* arm64 non-VHE KVM namespace */
+		"__kvm_nvhe_$",		/* arm64 local symbols in non-VHE KVM namespace */
+		"__kvm_nvhe_.L",	/* arm64 local symbols in non-VHE KVM namespace */
 		"__AArch64ADRPThunk_",	/* arm64 lld */
 		"__ARMV5PILongThunk_",	/* arm lld */
 		"__ARMV7PILongThunk_",
-- 
cgit v1.2.3


From 74a61d53a6d1ca1172d85964d15c83c2cc3670b3 Mon Sep 17 00:00:00 2001
From: Jing Zhang <renyu.zj@linux.alibaba.com>
Date: Mon, 26 Sep 2022 21:03:16 +0800
Subject: perf arm-spe: augment the data source type with neoverse_spe list

When synthesizing event with SPE data source, commit 4e6430cbb1a9("perf
arm-spe: Use SPE data source for neoverse cores") augment the type with
source information by MIDR. However, is_midr_in_range only compares the
first entry in neoverse_spe.

Change is_midr_in_range to is_midr_in_range_list to traverse the
neoverse_spe array so that all neoverse cores synthesize event with data
source packet.

Fixes: 4e6430cbb1a9f1dc ("perf arm-spe: Use SPE data source for neoverse cores")
Reviewed-by: Ali Saidi <alisaidi@amazon.com>
Reviewed-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Jing Zhang <renyu.zj@linux.alibaba.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ali Saidi <alisaidi@amazon.com>
Cc: German Gomez <german.gomez@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@arm.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Garry <john.garry@huawei.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuai Xue <xueshuai@linux.alibaba.com>
Cc: Timothy Hayes <timothy.hayes@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Zhuo Song <zhuo.song@linux.alibaba.com>
Link: https://lore.kernel.org/r/1664197396-42672-1-git-send-email-renyu.zj@linux.alibaba.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/arm-spe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c
index 22dcfe07e886..906476a839e1 100644
--- a/tools/perf/util/arm-spe.c
+++ b/tools/perf/util/arm-spe.c
@@ -498,7 +498,7 @@ static void arm_spe__synth_data_source_generic(const struct arm_spe_record *reco
 static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
 {
 	union perf_mem_data_src	data_src = { 0 };
-	bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
+	bool is_neoverse = is_midr_in_range_list(midr, neoverse_spe);
 
 	if (record->op == ARM_SPE_LD)
 		data_src.mem_op = PERF_MEM_OP_LOAD;
-- 
cgit v1.2.3


From b9962b699878c2cc61b97304b24c9582d366c58f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 27 Sep 2022 16:10:31 +0200
Subject: ACPI: DPTF: Drop stale link from Kconfig help

The web site pointed to from the DPTF Kconfig help is not accessible
any more, so drop the link to it from there.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/dptf/Kconfig | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/acpi/dptf/Kconfig b/drivers/acpi/dptf/Kconfig
index 1e8c7ce89bf1..4b3fdc03e4ed 100644
--- a/drivers/acpi/dptf/Kconfig
+++ b/drivers/acpi/dptf/Kconfig
@@ -11,9 +11,6 @@ menuconfig ACPI_DPTF
 	  a coordinated approach for different policies to effect the hardware
 	  state of a system.
 
-	  For more information see:
-	  <https://01.org/intel%C2%AE-dynamic-platform-and-thermal-framework-dptf-chromium-os/overview>
-
 if ACPI_DPTF
 
 config DPTF_POWER
-- 
cgit v1.2.3


From d206cef03c4827984e6ac88a9472b70c41f5b28d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 26 Sep 2022 22:20:03 +0200
Subject: ACPI: docs: Drop useless DSDT override documentation

Because https://01.org/linux-acpi web site has become permanently
inaccessible, the "Overriding DSDT" document in the kernel tree
pointing to it as the main source of information is useless (and
the config option name mentioned by it is incorrect), so drop it
and drop the pointer to it from the ACPI Kconfig.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/admin-guide/acpi/dsdt-override.rst | 13 -------------
 drivers/acpi/Kconfig                             |  1 -
 2 files changed, 14 deletions(-)
 delete mode 100644 Documentation/admin-guide/acpi/dsdt-override.rst

diff --git a/Documentation/admin-guide/acpi/dsdt-override.rst b/Documentation/admin-guide/acpi/dsdt-override.rst
deleted file mode 100644
index 50bd7f194bf4..000000000000
--- a/Documentation/admin-guide/acpi/dsdt-override.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-===============
-Overriding DSDT
-===============
-
-Linux supports a method of overriding the BIOS DSDT:
-
-CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
-
-When to use this method is described in detail on the
-Linux/ACPI home page:
-https://01.org/linux-acpi/documentation/overriding-dsdt
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 7802d8846a8d..402851c92d9d 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -347,7 +347,6 @@ config ACPI_CUSTOM_DSDT_FILE
 	depends on !STANDALONE
 	help
 	  This option supports a custom DSDT by linking it into the kernel.
-	  See Documentation/admin-guide/acpi/dsdt-override.rst
 
 	  Enter the full path name to the file which includes the AmlCode
 	  or dsdt_aml_code declaration.
-- 
cgit v1.2.3


From a97edbaa1906f4d8e01248939cf21c6382b11a6b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 26 Sep 2022 22:27:56 +0200
Subject: ACPI: Kconfig: Drop link to https://01.org/linux-acpi

Because the https://01.org/linux-acpi web site has become permanently
inaccessible, drop the remaining link to it from the ACPI Kconfig.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 7802d8846a8d..7723e2495eb5 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -27,9 +27,6 @@ menuconfig ACPI
 	  Management (APM) specification.  If both ACPI and APM support 
 	  are configured, ACPI is used.
 
-	  The project home page for the Linux ACPI subsystem is here:
-	  <https://01.org/linux-acpi>
-
 	  Linux support for ACPI is based on Intel Corporation's ACPI
 	  Component Architecture (ACPI CA).  For more information on the
 	  ACPI CA, see:
-- 
cgit v1.2.3


From 67ef3d7a96b28eba29a0b161541897fd6fab7b73 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 27 Sep 2022 16:09:13 +0200
Subject: MAINTAINERS: Drop records pointing to 01.org/linux-acpi

The https://01.org/linux-acpi web site has become permanently
inaccessible, so drop the records pointing to it from MAINTAINERS.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d7f64dc0efe..b9846614ad7f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -348,7 +348,6 @@ M:	"Rafael J. Wysocki" <rafael@kernel.org>
 R:	Len Brown <lenb@kernel.org>
 L:	linux-acpi@vger.kernel.org
 S:	Supported
-W:	https://01.org/linux-acpi
 Q:	https://patchwork.kernel.org/project/linux-acpi/list/
 B:	https://bugzilla.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
@@ -427,7 +426,6 @@ M:	Rafael J. Wysocki <rafael@kernel.org>
 R:	Zhang Rui <rui.zhang@intel.com>
 L:	linux-acpi@vger.kernel.org
 S:	Supported
-W:	https://01.org/linux-acpi
 B:	https://bugzilla.kernel.org
 F:	drivers/acpi/*thermal*
 
@@ -10375,7 +10373,6 @@ INTEL MENLOW THERMAL DRIVER
 M:	Sujith Thomas <sujith.thomas@intel.com>
 L:	linux-pm@vger.kernel.org
 S:	Supported
-W:	https://01.org/linux-acpi
 F:	drivers/thermal/intel/intel_menlow.c
 
 INTEL P-Unit IPC DRIVER
-- 
cgit v1.2.3


From df5b035b5683d6a25f077af889fb88e09827f8bc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Fri, 19 Aug 2022 19:47:44 +0200
Subject: x86/cacheinfo: Add a cpu_llc_shared_mask() UP variant

On a CONFIG_SMP=n kernel, the LLC shared mask is 0, which prevents
__cache_amd_cpumap_setup() from doing the L3 masks setup, and more
specifically from setting up the shared_cpu_map and shared_cpu_list
files in sysfs, leading to lscpu from util-linux getting confused and
segfaulting.

Add a cpu_llc_shared_mask() UP variant which returns a mask with a
single bit set, i.e., for CPU0.

Fixes: 2b83809a5e6d ("x86/cpu/amd: Derive L3 shared_cpu_map from cpu_llc_shared_mask")
Reported-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/1660148115-302-1-git-send-email-ssengar@linux.microsoft.com
---
 arch/x86/include/asm/smp.h | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..a73bced40e24 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,16 +21,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
-static inline struct cpumask *cpu_llc_shared_mask(int cpu)
-{
-	return per_cpu(cpu_llc_shared_map, cpu);
-}
-
-static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
-{
-	return per_cpu(cpu_l2c_shared_map, cpu);
-}
-
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
 DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -172,6 +162,16 @@ extern int safe_smp_processor_id(void);
 # define safe_smp_processor_id()	smp_processor_id()
 #endif
 
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+	return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+	return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
 #else /* !CONFIG_SMP */
 #define wbinvd_on_cpu(cpu)     wbinvd()
 static inline int wbinvd_on_all_cpus(void)
@@ -179,6 +179,11 @@ static inline int wbinvd_on_all_cpus(void)
 	wbinvd();
 	return 0;
 }
+
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+	return (struct cpumask *)cpumask_of(0);
+}
 #endif /* CONFIG_SMP */
 
 extern unsigned disabled_cpus;
-- 
cgit v1.2.3


From 06bbaa6dc53cb72040db952053432541acb9adc7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 26 Sep 2022 11:59:14 -0400
Subject: [coredump] don't use __kernel_write() on kmap_local_page()

passing kmap_local_page() result to __kernel_write() is unsafe -
random ->write_iter() might (and 9p one does) get unhappy when
passed ITER_KVEC with pointer that came from kmap_local_page().

Fix by providing a variant of __kernel_write() that takes an iov_iter
from caller (__kernel_write() becomes a trivial wrapper) and adding
dump_emit_page() that parallels dump_emit(), except that instead of
__kernel_write() it uses __kernel_write_iter() with ITER_BVEC source.

Fixes: 3159ed57792b "fs/coredump: use kmap_local_page()"
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coredump.c   | 38 +++++++++++++++++++++++++++++++++-----
 fs/internal.h   |  3 +++
 fs/read_write.c | 22 ++++++++++++++--------
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 9f4aae202109..1ab4f5b76a1e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -832,6 +832,38 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
 	}
 }
 
+static int dump_emit_page(struct coredump_params *cprm, struct page *page)
+{
+	struct bio_vec bvec = {
+		.bv_page	= page,
+		.bv_offset	= 0,
+		.bv_len		= PAGE_SIZE,
+	};
+	struct iov_iter iter;
+	struct file *file = cprm->file;
+	loff_t pos = file->f_pos;
+	ssize_t n;
+
+	if (cprm->to_skip) {
+		if (!__dump_skip(cprm, cprm->to_skip))
+			return 0;
+		cprm->to_skip = 0;
+	}
+	if (cprm->written + PAGE_SIZE > cprm->limit)
+		return 0;
+	if (dump_interrupted())
+		return 0;
+	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
+	n = __kernel_write_iter(cprm->file, &iter, &pos);
+	if (n != PAGE_SIZE)
+		return 0;
+	file->f_pos = pos;
+	cprm->written += PAGE_SIZE;
+	cprm->pos += PAGE_SIZE;
+
+	return 1;
+}
+
 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 {
 	if (cprm->to_skip) {
@@ -863,7 +895,6 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 
 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
 		struct page *page;
-		int stop;
 
 		/*
 		 * To avoid having to allocate page tables for virtual address
@@ -874,10 +905,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		 */
 		page = get_dump_page(addr);
 		if (page) {
-			void *kaddr = kmap_local_page(page);
-
-			stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
-			kunmap_local(kaddr);
+			int stop = !dump_emit_page(cprm, page);
 			put_page(page);
 			if (stop)
 				return 0;
diff --git a/fs/internal.h b/fs/internal.h
index 87e96b9024ce..3e206d3e317c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@ struct shrink_control;
 struct fs_context;
 struct user_namespace;
 struct pipe_inode_info;
+struct iov_iter;
 
 /*
  * block/bdev.c
@@ -221,3 +222,5 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns,
 int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
 int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		struct xattr_ctx *ctx);
+
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);
diff --git a/fs/read_write.c b/fs/read_write.c
index 1a261dcf1778..328ce8cf9a85 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -496,14 +496,9 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 }
 
 /* caller is responsible for file_start_write/file_end_write */
-ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
 {
-	struct kvec iov = {
-		.iov_base	= (void *)buf,
-		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
-	};
 	struct kiocb kiocb;
-	struct iov_iter iter;
 	ssize_t ret;
 
 	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
@@ -519,8 +514,7 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 
 	init_sync_kiocb(&kiocb, file);
 	kiocb.ki_pos = pos ? *pos : 0;
-	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
-	ret = file->f_op->write_iter(&kiocb, &iter);
+	ret = file->f_op->write_iter(&kiocb, from);
 	if (ret > 0) {
 		if (pos)
 			*pos = kiocb.ki_pos;
@@ -530,6 +524,18 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
 	inc_syscw(current);
 	return ret;
 }
+
+/* caller is responsible for file_start_write/file_end_write */
+ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+	struct kvec iov = {
+		.iov_base	= (void *)buf,
+		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
+	};
+	struct iov_iter iter;
+	iov_iter_kvec(&iter, WRITE, &iov, 1, iov.iov_len);
+	return __kernel_write_iter(file, &iter, pos);
+}
 /*
  * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
  * but autofs is one of the few internal kernel users that actually
-- 
cgit v1.2.3


From 4b8af331bb4d4cc8bb91c284b11b98dd1e265185 Mon Sep 17 00:00:00 2001
From: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Date: Tue, 27 Sep 2022 09:58:15 -0700
Subject: Bluetooth: Prevent double register of suspend

Suspend notifier should only be registered and unregistered once per
hdev. Simplify this by only registering during driver registration and
simply exiting early when HCI_USER_CHANNEL is set.

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 359ee4f834f5 (Bluetooth: Unregister suspend with userchannel)
Signed-off-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_core.c | 4 ++++
 net/bluetooth/hci_sock.c | 3 ---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 063fbb8e07ca..0540555b3704 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2406,6 +2406,10 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
 		container_of(nb, struct hci_dev, suspend_notifier);
 	int ret = 0;
 
+	/* Userspace has full control of this device. Do nothing. */
+	if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+		return NOTIFY_DONE;
+
 	if (action == PM_SUSPEND_PREPARE)
 		ret = hci_suspend_dev(hdev);
 	else if (action == PM_POST_SUSPEND)
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index b2a33a05c93e..06581223238c 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -887,7 +887,6 @@ static int hci_sock_release(struct socket *sock)
 			 */
 			hci_dev_do_close(hdev);
 			hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
-			hci_register_suspend_notifier(hdev);
 			mgmt_index_added(hdev);
 		}
 
@@ -1216,7 +1215,6 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 		}
 
 		mgmt_index_removed(hdev);
-		hci_unregister_suspend_notifier(hdev);
 
 		err = hci_dev_open(hdev->id);
 		if (err) {
@@ -1231,7 +1229,6 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 				err = 0;
 			} else {
 				hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
-				hci_register_suspend_notifier(hdev);
 				mgmt_index_added(hdev);
 				hci_dev_put(hdev);
 				goto done;
-- 
cgit v1.2.3


From 8dbc3e75a0a56fb0ab70781338a2283d28a09164 Mon Sep 17 00:00:00 2001
From: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Date: Tue, 27 Sep 2022 13:17:20 -0700
Subject: Bluetooth: Call shutdown for HCI_USER_CHANNEL

Some drivers depend on shutdown being called for proper operation.
Unset HCI_USER_CHANNEL and call the full close routine since shutdown is
complementary to setup.

Signed-off-by: Abhishek Pandit-Subedi <abhishekpandit@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 net/bluetooth/hci_sync.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 422f7c6911d9..15c75ef4c271 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4727,6 +4727,31 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev)
 	BT_DBG("All LE pending actions cleared");
 }
 
+static int hci_dev_shutdown(struct hci_dev *hdev)
+{
+	int err = 0;
+	/* Similar to how we first do setup and then set the exclusive access
+	 * bit for userspace, we must first unset userchannel and then clean up.
+	 * Otherwise, the kernel can't properly use the hci channel to clean up
+	 * the controller (some shutdown routines require sending additional
+	 * commands to the controller for example).
+	 */
+	bool was_userchannel =
+		hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL);
+
+	if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+	    test_bit(HCI_UP, &hdev->flags)) {
+		/* Execute vendor specific shutdown routine */
+		if (hdev->shutdown)
+			err = hdev->shutdown(hdev);
+	}
+
+	if (was_userchannel)
+		hci_dev_set_flag(hdev, HCI_USER_CHANNEL);
+
+	return err;
+}
+
 int hci_dev_close_sync(struct hci_dev *hdev)
 {
 	bool auto_off;
@@ -4746,13 +4771,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 		hdev->adv_instance_timeout = 0;
 	}
 
-	if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
-	    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
-	    test_bit(HCI_UP, &hdev->flags)) {
-		/* Execute vendor specific shutdown routine */
-		if (hdev->shutdown)
-			err = hdev->shutdown(hdev);
-	}
+	err = hci_dev_shutdown(hdev);
 
 	if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
 		cancel_delayed_work_sync(&hdev->cmd_timer);
-- 
cgit v1.2.3


From 39f81776c6807d605cfec6fa7a2439fa4786034e Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 28 Sep 2022 11:19:34 -0500
Subject: ACPI: x86: s2idle: Fix a NULL pointer dereference

Ryzen 7000 (Raphael) introduced AMDI0008 for _HID.  This ID was added
in commit ed470febf837 ("ACPI: PM: s2idle: Add support for upcoming AMD
uPEP HID AMDI008"), but then removed in favor of aligning all new IDs
to Rembrandt support in commit fd894f05cf30 ("ACPI: x86: s2idle: If a
new AMD _HID is missing assume Rembrandt").

Unfortunately there was a mistake in commit 100a57379380 ("ACPI: x86:
s2idle: Move _HID handling for AMD systems into structures") that can
lead to a NULL pointer dereference accessing `dev_id->driver_data` in
the sentinel of `amd_hid_ids`.  Fix this dereference.

Reported-by: Richard Gong <Richard.Gong@amd.com>
Fixes: 100a57379380 ("ACPI: x86: s2idle: Move _HID handling for AMD systems into structures")
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index eedd21d8a284..3ae2ba74de92 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -470,7 +470,7 @@ static int lps0_device_attach(struct acpi_device *adev,
 		for (dev_id = &amd_hid_ids[0]; dev_id->id[0]; dev_id++)
 			if (acpi_dev_hid_uid_match(adev, dev_id->id, NULL))
 				break;
-		if (dev_id)
+		if (dev_id->id[0])
 			data = (const struct amd_lps0_hid_device_data *) dev_id->driver_data;
 		else
 			data = &amd_rembrandt;
-- 
cgit v1.2.3


From 54bd1e548701640f2aff299aab192eec55571e01 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Wed, 28 Sep 2022 11:19:35 -0500
Subject: ACPI: x86: s2idle: Add another ID to s2idle_dmi_table

It's reported that "ASUSTeK COMPUTER INC. ROG Flow X16 GV601RW" has
non-functional fans after resume when using the AMD codepath.  This
issue is fixed using the Microsoft codepath.

Add the 3 variants of this system to the Microsoft codepath DMI table.
* GV601RW
* GV601RM
* GV601RE

Link: https://www.reddit.com/r/linuxhardware/comments/wh50nd/compatibility_report_asus_rog_flow_x16_gv601rm/
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2148#note_1571241
Reported-by: Luke Jones <luke@ljones.dev>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/x86/s2idle.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/acpi/x86/s2idle.c b/drivers/acpi/x86/s2idle.c
index 3ae2ba74de92..0155c1d2d608 100644
--- a/drivers/acpi/x86/s2idle.c
+++ b/drivers/acpi/x86/s2idle.c
@@ -451,6 +451,17 @@ static const struct dmi_system_id s2idle_dmi_table[] __initconst = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow X13 GV301"),
 		},
 	},
+	{
+		/*
+		 * ASUSTeK COMPUTER INC. ROG Flow X16 GV601RW_GV601RW
+		 * https://gitlab.freedesktop.org/drm/amd/-/issues/2148
+		 */
+		.callback = lps0_prefer_microsoft,
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "ROG Flow X16 GV601"),
+		},
+	},
 	{}
 };
 
-- 
cgit v1.2.3


From 7c236d93c6764dcaca7ab66d76768a044647876d Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 27 Sep 2022 15:08:34 +0200
Subject: m68k: Process bootinfo records before saving them

The RNG seed boot record is memzeroed after processing, in order to
preserve forward secrecy. By saving the bootinfo for procfs prior to
that, forward secrecy is violated, since it becomes possible to recover
past states. So, save the bootinfo block only after first processing
them.

Fixes: a1ee38ab1a75 ("m68k: virt: Use RNG seed from bootinfo block")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20220927130835.1629806-1-Jason@zx2c4.com
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/kernel/setup_mm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 06ee394db610..78967c96ee8f 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -109,10 +109,9 @@ extern void paging_init(void);
 
 static void __init m68k_parse_bootinfo(const struct bi_record *record)
 {
+	const struct bi_record *first_record = record;
 	uint16_t tag;
 
-	save_bootinfo(record);
-
 	while ((tag = be16_to_cpu(record->tag)) != BI_LAST) {
 		int unknown = 0;
 		const void *data = record->data;
@@ -182,6 +181,8 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record)
 		record = (struct bi_record *)((unsigned long)record + size);
 	}
 
+	save_bootinfo(first_record);
+
 	m68k_realnum_memory = m68k_num_memory;
 #ifdef CONFIG_SINGLE_MEMORY_CHUNK
 	if (m68k_num_memory > 1) {
-- 
cgit v1.2.3


From dc63a086daee92c63e392e4e7cd7ed61f3693026 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Tue, 27 Sep 2022 15:08:35 +0200
Subject: m68k: Rework BI_VIRT_RNG_SEED as BI_RNG_SEED

This is useful on !virt platforms for kexec, so change things from
BI_VIRT_RNG_SEED to be BI_RNG_SEED, and simply remove BI_VIRT_RNG_SEED
because it only ever lasted one release, and nothing is broken by not
having it. At the same time, keep a comment noting that it's been
removed, so that ID isn't reused. In addition, we previously documented
2-byte alignment, but 4-byte alignment is actually necessary, so update
that comment.

Suggested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Fixes: a1ee38ab1a75 ("m68k: virt: Use RNG seed from bootinfo block")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Link: https://lore.kernel.org/r/20220927130835.1629806-2-Jason@zx2c4.com
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/include/uapi/asm/bootinfo-virt.h |  9 ++-------
 arch/m68k/include/uapi/asm/bootinfo.h      |  7 +++++++
 arch/m68k/kernel/setup_mm.c                | 12 ++++++++++++
 arch/m68k/virt/config.c                    | 11 -----------
 4 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/arch/m68k/include/uapi/asm/bootinfo-virt.h b/arch/m68k/include/uapi/asm/bootinfo-virt.h
index b091ee9b06e0..7dbcd7bec103 100644
--- a/arch/m68k/include/uapi/asm/bootinfo-virt.h
+++ b/arch/m68k/include/uapi/asm/bootinfo-virt.h
@@ -13,13 +13,8 @@
 #define BI_VIRT_VIRTIO_BASE	0x8004
 #define BI_VIRT_CTRL_BASE	0x8005
 
-/*
- * A random seed used to initialize the RNG. Record format:
- *
- *   - length       [ 2 bytes, 16-bit big endian ]
- *   - seed data    [ `length` bytes, padded to preserve 2-byte alignment ]
- */
-#define BI_VIRT_RNG_SEED	0x8006
+/* No longer used -- replaced with BI_RNG_SEED -- but don't reuse this index:
+ * #define BI_VIRT_RNG_SEED	0x8006 */
 
 #define VIRT_BOOTI_VERSION	MK_BI_VERSION(2, 0)
 
diff --git a/arch/m68k/include/uapi/asm/bootinfo.h b/arch/m68k/include/uapi/asm/bootinfo.h
index 95ecf3ae4c49..024e87d7095f 100644
--- a/arch/m68k/include/uapi/asm/bootinfo.h
+++ b/arch/m68k/include/uapi/asm/bootinfo.h
@@ -64,6 +64,13 @@ struct mem_info {
 					/* (struct mem_info) */
 #define BI_COMMAND_LINE		0x0007	/* kernel command line parameters */
 					/* (string) */
+/*
+ * A random seed used to initialize the RNG. Record format:
+ *
+ *   - length       [ 2 bytes, 16-bit big endian ]
+ *   - seed data    [ `length` bytes, padded to preserve 4-byte struct alignment ]
+ */
+#define BI_RNG_SEED		0x0008
 
 
     /*
diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c
index 78967c96ee8f..3a2bb2e8fdad 100644
--- a/arch/m68k/kernel/setup_mm.c
+++ b/arch/m68k/kernel/setup_mm.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/nvram.h>
 #include <linux/initrd.h>
+#include <linux/random.h>
 
 #include <asm/bootinfo.h>
 #include <asm/byteorder.h>
@@ -151,6 +152,17 @@ static void __init m68k_parse_bootinfo(const struct bi_record *record)
 				sizeof(m68k_command_line));
 			break;
 
+		case BI_RNG_SEED: {
+			u16 len = be16_to_cpup(data);
+			add_bootloader_randomness(data + 2, len);
+			/*
+			 * Zero the data to preserve forward secrecy, and zero the
+			 * length to prevent kexec from using it.
+			 */
+			memzero_explicit((void *)data, len + 2);
+			break;
+		}
+
 		default:
 			if (MACH_IS_AMIGA)
 				unknown = amiga_parse_bootinfo(record);
diff --git a/arch/m68k/virt/config.c b/arch/m68k/virt/config.c
index 4ab22946ff68..632ba200ad42 100644
--- a/arch/m68k/virt/config.c
+++ b/arch/m68k/virt/config.c
@@ -2,7 +2,6 @@
 
 #include <linux/reboot.h>
 #include <linux/serial_core.h>
-#include <linux/random.h>
 #include <clocksource/timer-goldfish.h>
 
 #include <asm/bootinfo.h>
@@ -93,16 +92,6 @@ int __init virt_parse_bootinfo(const struct bi_record *record)
 		data += 4;
 		virt_bi_data.virtio.irq = be32_to_cpup(data);
 		break;
-	case BI_VIRT_RNG_SEED: {
-		u16 len = be16_to_cpup(data);
-		add_bootloader_randomness(data + 2, len);
-		/*
-		 * Zero the data to preserve forward secrecy, and zero the
-		 * length to prevent kexec from using it.
-		 */
-		memzero_explicit((void *)data, len + 2);
-		break;
-	}
 	default:
 		unknown = 1;
 		break;
-- 
cgit v1.2.3


From b1ff1bfe81e763420afd5f3f25f0b3cbfd97055c Mon Sep 17 00:00:00 2001
From: Han Xu <han.xu@nxp.com>
Date: Thu, 15 Sep 2022 10:09:59 -0500
Subject: clk: imx: imx6sx: remove the SET_RATE_PARENT flag for QSPI clocks

There is no dedicate parent clock for QSPI so SET_RATE_PARENT flag
should not be used. For instance, the default parent clock for QSPI is
pll2_bus, which is also the parent clock for quite a few modules, such
as MMDC, once GPMI NAND set clock rate for EDO5 mode can cause system
hang due to pll2_bus rate changed.

Fixes: f1541e15e38e ("clk: imx6sx: Switch to clk_hw based API")
Signed-off-by: Han Xu <han.xu@nxp.com>
Link: https://lore.kernel.org/r/20220915150959.3646702-1-han.xu@nxp.com
Tested-by: Fabio Estevam <festevam@denx.de>
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/imx/clk-imx6sx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/imx/clk-imx6sx.c b/drivers/clk/imx/clk-imx6sx.c
index fc1bd23d4583..598f3cf4eba4 100644
--- a/drivers/clk/imx/clk-imx6sx.c
+++ b/drivers/clk/imx/clk-imx6sx.c
@@ -280,13 +280,13 @@ static void __init imx6sx_clocks_init(struct device_node *ccm_node)
 	hws[IMX6SX_CLK_SSI3_SEL]           = imx_clk_hw_mux("ssi3_sel",         base + 0x1c,  14,     2,      ssi_sels,          ARRAY_SIZE(ssi_sels));
 	hws[IMX6SX_CLK_SSI2_SEL]           = imx_clk_hw_mux("ssi2_sel",         base + 0x1c,  12,     2,      ssi_sels,          ARRAY_SIZE(ssi_sels));
 	hws[IMX6SX_CLK_SSI1_SEL]           = imx_clk_hw_mux("ssi1_sel",         base + 0x1c,  10,     2,      ssi_sels,          ARRAY_SIZE(ssi_sels));
-	hws[IMX6SX_CLK_QSPI1_SEL]          = imx_clk_hw_mux_flags("qspi1_sel", base + 0x1c,  7, 3, qspi1_sels, ARRAY_SIZE(qspi1_sels), CLK_SET_RATE_PARENT);
+	hws[IMX6SX_CLK_QSPI1_SEL]          = imx_clk_hw_mux("qspi1_sel",        base + 0x1c,  7,      3,      qspi1_sels,        ARRAY_SIZE(qspi1_sels));
 	hws[IMX6SX_CLK_PERCLK_SEL]         = imx_clk_hw_mux("perclk_sel",       base + 0x1c,  6,      1,      perclk_sels,       ARRAY_SIZE(perclk_sels));
 	hws[IMX6SX_CLK_VID_SEL]            = imx_clk_hw_mux("vid_sel",          base + 0x20,  21,     3,      vid_sels,          ARRAY_SIZE(vid_sels));
 	hws[IMX6SX_CLK_ESAI_SEL]           = imx_clk_hw_mux("esai_sel",         base + 0x20,  19,     2,      audio_sels,        ARRAY_SIZE(audio_sels));
 	hws[IMX6SX_CLK_CAN_SEL]            = imx_clk_hw_mux("can_sel",          base + 0x20,  8,      2,      can_sels,          ARRAY_SIZE(can_sels));
 	hws[IMX6SX_CLK_UART_SEL]           = imx_clk_hw_mux("uart_sel",         base + 0x24,  6,      1,      uart_sels,         ARRAY_SIZE(uart_sels));
-	hws[IMX6SX_CLK_QSPI2_SEL]          = imx_clk_hw_mux_flags("qspi2_sel", base + 0x2c, 15, 3, qspi2_sels, ARRAY_SIZE(qspi2_sels), CLK_SET_RATE_PARENT);
+	hws[IMX6SX_CLK_QSPI2_SEL]          = imx_clk_hw_mux("qspi2_sel",        base + 0x2c,  15,     3,      qspi2_sels,        ARRAY_SIZE(qspi2_sels));
 	hws[IMX6SX_CLK_SPDIF_SEL]          = imx_clk_hw_mux("spdif_sel",        base + 0x30,  20,     2,      audio_sels,        ARRAY_SIZE(audio_sels));
 	hws[IMX6SX_CLK_AUDIO_SEL]          = imx_clk_hw_mux("audio_sel",        base + 0x30,  7,      2,      audio_sels,        ARRAY_SIZE(audio_sels));
 	hws[IMX6SX_CLK_ENET_PRE_SEL]       = imx_clk_hw_mux("enet_pre_sel",     base + 0x34,  15,     3,      enet_pre_sels,     ARRAY_SIZE(enet_pre_sels));
-- 
cgit v1.2.3


From 7738be973fc4e2ba22154fafd3a5d7b9666f9abf Mon Sep 17 00:00:00 2001
From: Ashutosh Dixit <ashutosh.dixit@intel.com>
Date: Wed, 28 Sep 2022 12:02:12 -0700
Subject: drm/i915/gt: Perf_limit_reasons are only available for Gen11+

Register GT0_PERF_LIMIT_REASONS (0x1381a8) is available only for
Gen11+. Therefore ensure perf_limit_reasons sysfs files are created only
for Gen11+. Otherwise on Gen < 5 accessing these files results in the
following oops:

<1> [88.829420] BUG: unable to handle page fault for address: ffffc90000bb81a8
<1> [88.829438] #PF: supervisor read access in kernel mode
<1> [88.829447] #PF: error_code(0x0000) - not-present page

This patch is a backport of the drm-tip commit 0d2d201095e9
("drm/i915: Perf_limit_reasons are only available for Gen11+") to
drm-intel-fixes. The backport is not identical to the original, it only
includes the sysfs portions of if. The debugfs portion is not available
in drm-intel-fixes so has not been backported.

Bspec: 20008
Bug: https://gitlab.freedesktop.org/drm/intel/-/issues/6863
Fixes: fa68bff7cf27 ("drm/i915/gt: Add sysfs throttle frequency interfaces")
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220919162401.2077713-1-ashutosh.dixit@intel.com
(backported from commit 0d2d201095e9f141d6a9fb44320afce761f8b5c2)
---
 drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c
index 73a8b46e0234..d09a0e845d09 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_sysfs_pm.c
@@ -545,8 +545,7 @@ static INTEL_GT_RPS_BOOL_ATTR_RO(throttle_reason_ratl, RATL_MASK);
 static INTEL_GT_RPS_BOOL_ATTR_RO(throttle_reason_vr_thermalert, VR_THERMALERT_MASK);
 static INTEL_GT_RPS_BOOL_ATTR_RO(throttle_reason_vr_tdc, VR_TDC_MASK);
 
-static const struct attribute *freq_attrs[] = {
-	&dev_attr_punit_req_freq_mhz.attr,
+static const struct attribute *throttle_reason_attrs[] = {
 	&attr_throttle_reason_status.attr,
 	&attr_throttle_reason_pl1.attr,
 	&attr_throttle_reason_pl2.attr,
@@ -763,12 +762,20 @@ void intel_gt_sysfs_pm_init(struct intel_gt *gt, struct kobject *kobj)
 	if (!is_object_gt(kobj))
 		return;
 
-	ret = sysfs_create_files(kobj, freq_attrs);
+	ret = sysfs_create_file(kobj, &dev_attr_punit_req_freq_mhz.attr);
 	if (ret)
 		drm_warn(&gt->i915->drm,
-			 "failed to create gt%u throttle sysfs files (%pe)",
+			 "failed to create gt%u punit_req_freq_mhz sysfs (%pe)",
 			 gt->info.id, ERR_PTR(ret));
 
+	if (GRAPHICS_VER(gt->i915) >= 11) {
+		ret = sysfs_create_files(kobj, throttle_reason_attrs);
+		if (ret)
+			drm_warn(&gt->i915->drm,
+				 "failed to create gt%u throttle sysfs files (%pe)",
+				 gt->info.id, ERR_PTR(ret));
+	}
+
 	if (HAS_MEDIA_RATIO_MODE(gt->i915) && intel_uc_uses_guc_slpc(&gt->uc)) {
 		ret = sysfs_create_files(kobj, media_perf_power_attrs);
 		if (ret)
-- 
cgit v1.2.3


From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 26 Sep 2022 11:49:53 -0700
Subject: bpf: Parameterize task iterators.

Allow creating an iterator that loops through resources of one
thread/process.

People could only create iterators to loop through all resources of
files, vma, and tasks in the system, even though they were interested
in only the resources of a specific task or process.  Passing the
additional parameters, people can now create an iterator to go
through all resources or only the resources of a task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com
---
 include/linux/bpf.h            |  25 ++++++
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/task_iter.c         | 188 ++++++++++++++++++++++++++++++++++++-----
 tools/include/uapi/linux/bpf.h |   6 ++
 4 files changed, 203 insertions(+), 22 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5161fac0513f..0f3eaf3ed98c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags);
 	extern int bpf_iter_ ## target(args);			\
 	int __init bpf_iter_ ## target(args) { return 0; }
 
+/*
+ * The task type of iterators.
+ *
+ * For BPF task iterators, they can be parameterized with various
+ * parameters to visit only some of tasks.
+ *
+ * BPF_TASK_ITER_ALL (default)
+ *	Iterate over resources of every task.
+ *
+ * BPF_TASK_ITER_TID
+ *	Iterate over resources of a task/tid.
+ *
+ * BPF_TASK_ITER_TGID
+ *	Iterate over resources of every task of a process / task group.
+ */
+enum bpf_iter_task_type {
+	BPF_TASK_ITER_ALL = 0,
+	BPF_TASK_ITER_TID,
+	BPF_TASK_ITER_TGID,
+};
+
 struct bpf_iter_aux_info {
 	/* for map_elem iter */
 	struct bpf_map *map;
@@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info {
 		struct cgroup *start; /* starting cgroup */
 		enum bpf_cgroup_iter_order order;
 	} cgroup;
+	struct {
+		enum bpf_iter_task_type	type;
+		u32 pid;
+	} task;
 };
 
 typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d6bd10759eaf..455b21a53aac 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -110,6 +110,12 @@ union bpf_iter_link_info {
 		__u32	cgroup_fd;
 		__u64	cgroup_id;
 	} cgroup;
+	/* Parameters of task iterators. */
+	struct {
+		__u32	tid;
+		__u32	pid;
+		__u32	pid_fd;
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8c921799def4..8b2f47e7139d 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -12,6 +12,9 @@
 
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
+	enum bpf_iter_task_type	type;
+	u32 pid;
+	u32 pid_visiting;
 };
 
 struct bpf_iter_seq_task_info {
@@ -22,18 +25,115 @@ struct bpf_iter_seq_task_info {
 	u32 tid;
 };
 
-static struct task_struct *task_seq_get_next(struct pid_namespace *ns,
+static struct task_struct *task_group_seq_get_next(struct bpf_iter_seq_task_common *common,
+						   u32 *tid,
+						   bool skip_if_dup_files)
+{
+	struct task_struct *task, *next_task;
+	struct pid *pid;
+	u32 saved_tid;
+
+	if (!*tid) {
+		/* The first time, the iterator calls this function. */
+		pid = find_pid_ns(common->pid, common->ns);
+		if (!pid)
+			return NULL;
+
+		task = get_pid_task(pid, PIDTYPE_TGID);
+		if (!task)
+			return NULL;
+
+		*tid = common->pid;
+		common->pid_visiting = common->pid;
+
+		return task;
+	}
+
+	/* If the control returns to user space and comes back to the
+	 * kernel again, *tid and common->pid_visiting should be the
+	 * same for task_seq_start() to pick up the correct task.
+	 */
+	if (*tid == common->pid_visiting) {
+		pid = find_pid_ns(common->pid_visiting, common->ns);
+		task = get_pid_task(pid, PIDTYPE_PID);
+
+		return task;
+	}
+
+	pid = find_pid_ns(common->pid_visiting, common->ns);
+	if (!pid)
+		return NULL;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return NULL;
+
+retry:
+	if (!pid_alive(task)) {
+		put_task_struct(task);
+		return NULL;
+	}
+
+	next_task = next_thread(task);
+	put_task_struct(task);
+	if (!next_task)
+		return NULL;
+
+	saved_tid = *tid;
+	*tid = __task_pid_nr_ns(next_task, PIDTYPE_PID, common->ns);
+	if (!*tid || *tid == common->pid) {
+		/* Run out of tasks of a process.  The tasks of a
+		 * thread_group are linked as circular linked list.
+		 */
+		*tid = saved_tid;
+		return NULL;
+	}
+
+	get_task_struct(next_task);
+	common->pid_visiting = *tid;
+
+	if (skip_if_dup_files && task->files == task->group_leader->files) {
+		task = next_task;
+		goto retry;
+	}
+
+	return next_task;
+}
+
+static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *common,
 					     u32 *tid,
 					     bool skip_if_dup_files)
 {
 	struct task_struct *task = NULL;
 	struct pid *pid;
 
+	if (common->type == BPF_TASK_ITER_TID) {
+		if (*tid && *tid != common->pid)
+			return NULL;
+		rcu_read_lock();
+		pid = find_pid_ns(common->pid, common->ns);
+		if (pid) {
+			task = get_pid_task(pid, PIDTYPE_TGID);
+			*tid = common->pid;
+		}
+		rcu_read_unlock();
+
+		return task;
+	}
+
+	if (common->type == BPF_TASK_ITER_TGID) {
+		rcu_read_lock();
+		task = task_group_seq_get_next(common, tid, skip_if_dup_files);
+		rcu_read_unlock();
+
+		return task;
+	}
+
 	rcu_read_lock();
 retry:
-	pid = find_ge_pid(*tid, ns);
+	pid = find_ge_pid(*tid, common->ns);
 	if (pid) {
-		*tid = pid_nr_ns(pid, ns);
+		*tid = pid_nr_ns(pid, common->ns);
 		task = get_pid_task(pid, PIDTYPE_PID);
 		if (!task) {
 			++*tid;
@@ -56,7 +156,7 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_task_info *info = seq->private;
 	struct task_struct *task;
 
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -73,7 +173,7 @@ static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	++*pos;
 	++info->tid;
 	put_task_struct((struct task_struct *)v);
-	task = task_seq_get_next(info->common.ns, &info->tid, false);
+	task = task_seq_get_next(&info->common, &info->tid, false);
 	if (!task)
 		return NULL;
 
@@ -117,6 +217,41 @@ static void task_seq_stop(struct seq_file *seq, void *v)
 		put_task_struct((struct task_struct *)v);
 }
 
+static int bpf_iter_attach_task(struct bpf_prog *prog,
+				union bpf_iter_link_info *linfo,
+				struct bpf_iter_aux_info *aux)
+{
+	unsigned int flags;
+	struct pid *pid;
+	pid_t tgid;
+
+	if ((!!linfo->task.tid + !!linfo->task.pid + !!linfo->task.pid_fd) > 1)
+		return -EINVAL;
+
+	aux->task.type = BPF_TASK_ITER_ALL;
+	if (linfo->task.tid != 0) {
+		aux->task.type = BPF_TASK_ITER_TID;
+		aux->task.pid = linfo->task.tid;
+	}
+	if (linfo->task.pid != 0) {
+		aux->task.type = BPF_TASK_ITER_TGID;
+		aux->task.pid = linfo->task.pid;
+	}
+	if (linfo->task.pid_fd != 0) {
+		aux->task.type = BPF_TASK_ITER_TGID;
+
+		pid = pidfd_get_pid(linfo->task.pid_fd, &flags);
+		if (IS_ERR(pid))
+			return PTR_ERR(pid);
+
+		tgid = pid_nr_ns(pid, task_active_pid_ns(current));
+		aux->task.pid = tgid;
+		put_pid(pid);
+	}
+
+	return 0;
+}
+
 static const struct seq_operations task_seq_ops = {
 	.start	= task_seq_start,
 	.next	= task_seq_next,
@@ -137,8 +272,7 @@ struct bpf_iter_seq_task_file_info {
 static struct file *
 task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 	struct task_struct *curr_task;
 	unsigned int curr_fd = info->fd;
 
@@ -151,21 +285,18 @@ again:
 		curr_task = info->task;
 		curr_fd = info->fd;
 	} else {
-                curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
                 if (!curr_task) {
                         info->task = NULL;
-                        info->tid = curr_tid;
                         return NULL;
                 }
 
-                /* set info->task and info->tid */
+		/* set info->task */
 		info->task = curr_task;
-		if (curr_tid == info->tid) {
+		if (saved_tid == info->tid)
 			curr_fd = info->fd;
-		} else {
-			info->tid = curr_tid;
+		else
 			curr_fd = 0;
-		}
 	}
 
 	rcu_read_lock();
@@ -186,9 +317,15 @@ again:
 	/* the current task is done, go to the next task */
 	rcu_read_unlock();
 	put_task_struct(curr_task);
+
+	if (info->common.type == BPF_TASK_ITER_TID) {
+		info->task = NULL;
+		return NULL;
+	}
+
 	info->task = NULL;
 	info->fd = 0;
-	curr_tid = ++(info->tid);
+	saved_tid = ++(info->tid);
 	goto again;
 }
 
@@ -269,6 +406,9 @@ static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux)
 	struct bpf_iter_seq_task_common *common = priv_data;
 
 	common->ns = get_pid_ns(task_active_pid_ns(current));
+	common->type = aux->task.type;
+	common->pid = aux->task.pid;
+
 	return 0;
 }
 
@@ -307,11 +447,10 @@ enum bpf_task_vma_iter_find_op {
 static struct vm_area_struct *
 task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 {
-	struct pid_namespace *ns = info->common.ns;
 	enum bpf_task_vma_iter_find_op op;
 	struct vm_area_struct *curr_vma;
 	struct task_struct *curr_task;
-	u32 curr_tid = info->tid;
+	u32 saved_tid = info->tid;
 
 	/* If this function returns a non-NULL vma, it holds a reference to
 	 * the task_struct, and holds read lock on vma->mm->mmap_lock.
@@ -371,14 +510,13 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info)
 		}
 	} else {
 again:
-		curr_task = task_seq_get_next(ns, &curr_tid, true);
+		curr_task = task_seq_get_next(&info->common, &info->tid, true);
 		if (!curr_task) {
-			info->tid = curr_tid + 1;
+			info->tid++;
 			goto finish;
 		}
 
-		if (curr_tid != info->tid) {
-			info->tid = curr_tid;
+		if (saved_tid != info->tid) {
 			/* new task, process the first vma */
 			op = task_vma_iter_first_vma;
 		} else {
@@ -430,9 +568,12 @@ again:
 	return curr_vma;
 
 next_task:
+	if (info->common.type == BPF_TASK_ITER_TID)
+		goto finish;
+
 	put_task_struct(curr_task);
 	info->task = NULL;
-	curr_tid++;
+	info->tid++;
 	goto again;
 
 finish:
@@ -533,6 +674,7 @@ static const struct bpf_iter_seq_info task_seq_info = {
 
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 1,
 	.ctx_arg_info		= {
@@ -551,6 +693,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = {
 
 static struct bpf_iter_reg task_file_reg_info = {
 	.target			= "task_file",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
@@ -571,6 +714,7 @@ static const struct bpf_iter_seq_info task_vma_seq_info = {
 
 static struct bpf_iter_reg task_vma_reg_info = {
 	.target			= "task_vma",
+	.attach_target		= bpf_iter_attach_task,
 	.feature		= BPF_ITER_RESCHED,
 	.ctx_arg_info_size	= 2,
 	.ctx_arg_info		= {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d6bd10759eaf..455b21a53aac 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -110,6 +110,12 @@ union bpf_iter_link_info {
 		__u32	cgroup_fd;
 		__u64	cgroup_id;
 	} cgroup;
+	/* Parameters of task iterators. */
+	struct {
+		__u32	tid;
+		__u32	pid;
+		__u32	pid_fd;
+	} task;
 };
 
 /* BPF syscall commands, see bpf(2) man-page for more details. */
-- 
cgit v1.2.3


From 21fb6f2aa3890b0d0abf88b7756d0098e9367a7c Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 26 Sep 2022 11:49:54 -0700
Subject: bpf: Handle bpf_link_info for the parameterized task BPF iterators.

Add new fields to bpf_link_info that users can query it through
bpf_obj_get_info_by_fd().

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20220926184957.208194-3-kuifeng@fb.com
---
 include/uapi/linux/bpf.h       |  4 ++++
 kernel/bpf/task_iter.c         | 18 ++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  4 ++++
 3 files changed, 26 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 455b21a53aac..3075018a4ef8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6265,6 +6265,10 @@ struct bpf_link_info {
 					__u64 cgroup_id;
 					__u32 order;
 				} cgroup;
+				struct {
+					__u32 tid;
+					__u32 pid;
+				} task;
 			};
 		} iter;
 		struct  {
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 8b2f47e7139d..46f836be22e2 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -672,6 +672,21 @@ static const struct bpf_iter_seq_info task_seq_info = {
 	.seq_priv_size		= sizeof(struct bpf_iter_seq_task_info),
 };
 
+static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info)
+{
+	switch (aux->task.type) {
+	case BPF_TASK_ITER_TID:
+		info->iter.task.tid = aux->task.pid;
+		break;
+	case BPF_TASK_ITER_TGID:
+		info->iter.task.pid = aux->task.pid;
+		break;
+	default:
+		break;
+	}
+	return 0;
+}
+
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
 	.attach_target		= bpf_iter_attach_task,
@@ -682,6 +697,7 @@ static struct bpf_iter_reg task_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -703,6 +719,7 @@ static struct bpf_iter_reg task_file_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_file_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -724,6 +741,7 @@ static struct bpf_iter_reg task_vma_reg_info = {
 		  PTR_TO_BTF_ID_OR_NULL },
 	},
 	.seq_info		= &task_vma_seq_info,
+	.fill_link_info		= bpf_iter_fill_link_info,
 };
 
 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 455b21a53aac..3075018a4ef8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6265,6 +6265,10 @@ struct bpf_link_info {
 					__u64 cgroup_id;
 					__u32 order;
 				} cgroup;
+				struct {
+					__u32 tid;
+					__u32 pid;
+				} task;
 			};
 		} iter;
 		struct  {
-- 
cgit v1.2.3


From 2c4fe44fb020f3cce904da2ba9e42bb1c118e8a3 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 26 Sep 2022 11:49:55 -0700
Subject: bpf: Handle show_fdinfo for the parameterized task BPF iterators

Show information of iterators in the respective files under
/proc/<pid>/fdinfo/.

For example, for a task file iterator with 1723 as the value of tid
parameter, its fdinfo would look like the following lines.

    pos:    0
    flags:  02000000
    mnt_id: 14
    ino:    38
    link_type:      iter
    link_id:        51
    prog_tag:       a590ac96db22b825
    prog_id:        299
    target_name:    task_file
    task_type:      TID
    tid: 1723

This patch add the last three fields.  task_type is the type of the
task parameter.  TID means the iterator visit only the thread
specified by tid.  The value of tid in the above example is 1723.  For
the case of PID task_type, it means the iterator visits only threads
of a process and will show the pid value of the process instead of a
tid.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20220926184957.208194-4-kuifeng@fb.com
---
 kernel/bpf/task_iter.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 46f836be22e2..67e03e1833ba 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -10,6 +10,12 @@
 #include <linux/btf_ids.h>
 #include "mmap_unlock_work.h"
 
+static const char * const iter_task_type_names[] = {
+	"ALL",
+	"TID",
+	"PID",
+};
+
 struct bpf_iter_seq_task_common {
 	struct pid_namespace *ns;
 	enum bpf_iter_task_type	type;
@@ -687,6 +693,15 @@ static int bpf_iter_fill_link_info(const struct bpf_iter_aux_info *aux, struct b
 	return 0;
 }
 
+static void bpf_iter_task_show_fdinfo(const struct bpf_iter_aux_info *aux, struct seq_file *seq)
+{
+	seq_printf(seq, "task_type:\t%s\n", iter_task_type_names[aux->task.type]);
+	if (aux->task.type == BPF_TASK_ITER_TID)
+		seq_printf(seq, "tid:\t%u\n", aux->task.pid);
+	else if (aux->task.type == BPF_TASK_ITER_TGID)
+		seq_printf(seq, "pid:\t%u\n", aux->task.pid);
+}
+
 static struct bpf_iter_reg task_reg_info = {
 	.target			= "task",
 	.attach_target		= bpf_iter_attach_task,
@@ -698,6 +713,7 @@ static struct bpf_iter_reg task_reg_info = {
 	},
 	.seq_info		= &task_seq_info,
 	.fill_link_info		= bpf_iter_fill_link_info,
+	.show_fdinfo		= bpf_iter_task_show_fdinfo,
 };
 
 static const struct bpf_iter_seq_info task_file_seq_info = {
@@ -720,6 +736,7 @@ static struct bpf_iter_reg task_file_reg_info = {
 	},
 	.seq_info		= &task_file_seq_info,
 	.fill_link_info		= bpf_iter_fill_link_info,
+	.show_fdinfo		= bpf_iter_task_show_fdinfo,
 };
 
 static const struct bpf_iter_seq_info task_vma_seq_info = {
@@ -742,6 +759,7 @@ static struct bpf_iter_reg task_vma_reg_info = {
 	},
 	.seq_info		= &task_vma_seq_info,
 	.fill_link_info		= bpf_iter_fill_link_info,
+	.show_fdinfo		= bpf_iter_task_show_fdinfo,
 };
 
 BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
-- 
cgit v1.2.3


From b3e1331eb925a45df1cc5d02a725e5ea70da0e2e Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 26 Sep 2022 11:49:56 -0700
Subject: selftests/bpf: Test parameterized task BPF iterators.

Test iterators of vma, files and tasks.

Ensure the API works appropriately to visit all tasks,
tasks in a process, or a particular task.

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20220926184957.208194-5-kuifeng@fb.com
---
 tools/testing/selftests/bpf/prog_tests/bpf_iter.c  | 282 +++++++++++++++++++--
 tools/testing/selftests/bpf/prog_tests/btf_dump.c  |   2 +-
 tools/testing/selftests/bpf/progs/bpf_iter_task.c  |   9 +
 .../selftests/bpf/progs/bpf_iter_task_file.c       |   9 +-
 .../selftests/bpf/progs/bpf_iter_task_vma.c        |   7 +-
 .../selftests/bpf/progs/bpf_iter_vma_offset.c      |  37 +++
 6 files changed, 322 insertions(+), 24 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
index e89685bd587c..3369c5ec3a17 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
 #include <test_progs.h>
+#include <unistd.h>
+#include <sys/syscall.h>
 #include "bpf_iter_ipv6_route.skel.h"
 #include "bpf_iter_netlink.skel.h"
 #include "bpf_iter_bpf_map.skel.h"
@@ -14,6 +16,7 @@
 #include "bpf_iter_udp4.skel.h"
 #include "bpf_iter_udp6.skel.h"
 #include "bpf_iter_unix.skel.h"
+#include "bpf_iter_vma_offset.skel.h"
 #include "bpf_iter_test_kern1.skel.h"
 #include "bpf_iter_test_kern2.skel.h"
 #include "bpf_iter_test_kern3.skel.h"
@@ -43,13 +46,13 @@ static void test_btf_id_or_null(void)
 	}
 }
 
-static void do_dummy_read(struct bpf_program *prog)
+static void do_dummy_read_opts(struct bpf_program *prog, struct bpf_iter_attach_opts *opts)
 {
 	struct bpf_link *link;
 	char buf[16] = {};
 	int iter_fd, len;
 
-	link = bpf_program__attach_iter(prog, NULL);
+	link = bpf_program__attach_iter(prog, opts);
 	if (!ASSERT_OK_PTR(link, "attach_iter"))
 		return;
 
@@ -68,6 +71,11 @@ free_link:
 	bpf_link__destroy(link);
 }
 
+static void do_dummy_read(struct bpf_program *prog)
+{
+	do_dummy_read_opts(prog, NULL);
+}
+
 static void do_read_map_iter_fd(struct bpf_object_skeleton **skel, struct bpf_program *prog,
 				struct bpf_map *map)
 {
@@ -167,19 +175,140 @@ static void test_bpf_map(void)
 	bpf_iter_bpf_map__destroy(skel);
 }
 
-static void test_task(void)
+static int pidfd_open(pid_t pid, unsigned int flags)
+{
+	return syscall(SYS_pidfd_open, pid, flags);
+}
+
+static void check_bpf_link_info(const struct bpf_program *prog)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+	struct bpf_link_info info = {};
+	struct bpf_link *link;
+	__u32 info_len;
+	int err;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.tid = getpid();
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	link = bpf_program__attach_iter(prog, &opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		return;
+
+	info_len = sizeof(info);
+	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
+	ASSERT_OK(err, "bpf_obj_get_info_by_fd");
+	ASSERT_EQ(info.iter.task.tid, getpid(), "check_task_tid");
+
+	bpf_link__destroy(link);
+}
+
+static pthread_mutex_t do_nothing_mutex;
+
+static void *do_nothing_wait(void *arg)
+{
+	pthread_mutex_lock(&do_nothing_mutex);
+	pthread_mutex_unlock(&do_nothing_mutex);
+
+	pthread_exit(arg);
+}
+
+static void test_task_common_nocheck(struct bpf_iter_attach_opts *opts,
+				     int *num_unknown, int *num_known)
 {
 	struct bpf_iter_task *skel;
+	pthread_t thread_id;
+	void *ret;
 
 	skel = bpf_iter_task__open_and_load();
 	if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
 		return;
 
-	do_dummy_read(skel->progs.dump_task);
+	ASSERT_OK(pthread_mutex_lock(&do_nothing_mutex), "pthread_mutex_lock");
+
+	ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing_wait, NULL),
+		  "pthread_create");
+
+	skel->bss->tid = getpid();
+
+	do_dummy_read_opts(skel->progs.dump_task, opts);
+
+	*num_unknown = skel->bss->num_unknown_tid;
+	*num_known = skel->bss->num_known_tid;
+
+	ASSERT_OK(pthread_mutex_unlock(&do_nothing_mutex), "pthread_mutex_unlock");
+	ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL,
+		     "pthread_join");
 
 	bpf_iter_task__destroy(skel);
 }
 
+static void test_task_common(struct bpf_iter_attach_opts *opts, int num_unknown, int num_known)
+{
+	int num_unknown_tid, num_known_tid;
+
+	test_task_common_nocheck(opts, &num_unknown_tid, &num_known_tid);
+	ASSERT_EQ(num_unknown_tid, num_unknown, "check_num_unknown_tid");
+	ASSERT_EQ(num_known_tid, num_known, "check_num_known_tid");
+}
+
+static void test_task_tid(void)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+	int num_unknown_tid, num_known_tid;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.tid = getpid();
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	test_task_common(&opts, 0, 1);
+
+	linfo.task.tid = 0;
+	linfo.task.pid = getpid();
+	test_task_common(&opts, 1, 1);
+
+	test_task_common_nocheck(NULL, &num_unknown_tid, &num_known_tid);
+	ASSERT_GT(num_unknown_tid, 1, "check_num_unknown_tid");
+	ASSERT_EQ(num_known_tid, 1, "check_num_known_tid");
+}
+
+static void test_task_pid(void)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.pid = getpid();
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	test_task_common(&opts, 1, 1);
+}
+
+static void test_task_pidfd(void)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+	int pidfd;
+
+	pidfd = pidfd_open(getpid(), 0);
+	if (!ASSERT_GT(pidfd, 0, "pidfd_open"))
+		return;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.pid_fd = pidfd;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	test_task_common(&opts, 1, 1);
+
+	close(pidfd);
+}
+
 static void test_task_sleepable(void)
 {
 	struct bpf_iter_task *skel;
@@ -212,14 +341,11 @@ static void test_task_stack(void)
 	bpf_iter_task_stack__destroy(skel);
 }
 
-static void *do_nothing(void *arg)
-{
-	pthread_exit(arg);
-}
-
 static void test_task_file(void)
 {
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
 	struct bpf_iter_task_file *skel;
+	union bpf_iter_link_info linfo;
 	pthread_t thread_id;
 	void *ret;
 
@@ -229,19 +355,36 @@ static void test_task_file(void)
 
 	skel->bss->tgid = getpid();
 
-	if (!ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing, NULL),
-		  "pthread_create"))
-		goto done;
+	ASSERT_OK(pthread_mutex_lock(&do_nothing_mutex), "pthread_mutex_lock");
 
-	do_dummy_read(skel->progs.dump_task_file);
+	ASSERT_OK(pthread_create(&thread_id, NULL, &do_nothing_wait, NULL),
+		  "pthread_create");
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.tid = getpid();
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
 
-	if (!ASSERT_FALSE(pthread_join(thread_id, &ret) || ret != NULL,
-		  "pthread_join"))
-		goto done;
+	do_dummy_read_opts(skel->progs.dump_task_file, &opts);
 
 	ASSERT_EQ(skel->bss->count, 0, "check_count");
+	ASSERT_EQ(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count");
+
+	skel->bss->last_tgid = 0;
+	skel->bss->count = 0;
+	skel->bss->unique_tgid_count = 0;
+
+	do_dummy_read(skel->progs.dump_task_file);
+
+	ASSERT_EQ(skel->bss->count, 0, "check_count");
+	ASSERT_GT(skel->bss->unique_tgid_count, 1, "check_unique_tgid_count");
+
+	check_bpf_link_info(skel->progs.dump_task_file);
+
+	ASSERT_OK(pthread_mutex_unlock(&do_nothing_mutex), "pthread_mutex_unlock");
+	ASSERT_OK(pthread_join(thread_id, &ret), "pthread_join");
+	ASSERT_NULL(ret, "pthread_join");
 
-done:
 	bpf_iter_task_file__destroy(skel);
 }
 
@@ -1249,7 +1392,7 @@ static void str_strip_first_line(char *str)
 	*dst = '\0';
 }
 
-static void test_task_vma(void)
+static void test_task_vma_common(struct bpf_iter_attach_opts *opts)
 {
 	int err, iter_fd = -1, proc_maps_fd = -1;
 	struct bpf_iter_task_vma *skel;
@@ -1261,13 +1404,14 @@ static void test_task_vma(void)
 		return;
 
 	skel->bss->pid = getpid();
+	skel->bss->one_task = opts ? 1 : 0;
 
 	err = bpf_iter_task_vma__load(skel);
 	if (!ASSERT_OK(err, "bpf_iter_task_vma__load"))
 		goto out;
 
 	skel->links.proc_maps = bpf_program__attach_iter(
-		skel->progs.proc_maps, NULL);
+		skel->progs.proc_maps, opts);
 
 	if (!ASSERT_OK_PTR(skel->links.proc_maps, "bpf_program__attach_iter")) {
 		skel->links.proc_maps = NULL;
@@ -1291,6 +1435,8 @@ static void test_task_vma(void)
 			goto out;
 		len += err;
 	}
+	if (opts)
+		ASSERT_EQ(skel->bss->one_task_error, 0, "unexpected task");
 
 	/* read CMP_BUFFER_SIZE (1kB) from /proc/pid/maps */
 	snprintf(maps_path, 64, "/proc/%u/maps", skel->bss->pid);
@@ -1306,6 +1452,9 @@ static void test_task_vma(void)
 	str_strip_first_line(proc_maps_output);
 
 	ASSERT_STREQ(task_vma_output, proc_maps_output, "compare_output");
+
+	check_bpf_link_info(skel->progs.proc_maps);
+
 out:
 	close(proc_maps_fd);
 	close(iter_fd);
@@ -1325,8 +1474,93 @@ void test_bpf_sockmap_map_iter_fd(void)
 	bpf_iter_sockmap__destroy(skel);
 }
 
+static void test_task_vma(void)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.tid = getpid();
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	test_task_vma_common(&opts);
+	test_task_vma_common(NULL);
+}
+
+/* uprobe attach point */
+static noinline int trigger_func(int arg)
+{
+	asm volatile ("");
+	return arg + 1;
+}
+
+static void test_task_vma_offset_common(struct bpf_iter_attach_opts *opts, bool one_proc)
+{
+	struct bpf_iter_vma_offset *skel;
+	struct bpf_link *link;
+	char buf[16] = {};
+	int iter_fd, len;
+	int pgsz, shift;
+
+	skel = bpf_iter_vma_offset__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_vma_offset__open_and_load"))
+		return;
+
+	skel->bss->pid = getpid();
+	skel->bss->address = (uintptr_t)trigger_func;
+	for (pgsz = getpagesize(), shift = 0; pgsz > 1; pgsz >>= 1, shift++)
+		;
+	skel->bss->page_shift = shift;
+
+	link = bpf_program__attach_iter(skel->progs.get_vma_offset, opts);
+	if (!ASSERT_OK_PTR(link, "attach_iter"))
+		return;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (!ASSERT_GT(iter_fd, 0, "create_iter"))
+		goto exit;
+
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	buf[15] = 0;
+	ASSERT_EQ(strcmp(buf, "OK\n"), 0, "strcmp");
+
+	ASSERT_EQ(skel->bss->offset, get_uprobe_offset(trigger_func), "offset");
+	if (one_proc)
+		ASSERT_EQ(skel->bss->unique_tgid_cnt, 1, "unique_tgid_count");
+	else
+		ASSERT_GT(skel->bss->unique_tgid_cnt, 1, "unique_tgid_count");
+
+	close(iter_fd);
+
+exit:
+	bpf_link__destroy(link);
+}
+
+static void test_task_vma_offset(void)
+{
+	LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	union bpf_iter_link_info linfo;
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.task.pid = getpid();
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+
+	test_task_vma_offset_common(&opts, true);
+
+	linfo.task.pid = 0;
+	linfo.task.tid = getpid();
+	test_task_vma_offset_common(&opts, true);
+
+	test_task_vma_offset_common(NULL, false);
+}
+
 void test_bpf_iter(void)
 {
+	ASSERT_OK(pthread_mutex_init(&do_nothing_mutex, NULL), "pthread_mutex_init");
+
 	if (test__start_subtest("btf_id_or_null"))
 		test_btf_id_or_null();
 	if (test__start_subtest("ipv6_route"))
@@ -1335,8 +1569,12 @@ void test_bpf_iter(void)
 		test_netlink();
 	if (test__start_subtest("bpf_map"))
 		test_bpf_map();
-	if (test__start_subtest("task"))
-		test_task();
+	if (test__start_subtest("task_tid"))
+		test_task_tid();
+	if (test__start_subtest("task_pid"))
+		test_task_pid();
+	if (test__start_subtest("task_pidfd"))
+		test_task_pidfd();
 	if (test__start_subtest("task_sleepable"))
 		test_task_sleepable();
 	if (test__start_subtest("task_stack"))
@@ -1397,4 +1635,6 @@ void test_bpf_iter(void)
 		test_ksym_iter();
 	if (test__start_subtest("bpf_sockmap_map_iter_fd"))
 		test_bpf_sockmap_map_iter_fd();
+	if (test__start_subtest("vma_offset"))
+		test_task_vma_offset();
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
index b1ca954ed1e5..24da335482d4 100644
--- a/tools/testing/selftests/bpf/prog_tests/btf_dump.c
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -764,7 +764,7 @@ static void test_btf_dump_struct_data(struct btf *btf, struct btf_dump *d,
 
 	/* union with nested struct */
 	TEST_BTF_DUMP_DATA(btf, d, "union", str, union bpf_iter_link_info, BTF_F_COMPACT,
-			   "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},}",
+			   "(union bpf_iter_link_info){.map = (struct){.map_fd = (__u32)1,},.cgroup = (struct){.order = (enum bpf_cgroup_iter_order)BPF_CGROUP_ITER_SELF_ONLY,.cgroup_fd = (__u32)1,},.task = (struct){.tid = (__u32)1,.pid = (__u32)1,},}",
 			   { .cgroup = { .order = 1, .cgroup_fd = 1, }});
 
 	/* struct skb with nested structs/unions; because type output is so
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
index d22741272692..96131b9a1caa 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -6,6 +6,10 @@
 
 char _license[] SEC("license") = "GPL";
 
+uint32_t tid = 0;
+int num_unknown_tid = 0;
+int num_known_tid = 0;
+
 SEC("iter/task")
 int dump_task(struct bpf_iter__task *ctx)
 {
@@ -18,6 +22,11 @@ int dump_task(struct bpf_iter__task *ctx)
 		return 0;
 	}
 
+	if (task->pid != tid)
+		num_unknown_tid++;
+	else
+		num_known_tid++;
+
 	if (ctx->meta->seq_num == 0)
 		BPF_SEQ_PRINTF(seq, "    tgid      gid\n");
 
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
index 6e7b400888fe..b0255080662d 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -7,14 +7,16 @@ char _license[] SEC("license") = "GPL";
 
 int count = 0;
 int tgid = 0;
+int last_tgid = 0;
+int unique_tgid_count = 0;
 
 SEC("iter/task_file")
 int dump_task_file(struct bpf_iter__task_file *ctx)
 {
 	struct seq_file *seq = ctx->meta->seq;
 	struct task_struct *task = ctx->task;
-	__u32 fd = ctx->fd;
 	struct file *file = ctx->file;
+	__u32 fd = ctx->fd;
 
 	if (task == (void *)0 || file == (void *)0)
 		return 0;
@@ -27,6 +29,11 @@ int dump_task_file(struct bpf_iter__task_file *ctx)
 	if (tgid == task->tgid && task->tgid != task->pid)
 		count++;
 
+	if (last_tgid != task->tgid) {
+		last_tgid = task->tgid;
+		unique_tgid_count++;
+	}
+
 	BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
 		       (long)file->f_op);
 	return 0;
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
index 4ea6a37d1345..dd923dc637d5 100644
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_vma.c
@@ -20,6 +20,8 @@ char _license[] SEC("license") = "GPL";
 #define D_PATH_BUF_SIZE 1024
 char d_path_buf[D_PATH_BUF_SIZE] = {};
 __u32 pid = 0;
+__u32 one_task = 0;
+__u32 one_task_error = 0;
 
 SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 {
@@ -33,8 +35,11 @@ SEC("iter/task_vma") int proc_maps(struct bpf_iter__task_vma *ctx)
 		return 0;
 
 	file = vma->vm_file;
-	if (task->tgid != pid)
+	if (task->tgid != pid) {
+		if (one_task)
+			one_task_error = 1;
 		return 0;
+	}
 	perm_str[0] = (vma->vm_flags & VM_READ) ? 'r' : '-';
 	perm_str[1] = (vma->vm_flags & VM_WRITE) ? 'w' : '-';
 	perm_str[2] = (vma->vm_flags & VM_EXEC) ? 'x' : '-';
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c b/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c
new file mode 100644
index 000000000000..ee7455d2623a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_vma_offset.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u32 unique_tgid_cnt = 0;
+uintptr_t address = 0;
+uintptr_t offset = 0;
+__u32 last_tgid = 0;
+__u32 pid = 0;
+__u32 page_shift = 0;
+
+SEC("iter/task_vma")
+int get_vma_offset(struct bpf_iter__task_vma *ctx)
+{
+	struct vm_area_struct *vma = ctx->vma;
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+
+	if (task == NULL || vma == NULL)
+		return 0;
+
+	if (last_tgid != task->tgid)
+		unique_tgid_cnt++;
+	last_tgid = task->tgid;
+
+	if (task->tgid != pid)
+		return 0;
+
+	if (vma->vm_start <= address && vma->vm_end > address) {
+		offset = address - vma->vm_start + (vma->vm_pgoff << page_shift);
+		BPF_SEQ_PRINTF(seq, "OK\n");
+	}
+	return 0;
+}
-- 
cgit v1.2.3


From 6bdb6d6be019f697296f52c37865dd7b0ce80750 Mon Sep 17 00:00:00 2001
From: Kui-Feng Lee <kuifeng@fb.com>
Date: Mon, 26 Sep 2022 11:49:57 -0700
Subject: bpftool: Show parameters of BPF task iterators.

Show tid or pid of iterators if giving an argument of tid or pid

For example, the command `bpftool link list` may list following
lines.

1: iter  prog 2  target_name bpf_map
2: iter  prog 3  target_name bpf_prog
33: iter  prog 225  target_name task_file  tid 1644
        pids test_progs(1644)

Link 33 is a task_file iterator with tid 1644.  For now, only targets
of task, task_file and task_vma may be with tid or pid to filter out
tasks other than those belonging to a process (pid) or a thread (tid).

Signed-off-by: Kui-Feng Lee <kuifeng@fb.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Quentin Monnet <quentin@isovalent.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/bpf/20220926184957.208194-6-kuifeng@fb.com
---
 tools/bpf/bpftool/link.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c
index ef0dc2f8d5a2..2863639706dd 100644
--- a/tools/bpf/bpftool/link.c
+++ b/tools/bpf/bpftool/link.c
@@ -106,6 +106,13 @@ static const char *cgroup_order_string(__u32 order)
 	}
 }
 
+static bool is_iter_task_target(const char *target_name)
+{
+	return strcmp(target_name, "task") == 0 ||
+		strcmp(target_name, "task_file") == 0 ||
+		strcmp(target_name, "task_vma") == 0;
+}
+
 static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr)
 {
 	const char *target_name = u64_to_ptr(info->iter.target_name);
@@ -114,6 +121,12 @@ static void show_iter_json(struct bpf_link_info *info, json_writer_t *wtr)
 
 	if (is_iter_map_target(target_name))
 		jsonw_uint_field(wtr, "map_id", info->iter.map.map_id);
+	else if (is_iter_task_target(target_name)) {
+		if (info->iter.task.tid)
+			jsonw_uint_field(wtr, "tid", info->iter.task.tid);
+		else if (info->iter.task.pid)
+			jsonw_uint_field(wtr, "pid", info->iter.task.pid);
+	}
 
 	if (is_iter_cgroup_target(target_name)) {
 		jsonw_lluint_field(wtr, "cgroup_id", info->iter.cgroup.cgroup_id);
@@ -237,6 +250,12 @@ static void show_iter_plain(struct bpf_link_info *info)
 
 	if (is_iter_map_target(target_name))
 		printf("map_id %u  ", info->iter.map.map_id);
+	else if (is_iter_task_target(target_name)) {
+		if (info->iter.task.tid)
+			printf("tid %u ", info->iter.task.tid);
+		else if (info->iter.task.pid)
+			printf("pid %u ", info->iter.task.pid);
+	}
 
 	if (is_iter_cgroup_target(target_name)) {
 		printf("cgroup_id %llu  ", info->iter.cgroup.cgroup_id);
-- 
cgit v1.2.3


From 4014e916fd0cea19b559890588d70b4e9f44b87f Mon Sep 17 00:00:00 2001
From: Jernej Skrabec <jernej.skrabec@gmail.com>
Date: Wed, 28 Sep 2022 22:01:22 +0200
Subject: clk: sunxi-ng: h6: Fix default PLL GPU rate

In commit 4167ac8a657e ("clk: sunxi-ng: sun50i: h6: Modify GPU clock
configuration to support DFS") divider M0 was forced to be 1 in order to
support DFS. However, that left N as it is, at high value of 36. On
boards without devfreq enabled (all of them in kernel 6.0), this
effectively sets GPU frequency to 864 MHz. This is about 100 MHz above
maximum supported frequency.

In order to fix this, let's set N to 18 (register value 17). That way
default frequency of 432 MHz is preserved.

Fixes: 4167ac8a657e ("clk: sunxi-ng: sun50i: h6: Modify GPU clock configuration to support DFS")
Signed-off-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Link: https://lore.kernel.org/r/20220928200122.3963509-1-jernej.skrabec@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/sunxi-ng/ccu-sun50i-h6.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-h6.c b/drivers/clk/sunxi-ng/ccu-sun50i-h6.c
index 30056da3e0af..42568c616181 100644
--- a/drivers/clk/sunxi-ng/ccu-sun50i-h6.c
+++ b/drivers/clk/sunxi-ng/ccu-sun50i-h6.c
@@ -1191,9 +1191,13 @@ static int sun50i_h6_ccu_probe(struct platform_device *pdev)
 	if (IS_ERR(reg))
 		return PTR_ERR(reg);
 
-	/* Force PLL_GPU output divider bits to 0 */
+	/*
+	 * Force PLL_GPU output divider bits to 0 and adjust
+	 * multiplier to sensible default value of 432 MHz.
+	 */
 	val = readl(reg + SUN50I_H6_PLL_GPU_REG);
-	val &= ~BIT(0);
+	val &= ~(GENMASK(15, 8) | BIT(0));
+	val |= 17 << 8;
 	writel(val, reg + SUN50I_H6_PLL_GPU_REG);
 
 	/* Force GPU_CLK divider bits to 0 */
-- 
cgit v1.2.3


From 38e35e1d0cee3432baadfd6900e1d05a3419eda6 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:01 +0800
Subject: selftests/bpf: Convert sockmap_basic test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-2-git-send-email-wangyufen@huawei.com
---
 .../selftests/bpf/prog_tests/sockmap_basic.c       | 87 ++++++++--------------
 1 file changed, 33 insertions(+), 54 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index cec5c0882372..0aa088900699 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -27,21 +27,21 @@ static int connected_socket_v4(void)
 	int s, repair, err;
 
 	s = socket(AF_INET, SOCK_STREAM, 0);
-	if (CHECK_FAIL(s == -1))
+	if (!ASSERT_GE(s, 0, "socket"))
 		goto error;
 
 	repair = TCP_REPAIR_ON;
 	err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair));
-	if (CHECK_FAIL(err))
+	if (!ASSERT_OK(err, "setsockopt(TCP_REPAIR)"))
 		goto error;
 
 	err = connect(s, (struct sockaddr *)&addr, len);
-	if (CHECK_FAIL(err))
+	if (!ASSERT_OK(err, "connect"))
 		goto error;
 
 	repair = TCP_REPAIR_OFF_NO_WP;
 	err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair));
-	if (CHECK_FAIL(err))
+	if (!ASSERT_OK(err, "setsockopt(TCP_REPAIR)"))
 		goto error;
 
 	return s;
@@ -54,7 +54,7 @@ error:
 static void compare_cookies(struct bpf_map *src, struct bpf_map *dst)
 {
 	__u32 i, max_entries = bpf_map__max_entries(src);
-	int err, duration = 0, src_fd, dst_fd;
+	int err, src_fd, dst_fd;
 
 	src_fd = bpf_map__fd(src);
 	dst_fd = bpf_map__fd(dst);
@@ -65,20 +65,18 @@ static void compare_cookies(struct bpf_map *src, struct bpf_map *dst)
 		err = bpf_map_lookup_elem(src_fd, &i, &src_cookie);
 		if (err && errno == ENOENT) {
 			err = bpf_map_lookup_elem(dst_fd, &i, &dst_cookie);
-			CHECK(!err, "map_lookup_elem(dst)", "element %u not deleted\n", i);
-			CHECK(err && errno != ENOENT, "map_lookup_elem(dst)", "%s\n",
-			      strerror(errno));
+			ASSERT_ERR(err, "map_lookup_elem(dst)");
+			ASSERT_EQ(errno, ENOENT, "map_lookup_elem(dst)");
 			continue;
 		}
-		if (CHECK(err, "lookup_elem(src)", "%s\n", strerror(errno)))
+		if (!ASSERT_OK(err, "lookup_elem(src)"))
 			continue;
 
 		err = bpf_map_lookup_elem(dst_fd, &i, &dst_cookie);
-		if (CHECK(err, "lookup_elem(dst)", "%s\n", strerror(errno)))
+		if (!ASSERT_OK(err, "lookup_elem(dst)"))
 			continue;
 
-		CHECK(dst_cookie != src_cookie, "cookie mismatch",
-		      "%llu != %llu (pos %u)\n", dst_cookie, src_cookie, i);
+		ASSERT_EQ(dst_cookie, src_cookie, "cookie mismatch");
 	}
 }
 
@@ -89,20 +87,16 @@ static void test_sockmap_create_update_free(enum bpf_map_type map_type)
 	int s, map, err;
 
 	s = connected_socket_v4();
-	if (CHECK_FAIL(s < 0))
+	if (!ASSERT_GE(s, 0, "connected_socket_v4"))
 		return;
 
 	map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL);
-	if (CHECK_FAIL(map < 0)) {
-		perror("bpf_cmap_create");
+	if (!ASSERT_GE(map, 0, "bpf_map_create"))
 		goto out;
-	}
 
 	err = bpf_map_update_elem(map, &zero, &s, BPF_NOEXIST);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_map_update");
+	if (!ASSERT_OK(err, "bpf_map_update"))
 		goto out;
-	}
 
 out:
 	close(map);
@@ -115,32 +109,26 @@ static void test_skmsg_helpers(enum bpf_map_type map_type)
 	int err, map, verdict;
 
 	skel = test_skmsg_load_helpers__open_and_load();
-	if (CHECK_FAIL(!skel)) {
-		perror("test_skmsg_load_helpers__open_and_load");
+	if (!ASSERT_OK_PTR(skel, "test_skmsg_load_helpers__open_and_load"))
 		return;
-	}
 
 	verdict = bpf_program__fd(skel->progs.prog_msg_verdict);
 	map = bpf_map__fd(skel->maps.sock_map);
 
 	err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_prog_attach");
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
 		goto out;
-	}
 
 	err = bpf_prog_detach2(verdict, map, BPF_SK_MSG_VERDICT);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_prog_detach2");
+	if (!ASSERT_OK(err, "bpf_prog_detach2"))
 		goto out;
-	}
 out:
 	test_skmsg_load_helpers__destroy(skel);
 }
 
 static void test_sockmap_update(enum bpf_map_type map_type)
 {
-	int err, prog, src, duration = 0;
+	int err, prog, src;
 	struct test_sockmap_update *skel;
 	struct bpf_map *dst_map;
 	const __u32 zero = 0;
@@ -153,11 +141,11 @@ static void test_sockmap_update(enum bpf_map_type map_type)
 	__s64 sk;
 
 	sk = connected_socket_v4();
-	if (CHECK(sk == -1, "connected_socket_v4", "cannot connect\n"))
+	if (!ASSERT_NEQ(sk, -1, "connected_socket_v4"))
 		return;
 
 	skel = test_sockmap_update__open_and_load();
-	if (CHECK(!skel, "open_and_load", "cannot load skeleton\n"))
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
 		goto close_sk;
 
 	prog = bpf_program__fd(skel->progs.copy_sock_map);
@@ -168,7 +156,7 @@ static void test_sockmap_update(enum bpf_map_type map_type)
 		dst_map = skel->maps.dst_sock_hash;
 
 	err = bpf_map_update_elem(src, &zero, &sk, BPF_NOEXIST);
-	if (CHECK(err, "update_elem(src)", "errno=%u\n", errno))
+	if (!ASSERT_OK(err, "update_elem(src)"))
 		goto out;
 
 	err = bpf_prog_test_run_opts(prog, &topts);
@@ -188,17 +176,16 @@ close_sk:
 static void test_sockmap_invalid_update(void)
 {
 	struct test_sockmap_invalid_update *skel;
-	int duration = 0;
 
 	skel = test_sockmap_invalid_update__open_and_load();
-	if (CHECK(skel, "open_and_load", "verifier accepted map_update\n"))
+	if (!ASSERT_NULL(skel, "open_and_load"))
 		test_sockmap_invalid_update__destroy(skel);
 }
 
 static void test_sockmap_copy(enum bpf_map_type map_type)
 {
 	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
-	int err, len, src_fd, iter_fd, duration = 0;
+	int err, len, src_fd, iter_fd;
 	union bpf_iter_link_info linfo = {};
 	__u32 i, num_sockets, num_elems;
 	struct bpf_iter_sockmap *skel;
@@ -208,7 +195,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 	char buf[64];
 
 	skel = bpf_iter_sockmap__open_and_load();
-	if (CHECK(!skel, "bpf_iter_sockmap__open_and_load", "skeleton open_and_load failed\n"))
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_sockmap__open_and_load"))
 		return;
 
 	if (map_type == BPF_MAP_TYPE_SOCKMAP) {
@@ -222,7 +209,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 	}
 
 	sock_fd = calloc(num_sockets, sizeof(*sock_fd));
-	if (CHECK(!sock_fd, "calloc(sock_fd)", "failed to allocate\n"))
+	if (!ASSERT_OK_PTR(sock_fd, "calloc(sock_fd)"))
 		goto out;
 
 	for (i = 0; i < num_sockets; i++)
@@ -232,11 +219,11 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 
 	for (i = 0; i < num_sockets; i++) {
 		sock_fd[i] = connected_socket_v4();
-		if (CHECK(sock_fd[i] == -1, "connected_socket_v4", "cannot connect\n"))
+		if (!ASSERT_NEQ(sock_fd[i], -1, "connected_socket_v4"))
 			goto out;
 
 		err = bpf_map_update_elem(src_fd, &i, &sock_fd[i], BPF_NOEXIST);
-		if (CHECK(err, "map_update", "failed: %s\n", strerror(errno)))
+		if (!ASSERT_OK(err, "map_update"))
 			goto out;
 	}
 
@@ -248,22 +235,20 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
 		goto out;
 
 	iter_fd = bpf_iter_create(bpf_link__fd(link));
-	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+	if (!ASSERT_GE(iter_fd, 0, "create_iter"))
 		goto free_link;
 
 	/* do some tests */
 	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
 		;
-	if (CHECK(len < 0, "read", "failed: %s\n", strerror(errno)))
+	if (!ASSERT_GE(len, 0, "read"))
 		goto close_iter;
 
 	/* test results */
-	if (CHECK(skel->bss->elems != num_elems, "elems", "got %u expected %u\n",
-		  skel->bss->elems, num_elems))
+	if (!ASSERT_EQ(skel->bss->elems, num_elems, "elems"))
 		goto close_iter;
 
-	if (CHECK(skel->bss->socks != num_sockets, "socks", "got %u expected %u\n",
-		  skel->bss->socks, num_sockets))
+	if (!ASSERT_EQ(skel->bss->socks, num_sockets, "socks"))
 		goto close_iter;
 
 	compare_cookies(src, skel->maps.dst);
@@ -288,28 +273,22 @@ static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first,
 	int err, map, verdict;
 
 	skel = test_sockmap_skb_verdict_attach__open_and_load();
-	if (CHECK_FAIL(!skel)) {
-		perror("test_sockmap_skb_verdict_attach__open_and_load");
+	if (!ASSERT_OK_PTR(skel, "open_and_load"))
 		return;
-	}
 
 	verdict = bpf_program__fd(skel->progs.prog_skb_verdict);
 	map = bpf_map__fd(skel->maps.sock_map);
 
 	err = bpf_prog_attach(verdict, map, first, 0);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_prog_attach");
+	if (!ASSERT_OK(err, "bpf_prog_attach"))
 		goto out;
-	}
 
 	err = bpf_prog_attach(verdict, map, second, 0);
 	ASSERT_EQ(err, -EBUSY, "prog_attach_fail");
 
 	err = bpf_prog_detach2(verdict, map, first);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_prog_detach2");
+	if (!ASSERT_OK(err, "bpf_prog_detach2"))
 		goto out;
-	}
 out:
 	test_sockmap_skb_verdict_attach__destroy(skel);
 }
-- 
cgit v1.2.3


From d155fcb3fff16410ccd7583f9a16c15ddffeca1e Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:02 +0800
Subject: selftests/bpf: Convert sockmap_ktls test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-3-git-send-email-wangyufen@huawei.com
---
 .../selftests/bpf/prog_tests/sockmap_ktls.c        | 39 ++++++----------------
 1 file changed, 10 insertions(+), 29 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
index e172d89e92e1..2d0796314862 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
@@ -15,16 +15,12 @@ static int tcp_server(int family)
 	int err, s;
 
 	s = socket(family, SOCK_STREAM, 0);
-	if (CHECK_FAIL(s == -1)) {
-		perror("socket");
+	if (!ASSERT_GE(s, 0, "socket"))
 		return -1;
-	}
 
 	err = listen(s, SOMAXCONN);
-	if (CHECK_FAIL(err)) {
-		perror("listen");
+	if (!ASSERT_OK(err, "listen"))
 		return -1;
-	}
 
 	return s;
 }
@@ -48,44 +44,31 @@ static void test_sockmap_ktls_disconnect_after_delete(int family, int map)
 		return;
 
 	err = getsockname(srv, (struct sockaddr *)&addr, &len);
-	if (CHECK_FAIL(err)) {
-		perror("getsockopt");
+	if (!ASSERT_OK(err, "getsockopt"))
 		goto close_srv;
-	}
 
 	cli = socket(family, SOCK_STREAM, 0);
-	if (CHECK_FAIL(cli == -1)) {
-		perror("socket");
+	if (!ASSERT_GE(cli, 0, "socket"))
 		goto close_srv;
-	}
 
 	err = connect(cli, (struct sockaddr *)&addr, len);
-	if (CHECK_FAIL(err)) {
-		perror("connect");
+	if (!ASSERT_OK(err, "connect"))
 		goto close_cli;
-	}
 
 	err = bpf_map_update_elem(map, &zero, &cli, 0);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_map_update_elem");
+	if (!ASSERT_OK(err, "bpf_map_update_elem"))
 		goto close_cli;
-	}
 
 	err = setsockopt(cli, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls"));
-	if (CHECK_FAIL(err)) {
-		perror("setsockopt(TCP_ULP)");
+	if (!ASSERT_OK(err, "setsockopt(TCP_ULP)"))
 		goto close_cli;
-	}
 
 	err = bpf_map_delete_elem(map, &zero);
-	if (CHECK_FAIL(err)) {
-		perror("bpf_map_delete_elem");
+	if (!ASSERT_OK(err, "bpf_map_delete_elem"))
 		goto close_cli;
-	}
 
 	err = disconnect(cli);
-	if (CHECK_FAIL(err))
-		perror("disconnect");
+	ASSERT_OK(err, "disconnect");
 
 close_cli:
 	close(cli);
@@ -168,10 +151,8 @@ static void run_tests(int family, enum bpf_map_type map_type)
 	int map;
 
 	map = bpf_map_create(map_type, NULL, sizeof(int), sizeof(int), 1, NULL);
-	if (CHECK_FAIL(map < 0)) {
-		perror("bpf_map_create");
+	if (!ASSERT_GE(map, 0, "bpf_map_create"))
 		return;
-	}
 
 	if (test__start_subtest(fmt_test_name("disconnect_after_delete", family, map_type)))
 		test_sockmap_ktls_disconnect_after_delete(family, map);
-- 
cgit v1.2.3


From 099763e7da0beec120827547f227c123e9d4a155 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:03 +0800
Subject: selftests/bpf: Convert sockopt test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-4-git-send-email-wangyufen@huawei.com
---
 tools/testing/selftests/bpf/prog_tests/sockopt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c
index cd09f4c7dd92..aa4debf62fc6 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c
@@ -972,12 +972,12 @@ void test_sockopt(void)
 	int cgroup_fd, i;
 
 	cgroup_fd = test__join_cgroup("/sockopt");
-	if (CHECK_FAIL(cgroup_fd < 0))
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		test__start_subtest(tests[i].descr);
-		CHECK_FAIL(run_test(cgroup_fd, &tests[i]));
+		ASSERT_OK(run_test(cgroup_fd, &tests[i]), tests[i].descr);
 	}
 
 	close(cgroup_fd);
-- 
cgit v1.2.3


From 675bc8abe16d9ce97970e8a781e9e72bb8d47ca2 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:04 +0800
Subject: selftests/bpf: Convert sockopt_inherit test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-5-git-send-email-wangyufen@huawei.com
---
 .../selftests/bpf/prog_tests/sockopt_inherit.c     | 30 ++++++++++------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
index c5cb6e8374b6..60c17a8e2789 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -76,20 +76,16 @@ static void *server_thread(void *arg)
 	pthread_cond_signal(&server_started);
 	pthread_mutex_unlock(&server_started_mtx);
 
-	if (CHECK_FAIL(err < 0)) {
-		perror("Failed to listed on socket");
+	if (!ASSERT_GE(err, 0, "listed on socket"))
 		return NULL;
-	}
 
 	err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
 	err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
 	err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
 
 	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
-	if (CHECK_FAIL(client_fd < 0)) {
-		perror("Failed to accept client");
+	if (!ASSERT_GE(client_fd, 0, "accept client"))
 		return NULL;
-	}
 
 	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
 	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
@@ -183,20 +179,20 @@ static void run_test(int cgroup_fd)
 		goto close_bpf_object;
 
 	err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt", "_getsockopt");
-	if (CHECK_FAIL(err))
+	if (!ASSERT_OK(err, "prog_attach _getsockopt"))
 		goto close_bpf_object;
 
 	err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt", "_setsockopt");
-	if (CHECK_FAIL(err))
+	if (!ASSERT_OK(err, "prog_attach _setsockopt"))
 		goto close_bpf_object;
 
 	server_fd = start_server();
-	if (CHECK_FAIL(server_fd < 0))
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
 		goto close_bpf_object;
 
 	pthread_mutex_lock(&server_started_mtx);
-	if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread,
-				      (void *)&server_fd))) {
+	if (!ASSERT_OK(pthread_create(&tid, NULL, server_thread,
+				      (void *)&server_fd), "pthread_create")) {
 		pthread_mutex_unlock(&server_started_mtx);
 		goto close_server_fd;
 	}
@@ -204,17 +200,17 @@ static void run_test(int cgroup_fd)
 	pthread_mutex_unlock(&server_started_mtx);
 
 	client_fd = connect_to_server(server_fd);
-	if (CHECK_FAIL(client_fd < 0))
+	if (!ASSERT_GE(client_fd, 0, "connect_to_server"))
 		goto close_server_fd;
 
-	CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0));
-	CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0));
-	CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0));
+	ASSERT_OK(verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0), "verify_sockopt1");
+	ASSERT_OK(verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0), "verify_sockopt2");
+	ASSERT_OK(verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0), "verify_sockopt ener");
 
 	pthread_join(tid, &server_err);
 
 	err = (int)(long)server_err;
-	CHECK_FAIL(err);
+	ASSERT_OK(err, "pthread_join retval");
 
 	close(client_fd);
 
@@ -229,7 +225,7 @@ void test_sockopt_inherit(void)
 	int cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/sockopt_inherit");
-	if (CHECK_FAIL(cgroup_fd < 0))
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup"))
 		return;
 
 	run_test(cgroup_fd);
-- 
cgit v1.2.3


From a605a6bbccceebbb68ab9f8ff2b27e2faa38525d Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:05 +0800
Subject: selftests/bpf: Convert sockopt_multi test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-6-git-send-email-wangyufen@huawei.com
---
 tools/testing/selftests/bpf/prog_tests/sockopt_multi.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
index 28d592dc54a7..7f5659349011 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
@@ -303,11 +303,11 @@ void test_sockopt_multi(void)
 	int err = -1;
 
 	cg_parent = test__join_cgroup("/parent");
-	if (CHECK_FAIL(cg_parent < 0))
+	if (!ASSERT_GE(cg_parent, 0, "join_cgroup /parent"))
 		goto out;
 
 	cg_child = test__join_cgroup("/parent/child");
-	if (CHECK_FAIL(cg_child < 0))
+	if (!ASSERT_GE(cg_child, 0, "join_cgroup /parent/child"))
 		goto out;
 
 	obj = bpf_object__open_file("sockopt_multi.bpf.o", NULL);
@@ -319,11 +319,11 @@ void test_sockopt_multi(void)
 		goto out;
 
 	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
-	if (CHECK_FAIL(sock_fd < 0))
+	if (!ASSERT_GE(sock_fd, 0, "socket"))
 		goto out;
 
-	CHECK_FAIL(run_getsockopt_test(obj, cg_parent, cg_child, sock_fd));
-	CHECK_FAIL(run_setsockopt_test(obj, cg_parent, cg_child, sock_fd));
+	ASSERT_OK(run_getsockopt_test(obj, cg_parent, cg_child, sock_fd), "getsockopt_test");
+	ASSERT_OK(run_setsockopt_test(obj, cg_parent, cg_child, sock_fd), "setsockopt_test");
 
 out:
 	close(sock_fd);
-- 
cgit v1.2.3


From f19708dfa0bf5a016f27e92ef4d3514788f6dc8b Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:06 +0800
Subject: selftests/bpf: Convert sockopt_sk test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-7-git-send-email-wangyufen@huawei.com
---
 tools/testing/selftests/bpf/prog_tests/sockopt_sk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
index 30a99d2ed5c6..60d952719d27 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -223,7 +223,7 @@ void test_sockopt_sk(void)
 	int cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/sockopt_sk");
-	if (CHECK_FAIL(cgroup_fd < 0))
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk"))
 		return;
 
 	run_test(cgroup_fd);
-- 
cgit v1.2.3


From a0a17296713aea7b3cbc94662c6ffb53a79a3f2c Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:07 +0800
Subject: selftests/bpf: Convert tcp_estats test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-8-git-send-email-wangyufen@huawei.com
---
 tools/testing/selftests/bpf/prog_tests/tcp_estats.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
index 032dbfb26256..e070bca2b764 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
@@ -6,11 +6,9 @@ void test_tcp_estats(void)
 	const char *file = "./test_tcp_estats.bpf.o";
 	int err, prog_fd;
 	struct bpf_object *obj;
-	__u32 duration = 0;
 
 	err = bpf_prog_test_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
-	CHECK(err, "", "err %d errno %d\n", err, errno);
-	if (err)
+	if (!ASSERT_OK(err, ""))
 		return;
 
 	bpf_object__close(obj);
-- 
cgit v1.2.3


From 3082f8cd4ba32091be82c19c357ddfd300c5a433 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:08 +0800
Subject: selftests/bpf: Convert tcp_hdr_options test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-9-git-send-email-wangyufen@huawei.com
---
 .../selftests/bpf/prog_tests/tcp_hdr_options.c     | 80 ++++++++--------------
 1 file changed, 28 insertions(+), 52 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
index f24436d33cd6..617bbce6ef8f 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
@@ -42,13 +42,10 @@ struct sk_fds {
 
 static int create_netns(void)
 {
-	if (CHECK(unshare(CLONE_NEWNET), "create netns",
-		  "unshare(CLONE_NEWNET): %s (%d)",
-		  strerror(errno), errno))
+	if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
 		return -1;
 
-	if (CHECK(system("ip link set dev lo up"), "run ip cmd",
-		  "failed to bring lo link up\n"))
+	if (!ASSERT_OK(system("ip link set dev lo up"), "run ip cmd"))
 		return -1;
 
 	return 0;
@@ -80,16 +77,12 @@ static int sk_fds_shutdown(struct sk_fds *sk_fds)
 
 	shutdown(sk_fds->active_fd, SHUT_WR);
 	ret = read(sk_fds->passive_fd, &abyte, sizeof(abyte));
-	if (CHECK(ret != 0, "read-after-shutdown(passive_fd):",
-		  "ret:%d %s (%d)\n",
-		  ret, strerror(errno), errno))
+	if (!ASSERT_EQ(ret, 0, "read-after-shutdown(passive_fd):"))
 		return -1;
 
 	shutdown(sk_fds->passive_fd, SHUT_WR);
 	ret = read(sk_fds->active_fd, &abyte, sizeof(abyte));
-	if (CHECK(ret != 0, "read-after-shutdown(active_fd):",
-		  "ret:%d %s (%d)\n",
-		  ret, strerror(errno), errno))
+	if (!ASSERT_EQ(ret, 0, "read-after-shutdown(active_fd):"))
 		return -1;
 
 	return 0;
@@ -102,8 +95,7 @@ static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open)
 	socklen_t len;
 
 	sk_fds->srv_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0);
-	if (CHECK(sk_fds->srv_fd == -1, "start_server", "%s (%d)\n",
-		  strerror(errno), errno))
+	if (!ASSERT_NEQ(sk_fds->srv_fd, -1, "start_server"))
 		goto error;
 
 	if (fast_open)
@@ -112,28 +104,25 @@ static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open)
 	else
 		sk_fds->active_fd = connect_to_fd(sk_fds->srv_fd, 0);
 
-	if (CHECK_FAIL(sk_fds->active_fd == -1)) {
+	if (!ASSERT_NEQ(sk_fds->active_fd, -1, "")) {
 		close(sk_fds->srv_fd);
 		goto error;
 	}
 
 	len = sizeof(addr6);
-	if (CHECK(getsockname(sk_fds->srv_fd, (struct sockaddr *)&addr6,
-			      &len), "getsockname(srv_fd)", "%s (%d)\n",
-		  strerror(errno), errno))
+	if (!ASSERT_OK(getsockname(sk_fds->srv_fd, (struct sockaddr *)&addr6,
+				   &len), "getsockname(srv_fd)"))
 		goto error_close;
 	sk_fds->passive_lport = ntohs(addr6.sin6_port);
 
 	len = sizeof(addr6);
-	if (CHECK(getsockname(sk_fds->active_fd, (struct sockaddr *)&addr6,
-			      &len), "getsockname(active_fd)", "%s (%d)\n",
-		  strerror(errno), errno))
+	if (!ASSERT_OK(getsockname(sk_fds->active_fd, (struct sockaddr *)&addr6,
+				   &len), "getsockname(active_fd)"))
 		goto error_close;
 	sk_fds->active_lport = ntohs(addr6.sin6_port);
 
 	sk_fds->passive_fd = accept(sk_fds->srv_fd, NULL, 0);
-	if (CHECK(sk_fds->passive_fd == -1, "accept(srv_fd)", "%s (%d)\n",
-		  strerror(errno), errno))
+	if (!ASSERT_NEQ(sk_fds->passive_fd, -1, "accept(srv_fd)"))
 		goto error_close;
 
 	if (fast_open) {
@@ -141,8 +130,7 @@ static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open)
 		int ret;
 
 		ret = read(sk_fds->passive_fd, bytes_in, sizeof(bytes_in));
-		if (CHECK(ret != sizeof(fast), "read fastopen syn data",
-			  "expected=%lu actual=%d\n", sizeof(fast), ret)) {
+		if (!ASSERT_EQ(ret, sizeof(fast), "read fastopen syn data")) {
 			close(sk_fds->passive_fd);
 			goto error_close;
 		}
@@ -163,8 +151,7 @@ static int check_hdr_opt(const struct bpf_test_option *exp,
 			 const struct bpf_test_option *act,
 			 const char *hdr_desc)
 {
-	if (CHECK(memcmp(exp, act, sizeof(*exp)),
-		  "expected-vs-actual", "unexpected %s\n", hdr_desc)) {
+	if (!ASSERT_OK(memcmp(exp, act, sizeof(*exp)), hdr_desc)) {
 		print_option(exp, "expected: ");
 		print_option(act, "  actual: ");
 		return -1;
@@ -178,13 +165,11 @@ static int check_hdr_stg(const struct hdr_stg *exp, int fd,
 {
 	struct hdr_stg act;
 
-	if (CHECK(bpf_map_lookup_elem(hdr_stg_map_fd, &fd, &act),
-		  "map_lookup(hdr_stg_map_fd)", "%s %s (%d)\n",
-		  stg_desc, strerror(errno), errno))
+	if (!ASSERT_OK(bpf_map_lookup_elem(hdr_stg_map_fd, &fd, &act),
+		  "map_lookup(hdr_stg_map_fd)"))
 		return -1;
 
-	if (CHECK(memcmp(exp, &act, sizeof(*exp)),
-		  "expected-vs-actual", "unexpected %s\n", stg_desc)) {
+	if (!ASSERT_OK(memcmp(exp, &act, sizeof(*exp)), stg_desc)) {
 		print_hdr_stg(exp, "expected: ");
 		print_hdr_stg(&act, "  actual: ");
 		return -1;
@@ -228,9 +213,8 @@ static void check_hdr_and_close_fds(struct sk_fds *sk_fds)
 	if (sk_fds_shutdown(sk_fds))
 		goto check_linum;
 
-	if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags,
-		  "Unexpected inherit_cb_flags", "0x%x != 0x%x\n",
-		  skel->bss->inherit_cb_flags, expected_inherit_cb_flags))
+	if (!ASSERT_EQ(expected_inherit_cb_flags, skel->bss->inherit_cb_flags,
+		       "inherit_cb_flags"))
 		goto check_linum;
 
 	if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd,
@@ -257,7 +241,7 @@ static void check_hdr_and_close_fds(struct sk_fds *sk_fds)
 		      "active_fin_in");
 
 check_linum:
-	CHECK_FAIL(check_error_linum(sk_fds));
+	ASSERT_FALSE(check_error_linum(sk_fds), "check_error_linum");
 	sk_fds_close(sk_fds);
 }
 
@@ -497,26 +481,20 @@ static void misc(void)
 		/* MSG_EOR to ensure skb will not be combined */
 		ret = send(sk_fds.active_fd, send_msg, sizeof(send_msg),
 			   MSG_EOR);
-		if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n",
-			  ret))
+		if (!ASSERT_EQ(ret, sizeof(send_msg), "send(msg)"))
 			goto check_linum;
 
 		ret = read(sk_fds.passive_fd, recv_msg, sizeof(recv_msg));
-		if (CHECK(ret != sizeof(send_msg), "read(msg)", "ret:%d\n",
-			  ret))
+		if (ASSERT_EQ(ret, sizeof(send_msg), "read(msg)"))
 			goto check_linum;
 	}
 
 	if (sk_fds_shutdown(&sk_fds))
 		goto check_linum;
 
-	CHECK(misc_skel->bss->nr_syn != 1, "unexpected nr_syn",
-	      "expected (1) != actual (%u)\n",
-		misc_skel->bss->nr_syn);
+	ASSERT_EQ(misc_skel->bss->nr_syn, 1, "unexpected nr_syn");
 
-	CHECK(misc_skel->bss->nr_data != nr_data, "unexpected nr_data",
-	      "expected (%u) != actual (%u)\n",
-	      nr_data, misc_skel->bss->nr_data);
+	ASSERT_EQ(misc_skel->bss->nr_data, nr_data, "unexpected nr_data");
 
 	/* The last ACK may have been delayed, so it is either 1 or 2. */
 	CHECK(misc_skel->bss->nr_pure_ack != 1 &&
@@ -525,12 +503,10 @@ static void misc(void)
 	      "expected (1 or 2) != actual (%u)\n",
 		misc_skel->bss->nr_pure_ack);
 
-	CHECK(misc_skel->bss->nr_fin != 1, "unexpected nr_fin",
-	      "expected (1) != actual (%u)\n",
-	      misc_skel->bss->nr_fin);
+	ASSERT_EQ(misc_skel->bss->nr_fin, 1, "unexpected nr_fin");
 
 check_linum:
-	CHECK_FAIL(check_error_linum(&sk_fds));
+	ASSERT_FALSE(check_error_linum(&sk_fds), "check_error_linum");
 	sk_fds_close(&sk_fds);
 	bpf_link__destroy(link);
 }
@@ -555,15 +531,15 @@ void test_tcp_hdr_options(void)
 	int i;
 
 	skel = test_tcp_hdr_options__open_and_load();
-	if (CHECK(!skel, "open and load skel", "failed"))
+	if (!ASSERT_OK_PTR(skel, "open and load skel"))
 		return;
 
 	misc_skel = test_misc_tcp_hdr_options__open_and_load();
-	if (CHECK(!misc_skel, "open and load misc test skel", "failed"))
+	if (!ASSERT_OK_PTR(misc_skel, "open and load misc test skel"))
 		goto skel_destroy;
 
 	cg_fd = test__join_cgroup(CG_NAME);
-	if (CHECK_FAIL(cg_fd < 0))
+	if (ASSERT_GE(cg_fd, 0, "join_cgroup"))
 		goto skel_destroy;
 
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
-- 
cgit v1.2.3


From 8dda32ac58b622f4c1ec0edd4f6b12f84170fe01 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:09 +0800
Subject: selftests/bpf: Convert tcp_rtt test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-10-git-send-email-wangyufen@huawei.com
---
 tools/testing/selftests/bpf/prog_tests/tcp_rtt.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
index 96ff2c20af81..8fe84da1b9b4 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
@@ -16,8 +16,7 @@ static void send_byte(int fd)
 {
 	char b = 0x55;
 
-	if (CHECK_FAIL(write(fd, &b, sizeof(b)) != 1))
-		perror("Failed to send single byte");
+	ASSERT_EQ(write(fd, &b, sizeof(b)), 1, "send single byte");
 }
 
 static int wait_for_ack(int fd, int retries)
@@ -51,10 +50,8 @@ static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
 	int err = 0;
 	struct tcp_rtt_storage val;
 
-	if (CHECK_FAIL(bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)) {
-		perror("Failed to read socket storage");
+	if (!ASSERT_GE(bpf_map_lookup_elem(map_fd, &client_fd, &val), 0, "read socket storage"))
 		return -1;
-	}
 
 	if (val.invoked != invoked) {
 		log_err("%s: unexpected bpf_tcp_sock.invoked %d != %d",
@@ -151,14 +148,14 @@ void test_tcp_rtt(void)
 	int server_fd, cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/tcp_rtt");
-	if (CHECK_FAIL(cgroup_fd < 0))
+	if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /tcp_rtt"))
 		return;
 
 	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
-	if (CHECK_FAIL(server_fd < 0))
+	if (!ASSERT_GE(server_fd, 0, "start_server"))
 		goto close_cgroup_fd;
 
-	CHECK_FAIL(run_test(cgroup_fd, server_fd));
+	ASSERT_OK(run_test(cgroup_fd, server_fd), "run_test");
 
 	close(server_fd);
 
-- 
cgit v1.2.3


From 9d0b05bdfbea25693cdd63c29aa12b982307d81e Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:10 +0800
Subject: selftests/bpf: Convert tcpbpf_user test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-11-git-send-email-wangyufen@huawei.com
---
 .../testing/selftests/bpf/prog_tests/tcpbpf_user.c | 32 ++++++++--------------
 1 file changed, 12 insertions(+), 20 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
index 87923d2865b7..7e8fe1bad03f 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
@@ -8,8 +8,6 @@
 #define LO_ADDR6 "::1"
 #define CG_NAME "/tcpbpf-user-test"
 
-static __u32 duration;
-
 static void verify_result(struct tcpbpf_globals *result)
 {
 	__u32 expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) |
@@ -22,9 +20,7 @@ static void verify_result(struct tcpbpf_globals *result)
 				 (1 << BPF_SOCK_OPS_TCP_LISTEN_CB));
 
 	/* check global map */
-	CHECK(expected_events != result->event_map, "event_map",
-	      "unexpected event_map: actual 0x%08x != expected 0x%08x\n",
-	      result->event_map, expected_events);
+	ASSERT_EQ(expected_events, result->event_map, "event_map");
 
 	ASSERT_EQ(result->bytes_received, 501, "bytes_received");
 	ASSERT_EQ(result->bytes_acked, 1002, "bytes_acked");
@@ -56,18 +52,15 @@ static void run_test(struct tcpbpf_globals *result)
 	int i, rv;
 
 	listen_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0);
-	if (CHECK(listen_fd == -1, "start_server", "listen_fd:%d errno:%d\n",
-		  listen_fd, errno))
+	if (!ASSERT_NEQ(listen_fd, -1, "start_server"))
 		goto done;
 
 	cli_fd = connect_to_fd(listen_fd, 0);
-	if (CHECK(cli_fd == -1, "connect_to_fd(listen_fd)",
-		  "cli_fd:%d errno:%d\n", cli_fd, errno))
+	if (!ASSERT_NEQ(cli_fd, -1, "connect_to_fd(listen_fd)"))
 		goto done;
 
 	accept_fd = accept(listen_fd, NULL, NULL);
-	if (CHECK(accept_fd == -1, "accept(listen_fd)",
-		  "accept_fd:%d errno:%d\n", accept_fd, errno))
+	if (!ASSERT_NEQ(accept_fd, -1, "accept(listen_fd)"))
 		goto done;
 
 	/* Send 1000B of '+'s from cli_fd -> accept_fd */
@@ -75,11 +68,11 @@ static void run_test(struct tcpbpf_globals *result)
 		buf[i] = '+';
 
 	rv = send(cli_fd, buf, 1000, 0);
-	if (CHECK(rv != 1000, "send(cli_fd)", "rv:%d errno:%d\n", rv, errno))
+	if (!ASSERT_EQ(rv, 1000, "send(cli_fd)"))
 		goto done;
 
 	rv = recv(accept_fd, buf, 1000, 0);
-	if (CHECK(rv != 1000, "recv(accept_fd)", "rv:%d errno:%d\n", rv, errno))
+	if (!ASSERT_EQ(rv, 1000, "recv(accept_fd)"))
 		goto done;
 
 	/* Send 500B of '.'s from accept_fd ->cli_fd */
@@ -87,11 +80,11 @@ static void run_test(struct tcpbpf_globals *result)
 		buf[i] = '.';
 
 	rv = send(accept_fd, buf, 500, 0);
-	if (CHECK(rv != 500, "send(accept_fd)", "rv:%d errno:%d\n", rv, errno))
+	if (!ASSERT_EQ(rv, 500, "send(accept_fd)"))
 		goto done;
 
 	rv = recv(cli_fd, buf, 500, 0);
-	if (CHECK(rv != 500, "recv(cli_fd)", "rv:%d errno:%d\n", rv, errno))
+	if (!ASSERT_EQ(rv, 500, "recv(cli_fd)"))
 		goto done;
 
 	/*
@@ -100,12 +93,12 @@ static void run_test(struct tcpbpf_globals *result)
 	 */
 	shutdown(accept_fd, SHUT_WR);
 	err = recv(cli_fd, buf, 1, 0);
-	if (CHECK(err, "recv(cli_fd) for fin", "err:%d errno:%d\n", err, errno))
+	if (!ASSERT_OK(err, "recv(cli_fd) for fin"))
 		goto done;
 
 	shutdown(cli_fd, SHUT_WR);
 	err = recv(accept_fd, buf, 1, 0);
-	CHECK(err, "recv(accept_fd) for fin", "err:%d errno:%d\n", err, errno);
+	ASSERT_OK(err, "recv(accept_fd) for fin");
 done:
 	if (accept_fd != -1)
 		close(accept_fd);
@@ -124,12 +117,11 @@ void test_tcpbpf_user(void)
 	int cg_fd = -1;
 
 	skel = test_tcpbpf_kern__open_and_load();
-	if (CHECK(!skel, "open and load skel", "failed"))
+	if (!ASSERT_OK_PTR(skel, "open and load skel"))
 		return;
 
 	cg_fd = test__join_cgroup(CG_NAME);
-	if (CHECK(cg_fd < 0, "test__join_cgroup(" CG_NAME ")",
-		  "cg_fd:%d errno:%d", cg_fd, errno))
+	if (!ASSERT_GE(cg_fd, 0, "test__join_cgroup(" CG_NAME ")"))
 		goto err;
 
 	skel->links.bpf_testcb = bpf_program__attach_cgroup(skel->progs.bpf_testcb, cg_fd);
-- 
cgit v1.2.3


From 1fddca3d36d1dc4a19a8060d20de1b77edfe63e0 Mon Sep 17 00:00:00 2001
From: Wang Yufen <wangyufen@huawei.com>
Date: Mon, 26 Sep 2022 13:12:11 +0800
Subject: selftests/bpf: Convert udp_limit test to ASSERT_* macros

Convert the selftest to use the preferred ASSERT_* macros instead of the
deprecated CHECK().

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/1664169131-32405-12-git-send-email-wangyufen@huawei.com
---
 tools/testing/selftests/bpf/prog_tests/udp_limit.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/udp_limit.c b/tools/testing/selftests/bpf/prog_tests/udp_limit.c
index 56c9d6bd38a3..2643d896ddae 100644
--- a/tools/testing/selftests/bpf/prog_tests/udp_limit.c
+++ b/tools/testing/selftests/bpf/prog_tests/udp_limit.c
@@ -5,8 +5,6 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 
-static int duration;
-
 void test_udp_limit(void)
 {
 	struct udp_limit *skel;
@@ -14,11 +12,11 @@ void test_udp_limit(void)
 	int cgroup_fd;
 
 	cgroup_fd = test__join_cgroup("/udp_limit");
-	if (CHECK(cgroup_fd < 0, "cg-join", "errno %d", errno))
+	if (!ASSERT_GE(cgroup_fd, 0, "cg-join"))
 		return;
 
 	skel = udp_limit__open_and_load();
-	if (CHECK(!skel, "skel-load", "errno %d", errno))
+	if (!ASSERT_OK_PTR(skel, "skel-load"))
 		goto close_cgroup_fd;
 
 	skel->links.sock = bpf_program__attach_cgroup(skel->progs.sock, cgroup_fd);
@@ -32,11 +30,11 @@ void test_udp_limit(void)
 	 * verify that.
 	 */
 	fd1 = socket(AF_INET, SOCK_DGRAM, 0);
-	if (CHECK(fd1 < 0, "fd1", "errno %d", errno))
+	if (!ASSERT_GE(fd1, 0, "socket(fd1)"))
 		goto close_skeleton;
 
 	fd2 = socket(AF_INET, SOCK_DGRAM, 0);
-	if (CHECK(fd2 >= 0, "fd2", "errno %d", errno))
+	if (!ASSERT_LT(fd2, 0, "socket(fd2)"))
 		goto close_skeleton;
 
 	/* We can reopen again after close. */
@@ -44,7 +42,7 @@ void test_udp_limit(void)
 	fd1 = -1;
 
 	fd1 = socket(AF_INET, SOCK_DGRAM, 0);
-	if (CHECK(fd1 < 0, "fd1-again", "errno %d", errno))
+	if (!ASSERT_GE(fd1, 0, "socket(fd1-again)"))
 		goto close_skeleton;
 
 	/* Make sure the program was invoked the expected
@@ -54,13 +52,11 @@ void test_udp_limit(void)
 	 * - close fd1          - BPF_CGROUP_INET_SOCK_RELEASE
 	 * - open fd1 again     - BPF_CGROUP_INET_SOCK_CREATE
 	 */
-	if (CHECK(skel->bss->invocations != 4, "bss-invocations",
-		  "invocations=%d", skel->bss->invocations))
+	if (!ASSERT_EQ(skel->bss->invocations, 4, "bss-invocations"))
 		goto close_skeleton;
 
 	/* We should still have a single socket in use */
-	if (CHECK(skel->bss->in_use != 1, "bss-in_use",
-		  "in_use=%d", skel->bss->in_use))
+	if (!ASSERT_EQ(skel->bss->in_use, 1, "bss-in_use"))
 		goto close_skeleton;
 
 close_skeleton:
-- 
cgit v1.2.3


From d59bd748db0a97a5d6a33b284b6c58b7f6f4f768 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 28 Sep 2022 09:49:28 -0600
Subject: io_uring/poll: disable level triggered poll

Stefan reports that there are issues with the level triggered
notification. Since we're late in the cycle, and it was introduced for
the 6.0 release, just disable it at prep time and we can bring this
back when Samba is happy with it.

Reported-by: Stefan Metzmacher <metze@samba.org>
Reviewed-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/poll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/poll.c b/io_uring/poll.c
index d5bad0bea6e4..0d9f49c575e0 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -857,7 +857,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (sqe->buf_index || sqe->off || sqe->addr)
 		return -EINVAL;
 	flags = READ_ONCE(sqe->len);
-	if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL))
+	if (flags & ~IORING_POLL_ADD_MULTI)
 		return -EINVAL;
 	if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP))
 		return -EINVAL;
-- 
cgit v1.2.3


From 1b24a132eba7a1c19475ba2510ec1c00af3ff914 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Mon, 5 Sep 2022 09:15:03 -0700
Subject: clk: iproc: Do not rely on node name for correct PLL setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After commit 31fd9b79dc58 ("ARM: dts: BCM5301X: update CRU block
description") a warning from clk-iproc-pll.c was generated due to a
duplicate PLL name as well as the console stopped working. Upon closer
inspection it became clear that iproc_pll_clk_setup() used the Device
Tree node unit name as an unique identifier as well as a parent name to
parent all clocks under the PLL.

BCM5301X was the first platform on which that got noticed because of the
DT node unit name renaming but the same assumptions hold true for any
user of the iproc_pll_clk_setup() function.

The first 'clock-output-names' property is always guaranteed to be
unique as well as providing the actual desired PLL clock name, so we
utilize that to register the PLL and as a parent name of all children
clock.

Fixes: 5fe225c105fd ("clk: iproc: add initial common clock support")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Acked-by: Rafał Miłecki <rafal@milecki.pl>
Link: https://lore.kernel.org/r/20220905161504.1526-1-f.fainelli@gmail.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/bcm/clk-iproc-pll.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/clk/bcm/clk-iproc-pll.c b/drivers/clk/bcm/clk-iproc-pll.c
index 1a098db12062..680f9d8d357c 100644
--- a/drivers/clk/bcm/clk-iproc-pll.c
+++ b/drivers/clk/bcm/clk-iproc-pll.c
@@ -726,6 +726,7 @@ void iproc_pll_clk_setup(struct device_node *node,
 	const char *parent_name;
 	struct iproc_clk *iclk_array;
 	struct clk_hw_onecell_data *clk_data;
+	const char *clk_name;
 
 	if (WARN_ON(!pll_ctrl) || WARN_ON(!clk_ctrl))
 		return;
@@ -773,7 +774,12 @@ void iproc_pll_clk_setup(struct device_node *node,
 	iclk = &iclk_array[0];
 	iclk->pll = pll;
 
-	init.name = node->name;
+	ret = of_property_read_string_index(node, "clock-output-names",
+					    0, &clk_name);
+	if (WARN_ON(ret))
+		goto err_pll_register;
+
+	init.name = clk_name;
 	init.ops = &iproc_pll_ops;
 	init.flags = 0;
 	parent_name = of_clk_get_parent_name(node, 0);
@@ -793,13 +799,11 @@ void iproc_pll_clk_setup(struct device_node *node,
 		goto err_pll_register;
 
 	clk_data->hws[0] = &iclk->hw;
+	parent_name = clk_name;
 
 	/* now initialize and register all leaf clocks */
 	for (i = 1; i < num_clks; i++) {
-		const char *clk_name;
-
 		memset(&init, 0, sizeof(init));
-		parent_name = node->name;
 
 		ret = of_property_read_string_index(node, "clock-output-names",
 						    i, &clk_name);
-- 
cgit v1.2.3


From daaa2fbe678efdaced53d1c635f4d326751addf8 Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Tue, 30 Aug 2022 11:31:31 +0800
Subject: clk: imx93: drop of_match_ptr

There is build warning when CONFIG_OF is not selected.
>> drivers/clk/imx/clk-imx93.c:324:34: warning: 'imx93_clk_of_match'
>> defined but not used [-Wunused-const-variable=]
     324 | static const struct of_device_id imx93_clk_of_match[] = {
         |                                  ^~~~~~~~~~~~~~~~~~

The driver only support DT table, no sense to use of_match_ptr.

Fixes: 24defbe194b6 ("clk: imx: add i.MX93 clk")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20220830033137.4149542-3-peng.fan@oss.nxp.com
Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/imx/clk-imx93.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/clk/imx/clk-imx93.c b/drivers/clk/imx/clk-imx93.c
index f5c9fa40491c..dcc41d178238 100644
--- a/drivers/clk/imx/clk-imx93.c
+++ b/drivers/clk/imx/clk-imx93.c
@@ -332,7 +332,7 @@ static struct platform_driver imx93_clk_driver = {
 	.driver = {
 		.name = "imx93-ccm",
 		.suppress_bind_attrs = true,
-		.of_match_table = of_match_ptr(imx93_clk_of_match),
+		.of_match_table = imx93_clk_of_match,
 	},
 };
 module_platform_driver(imx93_clk_driver);
-- 
cgit v1.2.3


From e1e10b44cf284248fb099681f48cc723564a1cc8 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 27 Sep 2022 17:45:29 +0200
Subject: xfrm: pass extack down to xfrm_type ->init_state

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      | 3 ++-
 net/ipv4/ah4.c          | 2 +-
 net/ipv4/esp4.c         | 2 +-
 net/ipv4/ipcomp.c       | 3 ++-
 net/ipv4/xfrm4_tunnel.c | 2 +-
 net/ipv6/ah6.c          | 2 +-
 net/ipv6/esp6.c         | 2 +-
 net/ipv6/ipcomp6.c      | 3 ++-
 net/ipv6/mip6.c         | 4 ++--
 net/ipv6/xfrm6_tunnel.c | 2 +-
 net/xfrm/xfrm_state.c   | 2 +-
 11 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c504d07bcb7c..dbc81f5eb553 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -405,7 +405,8 @@ struct xfrm_type {
 #define XFRM_TYPE_LOCAL_COADDR	4
 #define XFRM_TYPE_REMOTE_COADDR	8
 
-	int			(*init_state)(struct xfrm_state *x);
+	int			(*init_state)(struct xfrm_state *x,
+					      struct netlink_ext_ack *extack);
 	void			(*destructor)(struct xfrm_state *);
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index f8ad04470d3a..babefff15de3 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -471,7 +471,7 @@ static int ah4_err(struct sk_buff *skb, u32 info)
 	return 0;
 }
 
-static int ah_init_state(struct xfrm_state *x)
+static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 5c03eba787e5..bc2b2c5717b5 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1131,7 +1131,7 @@ error:
 	return err;
 }
 
-static int esp_init_state(struct xfrm_state *x)
+static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct crypto_aead *aead;
 	u32 align;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 366094c1ce6c..230d1120874f 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -117,7 +117,8 @@ out:
 	return err;
 }
 
-static int ipcomp4_init_state(struct xfrm_state *x)
+static int ipcomp4_init_state(struct xfrm_state *x,
+			      struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 9d4f418f1bf8..08826e0d7962 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -22,7 +22,7 @@ static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
 	return ip_hdr(skb)->protocol;
 }
 
-static int ipip_init_state(struct xfrm_state *x)
+static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	if (x->props.mode != XFRM_MODE_TUNNEL)
 		return -EINVAL;
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index b5995c1f4d7a..f5bc0d4b37ad 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -666,7 +666,7 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return 0;
 }
 
-static int ah6_init_state(struct xfrm_state *x)
+static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct ah_data *ahp = NULL;
 	struct xfrm_algo_desc *aalg_desc;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 8220923a12f7..2ca9b7b7e500 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -1174,7 +1174,7 @@ error:
 	return err;
 }
 
-static int esp6_init_state(struct xfrm_state *x)
+static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct crypto_aead *aead;
 	u32 align;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 15f984be3570..7e47009739e9 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -136,7 +136,8 @@ out:
 	return err;
 }
 
-static int ipcomp6_init_state(struct xfrm_state *x)
+static int ipcomp6_init_state(struct xfrm_state *x,
+			      struct netlink_ext_ack *extack)
 {
 	int err = -EINVAL;
 
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index aeb35d26e474..3d87ae88ebfd 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -247,7 +247,7 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
 	return err;
 }
 
-static int mip6_destopt_init_state(struct xfrm_state *x)
+static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	if (x->id.spi) {
 		pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
@@ -333,7 +333,7 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
 	return 0;
 }
 
-static int mip6_rthdr_init_state(struct xfrm_state *x)
+static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	if (x->id.spi) {
 		pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 2b31112c0856..dda44b0671ac 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -270,7 +270,7 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return 0;
 }
 
-static int xfrm6_tunnel_init_state(struct xfrm_state *x)
+static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	if (x->props.mode != XFRM_MODE_TUNNEL)
 		return -EINVAL;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 0b59ff7985e6..82c571d07836 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -2673,7 +2673,7 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
 
 	x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload);
 
-	err = x->type->init_state(x);
+	err = x->type->init_state(x, extack);
 	if (err)
 		goto error;
 
-- 
cgit v1.2.3


From ef87a4f84b10187a1db8aee95ed5b863474750c1 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 27 Sep 2022 17:45:30 +0200
Subject: xfrm: ah: add extack to ah_init_state, ah6_init_state

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/ah4.c | 21 +++++++++++++--------
 net/ipv6/ah6.c | 21 ++++++++++++++-------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index babefff15de3..ee4e578c7f20 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -477,24 +477,32 @@ static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 	struct xfrm_algo_desc *aalg_desc;
 	struct crypto_ahash *ahash;
 
-	if (!x->aalg)
+	if (!x->aalg) {
+		NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
 		goto error;
+	}
 
-	if (x->encap)
+	if (x->encap) {
+		NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
 		goto error;
+	}
 
 	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
 	if (!ahp)
 		return -ENOMEM;
 
 	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
-	if (IS_ERR(ahash))
+	if (IS_ERR(ahash)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
+	}
 
 	ahp->ahash = ahash;
 	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
-				(x->aalg->alg_key_len + 7) / 8))
+				(x->aalg->alg_key_len + 7) / 8)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
+	}
 
 	/*
 	 * Lookup the algorithm description maintained by xfrm_algo,
@@ -507,10 +515,7 @@ static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 
 	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
 	    crypto_ahash_digestsize(ahash)) {
-		pr_info("%s: %s digestsize %u != %u\n",
-			__func__, x->aalg->alg_name,
-			crypto_ahash_digestsize(ahash),
-			aalg_desc->uinfo.auth.icv_fullbits / 8);
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
 	}
 
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index f5bc0d4b37ad..5228d2716289 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -672,24 +672,32 @@ static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 	struct xfrm_algo_desc *aalg_desc;
 	struct crypto_ahash *ahash;
 
-	if (!x->aalg)
+	if (!x->aalg) {
+		NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
 		goto error;
+	}
 
-	if (x->encap)
+	if (x->encap) {
+		NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
 		goto error;
+	}
 
 	ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
 	if (!ahp)
 		return -ENOMEM;
 
 	ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
-	if (IS_ERR(ahash))
+	if (IS_ERR(ahash)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
+	}
 
 	ahp->ahash = ahash;
 	if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
-			       (x->aalg->alg_key_len + 7) / 8))
+			       (x->aalg->alg_key_len + 7) / 8)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
+	}
 
 	/*
 	 * Lookup the algorithm description maintained by xfrm_algo,
@@ -702,9 +710,7 @@ static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 
 	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
 	    crypto_ahash_digestsize(ahash)) {
-		pr_info("AH: %s digestsize %u != %u\n",
-			x->aalg->alg_name, crypto_ahash_digestsize(ahash),
-			aalg_desc->uinfo.auth.icv_fullbits/8);
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
 	}
 
@@ -721,6 +727,7 @@ static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 		x->props.header_len += sizeof(struct ipv6hdr);
 		break;
 	default:
+		NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET");
 		goto error;
 	}
 	x->data = ahp;
-- 
cgit v1.2.3


From 67c44f93c951937b80735ada68f2de25885d1834 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 27 Sep 2022 17:45:31 +0200
Subject: xfrm: esp: add extack to esp_init_state, esp6_init_state

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4.c | 53 ++++++++++++++++++++++++++++++++---------------------
 net/ipv6/esp6.c | 53 ++++++++++++++++++++++++++++++++---------------------
 2 files changed, 64 insertions(+), 42 deletions(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index bc2b2c5717b5..751a05276f48 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1007,16 +1007,17 @@ static void esp_destroy(struct xfrm_state *x)
 	crypto_free_aead(aead);
 }
 
-static int esp_init_aead(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	char aead_name[CRYPTO_MAX_ALG_NAME];
 	struct crypto_aead *aead;
 	int err;
 
-	err = -ENAMETOOLONG;
 	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
-		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
-		goto error;
+		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+		NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+		return -ENAMETOOLONG;
+	}
 
 	aead = crypto_alloc_aead(aead_name, 0, 0);
 	err = PTR_ERR(aead);
@@ -1034,11 +1035,15 @@ static int esp_init_aead(struct xfrm_state *x)
 	if (err)
 		goto error;
 
+	return 0;
+
 error:
+	NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 	return err;
 }
 
-static int esp_init_authenc(struct xfrm_state *x)
+static int esp_init_authenc(struct xfrm_state *x,
+			    struct netlink_ext_ack *extack)
 {
 	struct crypto_aead *aead;
 	struct crypto_authenc_key_param *param;
@@ -1049,10 +1054,6 @@ static int esp_init_authenc(struct xfrm_state *x)
 	unsigned int keylen;
 	int err;
 
-	err = -EINVAL;
-	if (!x->ealg)
-		goto error;
-
 	err = -ENAMETOOLONG;
 
 	if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -1061,22 +1062,28 @@ static int esp_init_authenc(struct xfrm_state *x)
 			     x->geniv ?: "", x->geniv ? "(" : "",
 			     x->aalg ? x->aalg->alg_name : "digest_null",
 			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
 			goto error;
+		}
 	} else {
 		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
 			     "%s%sauthenc(%s,%s)%s",
 			     x->geniv ?: "", x->geniv ? "(" : "",
 			     x->aalg ? x->aalg->alg_name : "digest_null",
 			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
 			goto error;
+		}
 	}
 
 	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
-	if (IS_ERR(aead))
+	if (IS_ERR(aead)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
+	}
 
 	x->data = aead;
 
@@ -1106,17 +1113,16 @@ static int esp_init_authenc(struct xfrm_state *x)
 		err = -EINVAL;
 		if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
 		    crypto_aead_authsize(aead)) {
-			pr_info("ESP: %s digestsize %u != %u\n",
-				x->aalg->alg_name,
-				crypto_aead_authsize(aead),
-				aalg_desc->uinfo.auth.icv_fullbits / 8);
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 			goto free_key;
 		}
 
 		err = crypto_aead_setauthsize(
 			aead, x->aalg->alg_trunc_len / 8);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 			goto free_key;
+		}
 	}
 
 	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
@@ -1139,10 +1145,14 @@ static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 
 	x->data = NULL;
 
-	if (x->aead)
-		err = esp_init_aead(x);
-	else
-		err = esp_init_authenc(x);
+	if (x->aead) {
+		err = esp_init_aead(x, extack);
+	} else if (x->ealg) {
+		err = esp_init_authenc(x, extack);
+	} else {
+		NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+		err = -EINVAL;
+	}
 
 	if (err)
 		goto error;
@@ -1160,6 +1170,7 @@ static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 
 		switch (encap->encap_type) {
 		default:
+			NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
 			err = -EINVAL;
 			goto error;
 		case UDP_ENCAP_ESPINUDP:
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 2ca9b7b7e500..e7a16f9643e5 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -1050,16 +1050,17 @@ static void esp6_destroy(struct xfrm_state *x)
 	crypto_free_aead(aead);
 }
 
-static int esp_init_aead(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	char aead_name[CRYPTO_MAX_ALG_NAME];
 	struct crypto_aead *aead;
 	int err;
 
-	err = -ENAMETOOLONG;
 	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
-		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
-		goto error;
+		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+		NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+		return -ENAMETOOLONG;
+	}
 
 	aead = crypto_alloc_aead(aead_name, 0, 0);
 	err = PTR_ERR(aead);
@@ -1077,11 +1078,15 @@ static int esp_init_aead(struct xfrm_state *x)
 	if (err)
 		goto error;
 
+	return 0;
+
 error:
+	NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 	return err;
 }
 
-static int esp_init_authenc(struct xfrm_state *x)
+static int esp_init_authenc(struct xfrm_state *x,
+			    struct netlink_ext_ack *extack)
 {
 	struct crypto_aead *aead;
 	struct crypto_authenc_key_param *param;
@@ -1092,10 +1097,6 @@ static int esp_init_authenc(struct xfrm_state *x)
 	unsigned int keylen;
 	int err;
 
-	err = -EINVAL;
-	if (!x->ealg)
-		goto error;
-
 	err = -ENAMETOOLONG;
 
 	if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -1104,22 +1105,28 @@ static int esp_init_authenc(struct xfrm_state *x)
 			     x->geniv ?: "", x->geniv ? "(" : "",
 			     x->aalg ? x->aalg->alg_name : "digest_null",
 			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
 			goto error;
+		}
 	} else {
 		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
 			     "%s%sauthenc(%s,%s)%s",
 			     x->geniv ?: "", x->geniv ? "(" : "",
 			     x->aalg ? x->aalg->alg_name : "digest_null",
 			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
 			goto error;
+		}
 	}
 
 	aead = crypto_alloc_aead(authenc_name, 0, 0);
 	err = PTR_ERR(aead);
-	if (IS_ERR(aead))
+	if (IS_ERR(aead)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 		goto error;
+	}
 
 	x->data = aead;
 
@@ -1149,17 +1156,16 @@ static int esp_init_authenc(struct xfrm_state *x)
 		err = -EINVAL;
 		if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
 		    crypto_aead_authsize(aead)) {
-			pr_info("ESP: %s digestsize %u != %u\n",
-				x->aalg->alg_name,
-				crypto_aead_authsize(aead),
-				aalg_desc->uinfo.auth.icv_fullbits / 8);
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 			goto free_key;
 		}
 
 		err = crypto_aead_setauthsize(
 			aead, x->aalg->alg_trunc_len / 8);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
 			goto free_key;
+		}
 	}
 
 	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
@@ -1182,10 +1188,14 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 
 	x->data = NULL;
 
-	if (x->aead)
-		err = esp_init_aead(x);
-	else
-		err = esp_init_authenc(x);
+	if (x->aead) {
+		err = esp_init_aead(x, extack);
+	} else if (x->ealg) {
+		err = esp_init_authenc(x, extack);
+	} else {
+		NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+		err = -EINVAL;
+	}
 
 	if (err)
 		goto error;
@@ -1213,6 +1223,7 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 
 		switch (encap->encap_type) {
 		default:
+			NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
 			err = -EINVAL;
 			goto error;
 		case UDP_ENCAP_ESPINUDP:
-- 
cgit v1.2.3


From 25ec92cd042ace0c109c3f6e5e6b634073414cc0 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 27 Sep 2022 17:45:32 +0200
Subject: xfrm: tunnel: add extack to ipip_init_state, xfrm6_tunnel_init_state

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_tunnel.c | 8 ++++++--
 net/ipv6/xfrm6_tunnel.c | 8 ++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 08826e0d7962..8489fa106583 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -24,11 +24,15 @@ static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
 
 static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
-	if (x->props.mode != XFRM_MODE_TUNNEL)
+	if (x->props.mode != XFRM_MODE_TUNNEL) {
+		NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode");
 		return -EINVAL;
+	}
 
-	if (x->encap)
+	if (x->encap) {
+		NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation");
 		return -EINVAL;
+	}
 
 	x->props.header_len = sizeof(struct iphdr);
 
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index dda44b0671ac..1323f2f6928e 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -272,11 +272,15 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
-	if (x->props.mode != XFRM_MODE_TUNNEL)
+	if (x->props.mode != XFRM_MODE_TUNNEL) {
+		NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode");
 		return -EINVAL;
+	}
 
-	if (x->encap)
+	if (x->encap) {
+		NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation");
 		return -EINVAL;
+	}
 
 	x->props.header_len = sizeof(struct ipv6hdr);
 
-- 
cgit v1.2.3


From 6ee55320520e31f5dae637e928d5792352b22776 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 27 Sep 2022 17:45:33 +0200
Subject: xfrm: ipcomp: add extack to ipcomp{4,6}_init_state

And the shared helper ipcomp_init_state.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/ipcomp.h   |  2 +-
 net/ipv4/ipcomp.c      |  7 +++++--
 net/ipv6/ipcomp6.c     |  7 +++++--
 net/xfrm/xfrm_ipcomp.c | 10 +++++++---
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h
index c31108295079..8660a2a6d1fc 100644
--- a/include/net/ipcomp.h
+++ b/include/net/ipcomp.h
@@ -22,7 +22,7 @@ struct xfrm_state;
 int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb);
 int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb);
 void ipcomp_destroy(struct xfrm_state *x);
-int ipcomp_init_state(struct xfrm_state *x);
+int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack);
 
 static inline struct ip_comp_hdr *ip_comp_hdr(const struct sk_buff *skb)
 {
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 230d1120874f..5a4fb2539b08 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -130,17 +130,20 @@ static int ipcomp4_init_state(struct xfrm_state *x,
 		x->props.header_len += sizeof(struct iphdr);
 		break;
 	default:
+		NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
 		goto out;
 	}
 
-	err = ipcomp_init_state(x);
+	err = ipcomp_init_state(x, extack);
 	if (err)
 		goto out;
 
 	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		err = ipcomp_tunnel_attach(x);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
 			goto out;
+		}
 	}
 
 	err = 0;
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 7e47009739e9..72d4858dec18 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -149,17 +149,20 @@ static int ipcomp6_init_state(struct xfrm_state *x,
 		x->props.header_len += sizeof(struct ipv6hdr);
 		break;
 	default:
+		NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
 		goto out;
 	}
 
-	err = ipcomp_init_state(x);
+	err = ipcomp_init_state(x, extack);
 	if (err)
 		goto out;
 
 	if (x->props.mode == XFRM_MODE_TUNNEL) {
 		err = ipcomp6_tunnel_attach(x);
-		if (err)
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
 			goto out;
+		}
 	}
 
 	err = 0;
diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c
index cb40ff0ff28d..656045a87606 100644
--- a/net/xfrm/xfrm_ipcomp.c
+++ b/net/xfrm/xfrm_ipcomp.c
@@ -325,18 +325,22 @@ void ipcomp_destroy(struct xfrm_state *x)
 }
 EXPORT_SYMBOL_GPL(ipcomp_destroy);
 
-int ipcomp_init_state(struct xfrm_state *x)
+int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	int err;
 	struct ipcomp_data *ipcd;
 	struct xfrm_algo_desc *calg_desc;
 
 	err = -EINVAL;
-	if (!x->calg)
+	if (!x->calg) {
+		NL_SET_ERR_MSG(extack, "Missing required compression algorithm");
 		goto out;
+	}
 
-	if (x->encap)
+	if (x->encap) {
+		NL_SET_ERR_MSG(extack, "IPComp is not compatible with encapsulation");
 		goto out;
+	}
 
 	err = -ENOMEM;
 	ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL);
-- 
cgit v1.2.3


From 28b5dbd5dcf7659f64713c66eb7301924e070bf8 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Tue, 27 Sep 2022 17:45:34 +0200
Subject: xfrm: mip6: add extack to mip6_destopt_init_state,
 mip6_rthdr_init_state

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/mip6.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 3d87ae88ebfd..83d2a8be263f 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -250,12 +250,11 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
 static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	if (x->id.spi) {
-		pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+		NL_SET_ERR_MSG(extack, "SPI must be 0");
 		return -EINVAL;
 	}
 	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
-		pr_info("%s: state's mode is not %u: %u\n",
-			__func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
 		return -EINVAL;
 	}
 
@@ -336,12 +335,11 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
 static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	if (x->id.spi) {
-		pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+		NL_SET_ERR_MSG(extack, "SPI must be 0");
 		return -EINVAL;
 	}
 	if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
-		pr_info("%s: state's mode is not %u: %u\n",
-			__func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+		NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 457c8b60267054869513ab1fb5513abb0a566dd0 Mon Sep 17 00:00:00 2001
From: Zhengjun Xing <zhengjun.xing@linux.intel.com>
Date: Tue, 27 Sep 2022 13:15:13 +0800
Subject: perf test: Fix test case 87 ("perf record tests") for hybrid systems

The test case 87 ("perf record tests") failed on hybrid systems,the event
"cpu/br_inst_retired.near_call/p" is only for non-hybrid system. Correct
the test event to support both non-hybrid and hybrid systems.

Before:

  # ./perf test 87
  87: perf record tests                                   : FAILED!

After:

  # ./perf test 87
  87: perf record tests                                   : Ok

Fixes: 24f378e66021f559 ("perf test: Add basic perf record tests")
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Xing Zhengjun <zhengjun.xing@linux.intel.com>
Acked-by: Ian Rogers <irogers@google.com>
Cc: Alexander Shishkin <alexander.shishkin@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220927051513.3768717-1-zhengjun.xing@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/shell/record.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh
index 00c7285ce1ac..301f95427159 100755
--- a/tools/perf/tests/shell/record.sh
+++ b/tools/perf/tests/shell/record.sh
@@ -61,7 +61,7 @@ test_register_capture() {
     echo "Register capture test [Skipped missing registers]"
     return
   fi
-  if ! perf record -o - --intr-regs=di,r8,dx,cx -e cpu/br_inst_retired.near_call/p \
+  if ! perf record -o - --intr-regs=di,r8,dx,cx -e br_inst_retired.near_call:p \
     -c 1000 --per-thread true 2> /dev/null \
     | perf script -F ip,sym,iregs -i - 2> /dev/null \
     | egrep -q "DI:"
-- 
cgit v1.2.3


From 25c5e67cdf744cbb93fd06647611d3036218debe Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 28 Sep 2022 12:55:43 -0300
Subject: perf tests record: Fail the test if the 'errs' counter is not zero

We were just checking for the 'err' variable, when we should really see
if there was some of the many checked errors that don't stop the test
right away.

Detected with clang 15.0.0:

  44    75.23 fedora:37       : FAIL clang version 15.0.0 (Fedora 15.0.0-2.fc37)

    tests/perf-record.c:68:16: error: variable 'errs' set but not used [-Werror,-Wunused-but-set-variable]
            int err = -1, errs = 0, i, wakeups = 0;
                          ^
    1 error generated.

The patch introducing this 'perf test' entry had that check:

  +       return (err < 0 || errs > 0) ? -1 : 0;

But at some point we lost that:

  -	  return (err < 0 || errs > 0) ? -1 : 0;
  +	  if (err == -EACCES)
  +               return TEST_SKIP;
  +	  if (err < 0)
  +               return TEST_FAIL;
  +	  return TEST_OK

Put it back.

Fixes: 2cf88f4614c996e5 ("perf test: Use skip in PERF_RECORD_*")
Acked-by: Ian Rogers <irogers@google.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Link: https://lore.kernel.org/lkml/YzR0n5QhsH9VyYB0@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/perf-record.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c
index 6a001fcfed68..4952abe716f3 100644
--- a/tools/perf/tests/perf-record.c
+++ b/tools/perf/tests/perf-record.c
@@ -332,7 +332,7 @@ out_delete_evlist:
 out:
 	if (err == -EACCES)
 		return TEST_SKIP;
-	if (err < 0)
+	if (err < 0 || errs != 0)
 		return TEST_FAIL;
 	return TEST_OK;
 }
-- 
cgit v1.2.3


From 83ca5fb40e758e0a0257bf4e3a1148dd52c6d0f2 Mon Sep 17 00:00:00 2001
From: Leo Li <sunpeng.li@amd.com>
Date: Mon, 26 Sep 2022 17:53:55 -0400
Subject: drm/amd/display: Prevent OTG shutdown during PSR SU

[Why]

Enabling Z10 optimizations allows DMUB to disable the OTG during PSR
link-off. This theoretically saves power by putting more of the display
hardware to sleep. However, we observe that with PSR SU, it causes
visual artifacts, higher power usage, and potential system hang.

This is partly due to an odd behavior with the VStartup interrupt used
to signal DRM vblank events. If the OTG is toggled on/off during a PSR
link on/off cycle, the vstartup interrupt fires twice in quick
succession. This generates incorrectly timed vblank events.
Additionally, it can cause cursor updates to generate visual artifacts.

Note that this is not observed with PSR1 since PSR is fully disabled
when there are vblank event requestors. Cursor updates are also
artifact-free, likely because there are no selectively-updated (SU)
frames that can generate artifacts.

[How]

A potential solution is to disable z10 idle optimizations only when fast
updates (flips & cursor updates) are committed. A mechanism to do so
would require some thoughtful design. Let's just disable idle
optimizations for PSR2 for now.

Fixes: 7cc191ee7621 ("drm/amd/display: Implement MPO PSR SU")
Reported-by: August Wikerfors <git@augustwikerfors.se>
Link: https://lore.kernel.org/r/c1f8886a-5624-8f49-31b1-e42b6d20dcf5@augustwikerfors.se/
Tested-by: August Wikerfors <git@augustwikerfors.se>
Reviewed-by: Harry Wentland <harry.wentland@amd.com>
Signed-off-by: Leo Li <sunpeng.li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
index c8da18e45b0e..8ca10ab3dfc1 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_psr.c
@@ -170,7 +170,13 @@ bool amdgpu_dm_psr_enable(struct dc_stream_state *stream)
 					   &stream, 1,
 					   &params);
 
-	power_opt |= psr_power_opt_z10_static_screen;
+	/*
+	 * Only enable static-screen optimizations for PSR1. For PSR SU, this
+	 * causes vstartup interrupt issues, used by amdgpu_dm to send vblank
+	 * events.
+	 */
+	if (link->psr_settings.psr_version < DC_PSR_VERSION_SU_1)
+		power_opt |= psr_power_opt_z10_static_screen;
 
 	return dc_link_set_psr_allow_active(link, &psr_enable, false, false, &power_opt);
 }
-- 
cgit v1.2.3


From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Sep 2022 00:04:03 -0700
Subject: bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline

The struct_ops prog is to allow using bpf to implement the functions in
a struct (eg. kernel module).  The current usage is to implement the
tcp_congestion.  The kernel does not call the tcp-cc's ops (ie.
the bpf prog) in a recursive way.

The struct_ops is sharing the tracing-trampoline's enter/exit
function which tracks prog->active to avoid recursion.  It is
needed for tracing prog.  However, it turns out the struct_ops
bpf prog will hit this prog->active and unnecessarily skipped
running the struct_ops prog.  eg.  The '.ssthresh' may run in_task()
and then interrupted by softirq that runs the same '.ssthresh'.
Skip running the '.ssthresh' will end up returning random value
to the caller.

The patch adds __bpf_prog_{enter,exit}_struct_ops for the
struct_ops trampoline.  They do not track the prog->active
to detect recursion.

One exception is when the tcp_congestion's '.init' ops is doing
bpf_setsockopt(TCP_CONGESTION) and then recurs to the same
'.init' ops.  This will be addressed in the following patches.

Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism")
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c |  3 +++
 include/linux/bpf.h         |  4 ++++
 kernel/bpf/trampoline.c     | 23 +++++++++++++++++++++++
 3 files changed, 30 insertions(+)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 35796db58116..5b6230779cf3 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1836,6 +1836,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
 	if (p->aux->sleepable) {
 		enter = __bpf_prog_enter_sleepable;
 		exit = __bpf_prog_exit_sleepable;
+	} else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		enter = __bpf_prog_enter_struct_ops;
+		exit = __bpf_prog_exit_struct_ops;
 	} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
 		enter = __bpf_prog_enter_lsm_cgroup;
 		exit = __bpf_prog_exit_lsm_cgroup;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0f3eaf3ed98c..9e7d46d16032 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
 					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
 					struct bpf_tramp_run_ctx *run_ctx);
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx);
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx);
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 6f7b939321d6..bf0906e1e2b9 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -964,6 +964,29 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
 	rcu_read_unlock_trace();
 }
 
+u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__acquires(RCU)
+{
+	rcu_read_lock();
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+	return bpf_prog_start_time();
+}
+
+void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
+					struct bpf_tramp_run_ctx *run_ctx)
+	__releases(RCU)
+{
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+	update_prog_stats(prog, start);
+	migrate_enable();
+	rcu_read_unlock();
+}
+
 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
 {
 	percpu_ref_get(&tr->pcref);
-- 
cgit v1.2.3


From 37cfbe0bf2e85287350a6b0ca9521f5a4c7389ce Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Sep 2022 00:04:04 -0700
Subject: bpf: Move the "cdg" tcp-cc check to the common sol_tcp_sockopt()

The check on the tcp-cc, "cdg", is done in the bpf_sk_setsockopt which is
used by the bpf_tcp_ca, bpf_lsm, cg_sockopt, and tcp_iter hooks.
However, it is not done for cg sock_ddr, cg sockops, and some of
the bpf_lsm_cgroup hooks.

The tcp-cc "cdg" should have very limited usage.  This patch is to
move the "cdg" check to the common sol_tcp_sockopt() so that all
hooks have a consistent behavior.   The motivation to make
this check consistent now is because the latter patch will
refactor the bpf_setsockopt(TCP_CONGESTION) into another function,
so it is better to take this chance to refactor this piece
also.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20220929070407.965581-3-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 2fd9449026aa..f4cea3ff994a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5127,6 +5127,13 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 	case TCP_CONGESTION:
 		if (*optlen < 2)
 			return -EINVAL;
+		/* "cdg" is the only cc that alloc a ptr
+		 * in inet_csk_ca area.  The bpf-tcp-cc may
+		 * overwrite this ptr after switching to cdg.
+		 */
+		if (!getopt && *optlen >= sizeof("cdg") - 1 &&
+		    !strncmp("cdg", optval, *optlen))
+			return -ENOTSUPP;
 		break;
 	case TCP_SAVED_SYN:
 		if (*optlen < 1)
@@ -5285,12 +5292,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
 	   int, optname, char *, optval, int, optlen)
 {
-	if (level == SOL_TCP && optname == TCP_CONGESTION) {
-		if (optlen >= sizeof("cdg") - 1 &&
-		    !strncmp("cdg", optval, optlen))
-			return -ENOTSUPP;
-	}
-
 	return _bpf_setsockopt(sk, level, optname, optval, optlen);
 }
 
-- 
cgit v1.2.3


From 1e7d217faa11ac027f622124a3842aafbd0c4a42 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Sep 2022 00:04:05 -0700
Subject: bpf: Refactor bpf_setsockopt(TCP_CONGESTION) handling into another
 function

This patch moves the bpf_setsockopt(TCP_CONGESTION) logic into
another function.  The next patch will add extra logic to avoid
recursion and this will make the latter patch easier to follow.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20220929070407.965581-4-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/filter.c | 45 ++++++++++++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index f4cea3ff994a..96f2f7a65e65 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5102,6 +5102,33 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 	return 0;
 }
 
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+				      int *optlen, bool getopt)
+{
+	if (*optlen < 2)
+		return -EINVAL;
+
+	if (getopt) {
+		if (!inet_csk(sk)->icsk_ca_ops)
+			return -EINVAL;
+		/* BPF expects NULL-terminated tcp-cc string */
+		optval[--(*optlen)] = '\0';
+		return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+					 KERNEL_SOCKPTR(optval),
+					 KERNEL_SOCKPTR(optlen));
+	}
+
+	/* "cdg" is the only cc that alloc a ptr
+	 * in inet_csk_ca area.  The bpf-tcp-cc may
+	 * overwrite this ptr after switching to cdg.
+	 */
+	if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+		return -ENOTSUPP;
+
+	return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				KERNEL_SOCKPTR(optval), *optlen);
+}
+
 static int sol_tcp_sockopt(struct sock *sk, int optname,
 			   char *optval, int *optlen,
 			   bool getopt)
@@ -5125,16 +5152,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 			return -EINVAL;
 		break;
 	case TCP_CONGESTION:
-		if (*optlen < 2)
-			return -EINVAL;
-		/* "cdg" is the only cc that alloc a ptr
-		 * in inet_csk_ca area.  The bpf-tcp-cc may
-		 * overwrite this ptr after switching to cdg.
-		 */
-		if (!getopt && *optlen >= sizeof("cdg") - 1 &&
-		    !strncmp("cdg", optval, *optlen))
-			return -ENOTSUPP;
-		break;
+		return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
 	case TCP_SAVED_SYN:
 		if (*optlen < 1)
 			return -EINVAL;
@@ -5159,13 +5177,6 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 			return 0;
 		}
 
-		if (optname == TCP_CONGESTION) {
-			if (!inet_csk(sk)->icsk_ca_ops)
-				return -EINVAL;
-			/* BPF expects NULL-terminated tcp-cc string */
-			optval[--(*optlen)] = '\0';
-		}
-
 		return do_tcp_getsockopt(sk, SOL_TCP, optname,
 					 KERNEL_SOCKPTR(optval),
 					 KERNEL_SOCKPTR(optlen));
-- 
cgit v1.2.3


From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Sep 2022 00:04:06 -0700
Subject: bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur
 itself

When a bad bpf prog '.init' calls
bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop:

.init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ...
... => .init => bpf_setsockopt(tcp_cc).

It was prevented by the prog->active counter before but the prog->active
detection cannot be used in struct_ops as explained in the earlier
patch of the set.

In this patch, the second bpf_setsockopt(tcp_cc) is not allowed
in order to break the loop.  This is done by using a bit of
an existing 1 byte hole in tcp_sock to check if there is
on-going bpf_setsockopt(TCP_CONGESTION) in this tcp_sock.

Note that this essentially limits only the first '.init' can
call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. peer
does not support ECN) and the second '.init' cannot fallback to
another cc.  This applies even the second
bpf_setsockopt(TCP_CONGESTION) will not cause a loop.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/tcp.h      |  6 ++++++
 net/core/filter.c        | 28 +++++++++++++++++++++++++++-
 net/ipv4/tcp_minisocks.c |  1 +
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a9fbe22732c3..3bdf687e2fb3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -388,6 +388,12 @@ struct tcp_sock {
 	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
 					 * values defined in uapi/linux/tcp.h
 					 */
+	u8	bpf_chg_cc_inprogress:1; /* In the middle of
+					  * bpf_setsockopt(TCP_CONGESTION),
+					  * it is to avoid the bpf_tcp_cc->init()
+					  * to recur itself by calling
+					  * bpf_setsockopt(TCP_CONGESTION, "itself").
+					  */
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
 #else
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
diff --git a/net/core/filter.c b/net/core/filter.c
index 96f2f7a65e65..ac4c45c02da5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5105,6 +5105,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
 				      int *optlen, bool getopt)
 {
+	struct tcp_sock *tp;
+	int ret;
+
 	if (*optlen < 2)
 		return -EINVAL;
 
@@ -5125,8 +5128,31 @@ static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
 	if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
 		return -ENOTSUPP;
 
-	return do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+	/* It stops this looping
+	 *
+	 * .init => bpf_setsockopt(tcp_cc) => .init =>
+	 * bpf_setsockopt(tcp_cc)" => .init => ....
+	 *
+	 * The second bpf_setsockopt(tcp_cc) is not allowed
+	 * in order to break the loop when both .init
+	 * are the same bpf prog.
+	 *
+	 * This applies even the second bpf_setsockopt(tcp_cc)
+	 * does not cause a loop.  This limits only the first
+	 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+	 * pick a fallback cc (eg. peer does not support ECN)
+	 * and the second '.init' cannot fallback to
+	 * another.
+	 */
+	tp = tcp_sk(sk);
+	if (tp->bpf_chg_cc_inprogress)
+		return -EBUSY;
+
+	tp->bpf_chg_cc_inprogress = 1;
+	ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
 				KERNEL_SOCKPTR(optval), *optlen);
+	tp->bpf_chg_cc_inprogress = 0;
+	return ret;
 }
 
 static int sol_tcp_sockopt(struct sock *sk, int optname,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb95d88497ae..ddcdc2bc4c04 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -541,6 +541,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->fastopen_req = NULL;
 	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
 
+	newtp->bpf_chg_cc_inprogress = 0;
 	tcp_bpf_clone(sk, newsk);
 
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
-- 
cgit v1.2.3


From 3411c5b6f8d6e08d98e606dcf74fc42e2f9d731f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <martin.lau@kernel.org>
Date: Thu, 29 Sep 2022 00:04:07 -0700
Subject: selftests/bpf: Check -EBUSY for the recurred
 bpf_setsockopt(TCP_CONGESTION)

This patch changes the bpf_dctcp test to ensure the recurred
bpf_setsockopt(TCP_CONGESTION) returns -EBUSY.

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Link: https://lore.kernel.org/r/20220929070407.965581-6-martin.lau@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c  |  4 ++++
 tools/testing/selftests/bpf/progs/bpf_dctcp.c      | 25 +++++++++++++++-------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
index 2959a52ced06..e980188d4124 100644
--- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -290,6 +290,10 @@ static void test_dctcp_fallback(void)
 		goto done;
 	ASSERT_STREQ(dctcp_skel->bss->cc_res, "cubic", "cc_res");
 	ASSERT_EQ(dctcp_skel->bss->tcp_cdg_res, -ENOTSUPP, "tcp_cdg_res");
+	/* All setsockopt(TCP_CONGESTION) in the recurred
+	 * bpf_dctcp->init() should fail with -EBUSY.
+	 */
+	ASSERT_EQ(dctcp_skel->bss->ebusy_cnt, 3, "ebusy_cnt");
 
 	err = getsockopt(srv_fd, SOL_TCP, TCP_CONGESTION, srv_cc, &cc_len);
 	if (!ASSERT_OK(err, "getsockopt(srv_fd, TCP_CONGESTION)"))
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
index 9573be6122be..460682759aed 100644
--- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/tcp.h>
+#include <errno.h>
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_tcp_helpers.h"
@@ -23,6 +24,7 @@ const char tcp_cdg[] = "cdg";
 char cc_res[TCP_CA_NAME_MAX];
 int tcp_cdg_res = 0;
 int stg_result = 0;
+int ebusy_cnt = 0;
 
 struct {
 	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
@@ -64,16 +66,23 @@ void BPF_PROG(dctcp_init, struct sock *sk)
 
 	if (!(tp->ecn_flags & TCP_ECN_OK) && fallback[0]) {
 		/* Switch to fallback */
-		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
-			       (void *)fallback, sizeof(fallback));
-		/* Switch back to myself which the bpf trampoline
-		 * stopped calling dctcp_init recursively.
+		if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				   (void *)fallback, sizeof(fallback)) == -EBUSY)
+			ebusy_cnt++;
+
+		/* Switch back to myself and the recurred dctcp_init()
+		 * will get -EBUSY for all bpf_setsockopt(TCP_CONGESTION),
+		 * except the last "cdg" one.
 		 */
-		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
-			       (void *)bpf_dctcp, sizeof(bpf_dctcp));
+		if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				   (void *)bpf_dctcp, sizeof(bpf_dctcp)) == -EBUSY)
+			ebusy_cnt++;
+
 		/* Switch back to fallback */
-		bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
-			       (void *)fallback, sizeof(fallback));
+		if (bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+				   (void *)fallback, sizeof(fallback)) == -EBUSY)
+			ebusy_cnt++;
+
 		/* Expecting -ENOTSUPP for tcp_cdg_res */
 		tcp_cdg_res = bpf_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
 					     (void *)tcp_cdg, sizeof(tcp_cdg));
-- 
cgit v1.2.3


From 903cfe8a7aa8894ae60ef47a9c011e551d7bafef Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Fri, 23 Sep 2022 17:42:05 +0200
Subject: samples/landlock: Print hints about ABI versions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the help with the latest Landlock ABI version supported by the
sandboxer.

Inform users about the sandboxer or the kernel not being up-to-date.

Make the version check code easier to update and harder to misuse.

Cc: Paul Moore <paul@paul-moore.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Reviewed-by: Günther Noack <gnoack3000@gmail.com>
Link: https://lore.kernel.org/r/20220923154207.3311629-2-mic@digikod.net
---
 samples/landlock/sandboxer.c | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/samples/landlock/sandboxer.c b/samples/landlock/sandboxer.c
index 3e404e51ec64..f29bb3c72230 100644
--- a/samples/landlock/sandboxer.c
+++ b/samples/landlock/sandboxer.c
@@ -162,11 +162,10 @@ out_free_name:
 	LANDLOCK_ACCESS_FS_MAKE_SYM | \
 	LANDLOCK_ACCESS_FS_REFER)
 
-#define ACCESS_ABI_2 ( \
-	LANDLOCK_ACCESS_FS_REFER)
-
 /* clang-format on */
 
+#define LANDLOCK_ABI_LAST 2
+
 int main(const int argc, char *const argv[], char *const *const envp)
 {
 	const char *cmd_path;
@@ -196,8 +195,12 @@ int main(const int argc, char *const argv[], char *const *const envp)
 			"\nexample:\n"
 			"%s=\"/bin:/lib:/usr:/proc:/etc:/dev/urandom\" "
 			"%s=\"/dev/null:/dev/full:/dev/zero:/dev/pts:/tmp\" "
-			"%s bash -i\n",
+			"%s bash -i\n\n",
 			ENV_FS_RO_NAME, ENV_FS_RW_NAME, argv[0]);
+		fprintf(stderr,
+			"This sandboxer can use Landlock features "
+			"up to ABI version %d.\n",
+			LANDLOCK_ABI_LAST);
 		return 1;
 	}
 
@@ -225,12 +228,30 @@ int main(const int argc, char *const argv[], char *const *const envp)
 		}
 		return 1;
 	}
+
 	/* Best-effort security. */
-	if (abi < 2) {
-		ruleset_attr.handled_access_fs &= ~ACCESS_ABI_2;
-		access_fs_ro &= ~ACCESS_ABI_2;
-		access_fs_rw &= ~ACCESS_ABI_2;
+	switch (abi) {
+	case 1:
+		/* Removes LANDLOCK_ACCESS_FS_REFER for ABI < 2 */
+		ruleset_attr.handled_access_fs &= ~LANDLOCK_ACCESS_FS_REFER;
+
+		fprintf(stderr,
+			"Hint: You should update the running kernel "
+			"to leverage Landlock features "
+			"provided by ABI version %d (instead of %d).\n",
+			LANDLOCK_ABI_LAST, abi);
+		__attribute__((fallthrough));
+	case LANDLOCK_ABI_LAST:
+		break;
+	default:
+		fprintf(stderr,
+			"Hint: You should update this sandboxer "
+			"to leverage Landlock features "
+			"provided by ABI version %d (instead of %d).\n",
+			abi, LANDLOCK_ABI_LAST);
 	}
+	access_fs_ro &= ruleset_attr.handled_access_fs;
+	access_fs_rw &= ruleset_attr.handled_access_fs;
 
 	ruleset_fd =
 		landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0);
-- 
cgit v1.2.3


From 16023b05f0832f5bc14e6e0d1e7be4d00e01e1bb Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Fri, 23 Sep 2022 17:42:06 +0200
Subject: landlock: Slightly improve documentation and fix spelling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that we have more than one ABI version, make limitation explanation
more consistent by replacing "ABI 1" with "ABI < 2".  This also
indicates which ABIs support such past limitation.

Improve documentation consistency by not using contractions.

Fix spelling in fs.c .

Cc: Paul Moore <paul@paul-moore.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Reviewed-by: Günther Noack <gnoack3000@gmail.com>
Link: https://lore.kernel.org/r/20220923154207.3311629-3-mic@digikod.net
---
 Documentation/security/landlock.rst      |  4 ++--
 Documentation/userspace-api/landlock.rst | 10 +++++-----
 security/landlock/fs.c                   |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
index 5c77730b4479..cc9617f3175b 100644
--- a/Documentation/security/landlock.rst
+++ b/Documentation/security/landlock.rst
@@ -7,7 +7,7 @@ Landlock LSM: kernel documentation
 ==================================
 
 :Author: Mickaël Salaün
-:Date: May 2022
+:Date: September 2022
 
 Landlock's goal is to create scoped access-control (i.e. sandboxing).  To
 harden a whole system, this feature should be available to any process,
@@ -49,7 +49,7 @@ Filesystem access rights
 ------------------------
 
 All access rights are tied to an inode and what can be accessed through it.
-Reading the content of a directory doesn't imply to be allowed to read the
+Reading the content of a directory does not imply to be allowed to read the
 content of a listed inode.  Indeed, a file name is local to its parent
 directory, and an inode can be referenced by multiple file names thanks to
 (hard) links.  Being able to unlink a file only has a direct impact on the
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index b8ea59493964..83bae71bf042 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -8,7 +8,7 @@ Landlock: unprivileged access control
 =====================================
 
 :Author: Mickaël Salaün
-:Date: May 2022
+:Date: September 2022
 
 The goal of Landlock is to enable to restrict ambient rights (e.g. global
 filesystem access) for a set of processes.  Because Landlock is a stackable
@@ -170,7 +170,7 @@ It is recommended setting access rights to file hierarchy leaves as much as
 possible.  For instance, it is better to be able to have ``~/doc/`` as a
 read-only hierarchy and ``~/tmp/`` as a read-write hierarchy, compared to
 ``~/`` as a read-only hierarchy and ``~/tmp/`` as a read-write hierarchy.
-Following this good practice leads to self-sufficient hierarchies that don't
+Following this good practice leads to self-sufficient hierarchies that do not
 depend on their location (i.e. parent directories).  This is particularly
 relevant when we want to allow linking or renaming.  Indeed, having consistent
 access rights per directory enables to change the location of such directory
@@ -380,8 +380,8 @@ by the Documentation/admin-guide/cgroup-v1/memory.rst.
 Previous limitations
 ====================
 
-File renaming and linking (ABI 1)
----------------------------------
+File renaming and linking (ABI < 2)
+-----------------------------------
 
 Because Landlock targets unprivileged access controls, it needs to properly
 handle composition of rules.  Such property also implies rules nesting.
@@ -410,7 +410,7 @@ contains `CONFIG_LSM=landlock,[...]` with `[...]`  as the list of other
 potentially useful security modules for the running system (see the
 `CONFIG_LSM` help).
 
-If the running kernel doesn't have `landlock` in `CONFIG_LSM`, then we can
+If the running kernel does not have `landlock` in `CONFIG_LSM`, then we can
 still enable it by adding ``lsm=landlock,[...]`` to
 Documentation/admin-guide/kernel-parameters.rst thanks to the bootloader
 configuration.
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index a9dbd99d9ee7..64ed7665455f 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -712,7 +712,7 @@ static inline access_mask_t maybe_remove(const struct dentry *const dentry)
  * allowed accesses in @layer_masks_dom.
  *
  * This is similar to check_access_path_dual() but much simpler because it only
- * handles walking on the same mount point and only check one set of accesses.
+ * handles walking on the same mount point and only checks one set of accesses.
  *
  * Returns:
  * - true if all the domain access rights are allowed for @dir;
-- 
cgit v1.2.3


From 2fff00c81d4c37a037cf704d2d219fbcb45aea3c Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Fri, 23 Sep 2022 17:42:07 +0200
Subject: landlock: Fix documentation style
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems that all code should use double backquotes, which is also used
to convert "%" defines.  Let's use an homogeneous style and remove all
use of simple backquotes (which should only be used for emphasis).

Cc: Günther Noack <gnoack3000@gmail.com>
Cc: Paul Moore <paul@paul-moore.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Link: https://lore.kernel.org/r/20220923154207.3311629-4-mic@digikod.net
---
 Documentation/security/landlock.rst      |  4 ++--
 Documentation/userspace-api/landlock.rst | 25 ++++++++++----------
 include/uapi/linux/landlock.h            | 10 ++++----
 security/landlock/syscalls.c             | 40 ++++++++++++++++----------------
 4 files changed, 40 insertions(+), 39 deletions(-)

diff --git a/Documentation/security/landlock.rst b/Documentation/security/landlock.rst
index cc9617f3175b..c0029d5d02eb 100644
--- a/Documentation/security/landlock.rst
+++ b/Documentation/security/landlock.rst
@@ -54,8 +54,8 @@ content of a listed inode.  Indeed, a file name is local to its parent
 directory, and an inode can be referenced by multiple file names thanks to
 (hard) links.  Being able to unlink a file only has a direct impact on the
 directory, not the unlinked inode.  This is the reason why
-`LANDLOCK_ACCESS_FS_REMOVE_FILE` or `LANDLOCK_ACCESS_FS_REFER` are not allowed
-to be tied to files but only to directories.
+``LANDLOCK_ACCESS_FS_REMOVE_FILE`` or ``LANDLOCK_ACCESS_FS_REFER`` are not
+allowed to be tied to files but only to directories.
 
 Tests
 =====
diff --git a/Documentation/userspace-api/landlock.rst b/Documentation/userspace-api/landlock.rst
index 83bae71bf042..cec780c2f497 100644
--- a/Documentation/userspace-api/landlock.rst
+++ b/Documentation/userspace-api/landlock.rst
@@ -69,7 +69,7 @@ should try to protect users as much as possible whatever the kernel they are
 using.  To avoid binary enforcement (i.e. either all security features or
 none), we can leverage a dedicated Landlock command to get the current version
 of the Landlock ABI and adapt the handled accesses.  Let's check if we should
-remove the `LANDLOCK_ACCESS_FS_REFER` access right which is only supported
+remove the ``LANDLOCK_ACCESS_FS_REFER`` access right which is only supported
 starting with the second version of the ABI.
 
 .. code-block:: c
@@ -128,7 +128,7 @@ descriptor.
 It may also be required to create rules following the same logic as explained
 for the ruleset creation, by filtering access rights according to the Landlock
 ABI version.  In this example, this is not required because
-`LANDLOCK_ACCESS_FS_REFER` is not allowed by any rule.
+``LANDLOCK_ACCESS_FS_REFER`` is not allowed by any rule.
 
 We now have a ruleset with one rule allowing read access to ``/usr`` while
 denying all other handled accesses for the filesystem.  The next step is to
@@ -154,8 +154,8 @@ The current thread is now ready to sandbox itself with the ruleset.
     }
     close(ruleset_fd);
 
-If the `landlock_restrict_self` system call succeeds, the current thread is now
-restricted and this policy will be enforced on all its subsequently created
+If the ``landlock_restrict_self`` system call succeeds, the current thread is
+now restricted and this policy will be enforced on all its subsequently created
 children as well.  Once a thread is landlocked, there is no way to remove its
 security policy; only adding more restrictions is allowed.  These threads are
 now in a new Landlock domain, merge of their parent one (if any) with the new
@@ -175,7 +175,8 @@ depend on their location (i.e. parent directories).  This is particularly
 relevant when we want to allow linking or renaming.  Indeed, having consistent
 access rights per directory enables to change the location of such directory
 without relying on the destination directory access rights (except those that
-are required for this operation, see `LANDLOCK_ACCESS_FS_REFER` documentation).
+are required for this operation, see ``LANDLOCK_ACCESS_FS_REFER``
+documentation).
 Having self-sufficient hierarchies also helps to tighten the required access
 rights to the minimal set of data.  This also helps avoid sinkhole directories,
 i.e.  directories where data can be linked to but not linked from.  However,
@@ -259,7 +260,7 @@ Backward and forward compatibility
 
 Landlock is designed to be compatible with past and future versions of the
 kernel.  This is achieved thanks to the system call attributes and the
-associated bitflags, particularly the ruleset's `handled_access_fs`.  Making
+associated bitflags, particularly the ruleset's ``handled_access_fs``.  Making
 handled access right explicit enables the kernel and user space to have a clear
 contract with each other.  This is required to make sure sandboxing will not
 get stricter with a system update, which could break applications.
@@ -394,7 +395,7 @@ according to the potentially lost constraints.  To protect against privilege
 escalations through renaming or linking, and for the sake of simplicity,
 Landlock previously limited linking and renaming to the same directory.
 Starting with the Landlock ABI version 2, it is now possible to securely
-control renaming and linking thanks to the new `LANDLOCK_ACCESS_FS_REFER`
+control renaming and linking thanks to the new ``LANDLOCK_ACCESS_FS_REFER``
 access right.
 
 .. _kernel_support:
@@ -403,14 +404,14 @@ Kernel support
 ==============
 
 Landlock was first introduced in Linux 5.13 but it must be configured at build
-time with `CONFIG_SECURITY_LANDLOCK=y`.  Landlock must also be enabled at boot
+time with ``CONFIG_SECURITY_LANDLOCK=y``.  Landlock must also be enabled at boot
 time as the other security modules.  The list of security modules enabled by
-default is set with `CONFIG_LSM`.  The kernel configuration should then
-contains `CONFIG_LSM=landlock,[...]` with `[...]`  as the list of other
+default is set with ``CONFIG_LSM``.  The kernel configuration should then
+contains ``CONFIG_LSM=landlock,[...]`` with ``[...]``  as the list of other
 potentially useful security modules for the running system (see the
-`CONFIG_LSM` help).
+``CONFIG_LSM`` help).
 
-If the running kernel does not have `landlock` in `CONFIG_LSM`, then we can
+If the running kernel does not have ``landlock`` in ``CONFIG_LSM``, then we can
 still enable it by adding ``lsm=landlock,[...]`` to
 Documentation/admin-guide/kernel-parameters.rst thanks to the bootloader
 configuration.
diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 23df4e0e8ace..9c4bcc37a455 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -26,7 +26,7 @@ struct landlock_ruleset_attr {
 	 * Landlock filesystem access rights that are not part of
 	 * handled_access_fs are allowed.  This is needed for backward
 	 * compatibility reasons.  One exception is the
-	 * LANDLOCK_ACCESS_FS_REFER access right, which is always implicitly
+	 * %LANDLOCK_ACCESS_FS_REFER access right, which is always implicitly
 	 * handled, but must still be explicitly handled to add new rules with
 	 * this access right.
 	 */
@@ -128,11 +128,11 @@ struct landlock_path_beneath_attr {
  *   hierarchy must also always have the same or a superset of restrictions of
  *   the source hierarchy.  If it is not the case, or if the domain doesn't
  *   handle this access right, such actions are denied by default with errno
- *   set to EXDEV.  Linking also requires a LANDLOCK_ACCESS_FS_MAKE_* access
- *   right on the destination directory, and renaming also requires a
- *   LANDLOCK_ACCESS_FS_REMOVE_* access right on the source's (file or
+ *   set to ``EXDEV``.  Linking also requires a ``LANDLOCK_ACCESS_FS_MAKE_*``
+ *   access right on the destination directory, and renaming also requires a
+ *   ``LANDLOCK_ACCESS_FS_REMOVE_*`` access right on the source's (file or
  *   directory) parent.  Otherwise, such actions are denied with errno set to
- *   EACCES.  The EACCES errno prevails over EXDEV to let user space
+ *   ``EACCES``.  The ``EACCES`` errno prevails over ``EXDEV`` to let user space
  *   efficiently deal with an unrecoverable error.
  *
  * .. warning::
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 735a0865ea11..2ca0ccbd905a 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -149,10 +149,10 @@ static const struct file_operations ruleset_fops = {
  *
  * Possible returned errors are:
  *
- * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
- * - EINVAL: unknown @flags, or unknown access, or too small @size;
- * - E2BIG or EFAULT: @attr or @size inconsistencies;
- * - ENOMSG: empty &landlock_ruleset_attr.handled_access_fs.
+ * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - %EINVAL: unknown @flags, or unknown access, or too small @size;
+ * - %E2BIG or %EFAULT: @attr or @size inconsistencies;
+ * - %ENOMSG: empty &landlock_ruleset_attr.handled_access_fs.
  */
 SYSCALL_DEFINE3(landlock_create_ruleset,
 		const struct landlock_ruleset_attr __user *const, attr,
@@ -280,7 +280,7 @@ out_fdput:
  * @ruleset_fd: File descriptor tied to the ruleset that should be extended
  *		with the new rule.
  * @rule_type: Identify the structure type pointed to by @rule_attr (only
- *             LANDLOCK_RULE_PATH_BENEATH for now).
+ *             %LANDLOCK_RULE_PATH_BENEATH for now).
  * @rule_attr: Pointer to a rule (only of type &struct
  *             landlock_path_beneath_attr for now).
  * @flags: Must be 0.
@@ -290,17 +290,17 @@ out_fdput:
  *
  * Possible returned errors are:
  *
- * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
- * - EINVAL: @flags is not 0, or inconsistent access in the rule (i.e.
+ * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - %EINVAL: @flags is not 0, or inconsistent access in the rule (i.e.
  *   &landlock_path_beneath_attr.allowed_access is not a subset of the
  *   ruleset handled accesses);
- * - ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access);
- * - EBADF: @ruleset_fd is not a file descriptor for the current thread, or a
+ * - %ENOMSG: Empty accesses (e.g. &landlock_path_beneath_attr.allowed_access);
+ * - %EBADF: @ruleset_fd is not a file descriptor for the current thread, or a
  *   member of @rule_attr is not a file descriptor as expected;
- * - EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of
+ * - %EBADFD: @ruleset_fd is not a ruleset file descriptor, or a member of
  *   @rule_attr is not the expected file descriptor type;
- * - EPERM: @ruleset_fd has no write access to the underlying ruleset;
- * - EFAULT: @rule_attr inconsistency.
+ * - %EPERM: @ruleset_fd has no write access to the underlying ruleset;
+ * - %EFAULT: @rule_attr inconsistency.
  */
 SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd,
 		const enum landlock_rule_type, rule_type,
@@ -378,20 +378,20 @@ out_put_ruleset:
  * @flags: Must be 0.
  *
  * This system call enables to enforce a Landlock ruleset on the current
- * thread.  Enforcing a ruleset requires that the task has CAP_SYS_ADMIN in its
+ * thread.  Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its
  * namespace or is running with no_new_privs.  This avoids scenarios where
  * unprivileged tasks can affect the behavior of privileged children.
  *
  * Possible returned errors are:
  *
- * - EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
- * - EINVAL: @flags is not 0.
- * - EBADF: @ruleset_fd is not a file descriptor for the current thread;
- * - EBADFD: @ruleset_fd is not a ruleset file descriptor;
- * - EPERM: @ruleset_fd has no read access to the underlying ruleset, or the
+ * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time;
+ * - %EINVAL: @flags is not 0.
+ * - %EBADF: @ruleset_fd is not a file descriptor for the current thread;
+ * - %EBADFD: @ruleset_fd is not a ruleset file descriptor;
+ * - %EPERM: @ruleset_fd has no read access to the underlying ruleset, or the
  *   current thread is not running with no_new_privs, or it doesn't have
- *   CAP_SYS_ADMIN in its namespace.
- * - E2BIG: The maximum number of stacked rulesets is reached for the current
+ *   %CAP_SYS_ADMIN in its namespace.
+ * - %E2BIG: The maximum number of stacked rulesets is reached for the current
  *   thread.
  */
 SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32,
-- 
cgit v1.2.3


From 5149a427d27751da95b04fabe98ab65f412b5777 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 29 Sep 2022 10:13:47 +0200
Subject: perf parse-events: Ignore clang 15 warning about variable set but
 unused in bison produced code

clang 15 now warns:

  46    65.20 fedora:rawhide                : FAIL clang version 15.0.0 (Fedora 15.0.0-3.fc38)
    util/parse-events-bison.c:1401:9: error: variable 'parse_events_nerrs' set but not used [-Werror,-Wunused-but-set-variable]
        int yynerrs = 0;
            ^
    #define yynerrs         parse_events_nerrs
                            ^
    1 error generated.
    make[3]: *** [/git/perf-6.0.0-rc7/tools/build/Makefile.build:139: util] Error 2

Just ignore one more compiler warning for the bison generated C code.

Committer notes:

Older clangs don't know about -Wunused-but-set-variable, so we need to
add -Wno-unknown-warning-option to avoid this:

  37    44.92 fedora:32                     : FAIL clang version 10.0.1 (Fedora 10.0.1-3.fc32)
    error: unknown warning option '-Wno-unused-but-set-variable'; did you mean '-Wno-unused-const-variable'? [-Werror,-Wunknown-warning-option]
    make[3]: *** [/git/perf-6.0.0-rc7/tools/build/Makefile.build:139: util] Error 2

Reported-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/lkml/20220929140514.226807-1-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/Build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 9dfae1bda9cc..485e1a343165 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -269,7 +269,7 @@ CFLAGS_expr-flex.o          += $(flex_flags)
 bison_flags := -DYYENABLE_NLS=0
 BISON_GE_35 := $(shell expr $(shell $(BISON) --version | grep bison | sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\)/\1\2/g') \>\= 35)
 ifeq ($(BISON_GE_35),1)
-  bison_flags += -Wno-unused-parameter -Wno-nested-externs -Wno-implicit-function-declaration -Wno-switch-enum
+  bison_flags += -Wno-unused-parameter -Wno-nested-externs -Wno-implicit-function-declaration -Wno-switch-enum -Wno-unused-but-set-variable -Wno-unknown-warning-option
 else
   bison_flags += -w
 endif
-- 
cgit v1.2.3


From 5551717bddb0c8aa78c4d2f8846624e2cf6816f7 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Thu, 29 Sep 2022 16:05:14 +0200
Subject: perf tests mmap-basic: Remove unused variable to address clang 15
 warning

A clang 15 build reveal several unused-but-set variables, removing the
'foo' variable in tests/mmap-basic.o object to address one of those
cases.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lore.kernel.org/lkml/20220929140514.226807-2-jolsa@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/tests/mmap-basic.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c
index dfb6173b2a82..9e9a2b67de19 100644
--- a/tools/perf/tests/mmap-basic.c
+++ b/tools/perf/tests/mmap-basic.c
@@ -114,8 +114,7 @@ static int test__basic_mmap(struct test_suite *test __maybe_unused, int subtest
 
 	for (i = 0; i < nsyscalls; ++i)
 		for (j = 0; j < expected_nr_events[i]; ++j) {
-			int foo = syscalls[i]();
-			++foo;
+			syscalls[i]();
 		}
 
 	md = &evlist->mmap[0];
-- 
cgit v1.2.3


From 9d0f5cd167444dcffcd46fe19a2cf86f30371f5c Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:53 -0600
Subject: docs: promote the title of process/index.rst

...otherwise Sphinx won't cooperate when trying to list it explicitly in
the top-level index.rst file

Reviewed-by: David Vernet <void@manifault.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-2-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/process/index.rst b/Documentation/process/index.rst
index 2ba2a1582bbe..d4b6217472b0 100644
--- a/Documentation/process/index.rst
+++ b/Documentation/process/index.rst
@@ -5,6 +5,7 @@
 
 .. _process_index:
 
+=============================================
 Working with the kernel development community
 =============================================
 
-- 
cgit v1.2.3


From 0c7b4366f1ab955f0b8411c14e764d75e21e1a1c Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:54 -0600
Subject: docs: Rewrite the front page

The front page is the entry point to the documentation, especially for
people who read it online.  It's a big mess of everything we could think to
toss into it.  Rewrite the page with an eye toward simplicity and making it
easy for readers to get going toward what they really want to find.

This is only a beginning, but it makes our docs more approachable than
before.

Acked-by: Jani Nikula <jani.nikula@intel.com>
Reviewed-by: David Vernet <void@manifault.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-3-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/index.rst          | 148 ++++++++++++++-------------------------
 Documentation/subsystem-apis.rst |  58 +++++++++++++++
 2 files changed, 110 insertions(+), 96 deletions(-)
 create mode 100644 Documentation/subsystem-apis.rst

diff --git a/Documentation/index.rst b/Documentation/index.rst
index 4737c18c97ff..bc492e79f1be 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -18,131 +18,88 @@ documents into a coherent whole.  Please note that improvements to the
 documentation are welcome; join the linux-doc list at vger.kernel.org if
 you want to help out.
 
-Licensing documentation
------------------------
+Working with the development community
+--------------------------------------
 
-The following describes the license of the Linux kernel source code
-(GPLv2), how to properly mark the license of individual files in the source
-tree, as well as links to the full license text.
-
-* :ref:`kernel_licensing`
-
-User-oriented documentation
----------------------------
-
-The following manuals are written for *users* of the kernel — those who are
-trying to get it to work optimally on a given system.
+The essential guides for interacting with the kernel's development
+community and getting your work upstream.
 
 .. toctree::
-   :maxdepth: 2
-
-   admin-guide/index
-   kbuild/index
-
-Firmware-related documentation
-------------------------------
-The following holds information on the kernel's expectations regarding the
-platform firmwares.
+   :maxdepth: 1
 
-.. toctree::
-   :maxdepth: 2
+   process/development-process
+   process/submitting-patches
+   Code of conduct <process/code-of-conduct>
+   maintainer/index
+   All development-process docs <process/index>
 
-   firmware-guide/index
-   devicetree/index
 
-Application-developer documentation
------------------------------------
+Internal API manuals
+--------------------
 
-The user-space API manual gathers together documents describing aspects of
-the kernel interface as seen by application developers.
+Manuals for use by developers working to interface with the rest of the
+kernel.
 
 .. toctree::
-   :maxdepth: 2
-
-   userspace-api/index
+   :maxdepth: 1
 
+   core-api/index
+   driver-api/index
+   subsystem-apis
+   Locking in the kernel <locking/index>
 
-Introduction to kernel development
-----------------------------------
+Development tools and processes
+-------------------------------
 
-These manuals contain overall information about how to develop the kernel.
-The kernel community is quite large, with thousands of developers
-contributing over the course of a year.  As with any large community,
-knowing how things are done will make the process of getting your changes
-merged much easier.
+Various other manuals with useful information for all kernel developers.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
-   process/index
-   dev-tools/index
+   process/license-rules
    doc-guide/index
+   dev-tools/index
+   dev-tools/testing-overview
    kernel-hacking/index
    trace/index
-   maintainer/index
    fault-injection/index
    livepatch/index
 
 
-Kernel API documentation
-------------------------
+User-oriented documentation
+---------------------------
 
-These books get into the details of how specific kernel subsystems work
-from the point of view of a kernel developer.  Much of the information here
-is taken directly from the kernel source, with supplemental material added
-as needed (or at least as we managed to add it — probably *not* all that is
-needed).
+The following manuals are written for *users* of the kernel — those who are
+trying to get it to work optimally on a given system and application
+developers seeking information on the kernel's user-space APIs.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
+
+   admin-guide/index
+   The kernel build system <kbuild/index>
+   admin-guide/reporting-issues.rst
+   User-space tools <tools/index>
+   userspace-api/index
+
+
+Firmware-related documentation
+------------------------------
+The following holds information on the kernel's expectations regarding the
+platform firmwares.
+
+.. toctree::
+   :maxdepth: 1
+
+   firmware-guide/index
+   devicetree/index
 
-   driver-api/index
-   core-api/index
-   locking/index
-   accounting/index
-   block/index
-   cdrom/index
-   cpu-freq/index
-   fb/index
-   fpga/index
-   hid/index
-   i2c/index
-   iio/index
-   isdn/index
-   infiniband/index
-   leds/index
-   netlabel/index
-   networking/index
-   pcmcia/index
-   power/index
-   target/index
-   timers/index
-   spi/index
-   w1/index
-   watchdog/index
-   virt/index
-   input/index
-   hwmon/index
-   gpu/index
-   security/index
-   sound/index
-   crypto/index
-   filesystems/index
-   mm/index
-   bpf/index
-   usb/index
-   PCI/index
-   scsi/index
-   misc-devices/index
-   scheduler/index
-   mhi/index
-   peci/index
 
 Architecture-agnostic documentation
 -----------------------------------
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
    asm-annotations
 
@@ -163,9 +120,8 @@ of the documentation body, or may require some adjustments and/or conversion
 to ReStructured Text format, or are simply too old.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
-   tools/index
    staging/index
 
 
diff --git a/Documentation/subsystem-apis.rst b/Documentation/subsystem-apis.rst
new file mode 100644
index 000000000000..af65004a80aa
--- /dev/null
+++ b/Documentation/subsystem-apis.rst
@@ -0,0 +1,58 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Kernel subsystem documentation
+==============================
+
+These books get into the details of how specific kernel subsystems work
+from the point of view of a kernel developer.  Much of the information here
+is taken directly from the kernel source, with supplemental material added
+as needed (or at least as we managed to add it — probably *not* all that is
+needed).
+
+**Fixme**: much more organizational work is needed here.
+
+.. toctree::
+   :maxdepth: 1
+
+   driver-api/index
+   core-api/index
+   locking/index
+   accounting/index
+   block/index
+   cdrom/index
+   cpu-freq/index
+   fb/index
+   fpga/index
+   hid/index
+   i2c/index
+   iio/index
+   isdn/index
+   infiniband/index
+   leds/index
+   netlabel/index
+   networking/index
+   pcmcia/index
+   power/index
+   target/index
+   timers/index
+   spi/index
+   w1/index
+   watchdog/index
+   virt/index
+   input/index
+   hwmon/index
+   gpu/index
+   security/index
+   sound/index
+   crypto/index
+   filesystems/index
+   mm/index
+   bpf/index
+   usb/index
+   PCI/index
+   scsi/index
+   misc-devices/index
+   scheduler/index
+   mhi/index
+   peci/index
-- 
cgit v1.2.3


From 3aa024e4e91249524e1342a6790bb561fbea89a4 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:55 -0600
Subject: docs: reconfigure the HTML left column

Use the html_sidebars directive to get a more useful set of links in the
left column.

Unfortunately, this is a no-op with the default RTD theme, but others
observe it.

Reviewed-by: David Vernet <void@manifault.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-4-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/conf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/conf.py b/Documentation/conf.py
index cdcc8e0f46a6..c45a3b7f7717 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -369,7 +369,8 @@ html_static_path = ['sphinx-static']
 html_use_smartypants = False
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
+# Note that the RTD theme ignores this.
+html_sidebars = { '**': ['searchbox.html', 'localtoc.html', 'sourcelink.html']}
 
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
-- 
cgit v1.2.3


From 90c0bf89dec0c66d2e6538857060586073427059 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:56 -0600
Subject: docs: remove some index.rst cruft

There is some useless boilerplate text that was added by sphinx when this
file was first created; take it out.

Reviewed-by: David Vernet <void@manifault.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-5-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/index.rst | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Documentation/index.rst b/Documentation/index.rst
index bc492e79f1be..da80c584133c 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -1,11 +1,5 @@
 .. SPDX-License-Identifier: GPL-2.0
 
-
-.. The Linux Kernel documentation master file, created by
-   sphinx-quickstart on Fri Feb 12 13:51:46 2016.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
 .. _linux_doc:
 
 The Linux Kernel documentation
-- 
cgit v1.2.3


From f4bf1cd4ac9c8c4610b687e49a1ba691ab286235 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:57 -0600
Subject: docs: move asm-annotations.rst into core-api

This one file should not really be in the top-level documentation
directory.  core-api/ may not be a perfect fit but seems to be best, so
move it there.  Adjust a couple of internal document references to make
them location-independent, and point checkpatch.pl at the new location.

Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Joe Perches <joe@perches.com>
Reviewed-by: David Vernet <void@manifault.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-6-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/asm-annotations.rst          | 221 ----------------------------
 Documentation/core-api/asm-annotations.rst | 222 +++++++++++++++++++++++++++++
 Documentation/core-api/index.rst           |   1 +
 Documentation/index.rst                    |   8 --
 scripts/checkpatch.pl                      |   2 +-
 5 files changed, 224 insertions(+), 230 deletions(-)
 delete mode 100644 Documentation/asm-annotations.rst
 create mode 100644 Documentation/core-api/asm-annotations.rst

diff --git a/Documentation/asm-annotations.rst b/Documentation/asm-annotations.rst
deleted file mode 100644
index a64f2ca469d4..000000000000
--- a/Documentation/asm-annotations.rst
+++ /dev/null
@@ -1,221 +0,0 @@
-Assembler Annotations
-=====================
-
-Copyright (c) 2017-2019 Jiri Slaby
-
-This document describes the new macros for annotation of data and code in
-assembly. In particular, it contains information about ``SYM_FUNC_START``,
-``SYM_FUNC_END``, ``SYM_CODE_START``, and similar.
-
-Rationale
----------
-Some code like entries, trampolines, or boot code needs to be written in
-assembly. The same as in C, such code is grouped into functions and
-accompanied with data. Standard assemblers do not force users into precisely
-marking these pieces as code, data, or even specifying their length.
-Nevertheless, assemblers provide developers with such annotations to aid
-debuggers throughout assembly. On top of that, developers also want to mark
-some functions as *global* in order to be visible outside of their translation
-units.
-
-Over time, the Linux kernel has adopted macros from various projects (like
-``binutils``) to facilitate such annotations. So for historic reasons,
-developers have been using ``ENTRY``, ``END``, ``ENDPROC``, and other
-annotations in assembly.  Due to the lack of their documentation, the macros
-are used in rather wrong contexts at some locations. Clearly, ``ENTRY`` was
-intended to denote the beginning of global symbols (be it data or code).
-``END`` used to mark the end of data or end of special functions with
-*non-standard* calling convention. In contrast, ``ENDPROC`` should annotate
-only ends of *standard* functions.
-
-When these macros are used correctly, they help assemblers generate a nice
-object with both sizes and types set correctly. For example, the result of
-``arch/x86/lib/putuser.S``::
-
-   Num:    Value          Size Type    Bind   Vis      Ndx Name
-    25: 0000000000000000    33 FUNC    GLOBAL DEFAULT    1 __put_user_1
-    29: 0000000000000030    37 FUNC    GLOBAL DEFAULT    1 __put_user_2
-    32: 0000000000000060    36 FUNC    GLOBAL DEFAULT    1 __put_user_4
-    35: 0000000000000090    37 FUNC    GLOBAL DEFAULT    1 __put_user_8
-
-This is not only important for debugging purposes. When there are properly
-annotated objects like this, tools can be run on them to generate more useful
-information. In particular, on properly annotated objects, ``objtool`` can be
-run to check and fix the object if needed. Currently, ``objtool`` can report
-missing frame pointer setup/destruction in functions. It can also
-automatically generate annotations for :doc:`ORC unwinder <x86/orc-unwinder>`
-for most code. Both of these are especially important to support reliable
-stack traces which are in turn necessary for :doc:`Kernel live patching
-<livepatch/livepatch>`.
-
-Caveat and Discussion
----------------------
-As one might realize, there were only three macros previously. That is indeed
-insufficient to cover all the combinations of cases:
-
-* standard/non-standard function
-* code/data
-* global/local symbol
-
-There was a discussion_ and instead of extending the current ``ENTRY/END*``
-macros, it was decided that brand new macros should be introduced instead::
-
-    So how about using macro names that actually show the purpose, instead
-    of importing all the crappy, historic, essentially randomly chosen
-    debug symbol macro names from the binutils and older kernels?
-
-.. _discussion: https://lore.kernel.org/r/20170217104757.28588-1-jslaby@suse.cz
-
-Macros Description
-------------------
-
-The new macros are prefixed with the ``SYM_`` prefix and can be divided into
-three main groups:
-
-1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
-   standard C calling conventions. For example, on x86, this means that the
-   stack contains a return address at the predefined place and a return from
-   the function can happen in a standard way. When frame pointers are enabled,
-   save/restore of frame pointer shall happen at the start/end of a function,
-   respectively, too.
-
-   Checking tools like ``objtool`` should ensure such marked functions conform
-   to these rules. The tools can also easily annotate these functions with
-   debugging information (like *ORC data*) automatically.
-
-2. ``SYM_CODE_*`` -- special functions called with special stack. Be it
-   interrupt handlers with special stack content, trampolines, or startup
-   functions.
-
-   Checking tools mostly ignore checking of these functions. But some debug
-   information still can be generated automatically. For correct debug data,
-   this code needs hints like ``UNWIND_HINT_REGS`` provided by developers.
-
-3. ``SYM_DATA*`` -- obviously data belonging to ``.data`` sections and not to
-   ``.text``. Data do not contain instructions, so they have to be treated
-   specially by the tools: they should not treat the bytes as instructions,
-   nor assign any debug information to them.
-
-Instruction Macros
-~~~~~~~~~~~~~~~~~~
-This section covers ``SYM_FUNC_*`` and ``SYM_CODE_*`` enumerated above.
-
-``objtool`` requires that all code must be contained in an ELF symbol. Symbol
-names that have a ``.L`` prefix do not emit symbol table entries. ``.L``
-prefixed symbols can be used within a code region, but should be avoided for
-denoting a range of code via ``SYM_*_START/END`` annotations.
-
-* ``SYM_FUNC_START`` and ``SYM_FUNC_START_LOCAL`` are supposed to be **the
-  most frequent markings**. They are used for functions with standard calling
-  conventions -- global and local. Like in C, they both align the functions to
-  architecture specific ``__ALIGN`` bytes. There are also ``_NOALIGN`` variants
-  for special cases where developers do not want this implicit alignment.
-
-  ``SYM_FUNC_START_WEAK`` and ``SYM_FUNC_START_WEAK_NOALIGN`` markings are
-  also offered as an assembler counterpart to the *weak* attribute known from
-  C.
-
-  All of these **shall** be coupled with ``SYM_FUNC_END``. First, it marks
-  the sequence of instructions as a function and computes its size to the
-  generated object file. Second, it also eases checking and processing such
-  object files as the tools can trivially find exact function boundaries.
-
-  So in most cases, developers should write something like in the following
-  example, having some asm instructions in between the macros, of course::
-
-    SYM_FUNC_START(memset)
-        ... asm insns ...
-    SYM_FUNC_END(memset)
-
-  In fact, this kind of annotation corresponds to the now deprecated ``ENTRY``
-  and ``ENDPROC`` macros.
-
-* ``SYM_FUNC_ALIAS``, ``SYM_FUNC_ALIAS_LOCAL``, and ``SYM_FUNC_ALIAS_WEAK`` can
-  be used to define multiple names for a function. The typical use is::
-
-    SYM_FUNC_START(__memset)
-        ... asm insns ...
-    SYN_FUNC_END(__memset)
-    SYM_FUNC_ALIAS(memset, __memset)
-
-  In this example, one can call ``__memset`` or ``memset`` with the same
-  result, except the debug information for the instructions is generated to
-  the object file only once -- for the non-``ALIAS`` case.
-
-* ``SYM_CODE_START`` and ``SYM_CODE_START_LOCAL`` should be used only in
-  special cases -- if you know what you are doing. This is used exclusively
-  for interrupt handlers and similar where the calling convention is not the C
-  one. ``_NOALIGN`` variants exist too. The use is the same as for the ``FUNC``
-  category above::
-
-    SYM_CODE_START_LOCAL(bad_put_user)
-        ... asm insns ...
-    SYM_CODE_END(bad_put_user)
-
-  Again, every ``SYM_CODE_START*`` **shall** be coupled by ``SYM_CODE_END``.
-
-  To some extent, this category corresponds to deprecated ``ENTRY`` and
-  ``END``. Except ``END`` had several other meanings too.
-
-* ``SYM_INNER_LABEL*`` is used to denote a label inside some
-  ``SYM_{CODE,FUNC}_START`` and ``SYM_{CODE,FUNC}_END``.  They are very similar
-  to C labels, except they can be made global. An example of use::
-
-    SYM_CODE_START(ftrace_caller)
-        /* save_mcount_regs fills in first two parameters */
-        ...
-
-    SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
-        /* Load the ftrace_ops into the 3rd parameter */
-        ...
-
-    SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
-        call ftrace_stub
-        ...
-        retq
-    SYM_CODE_END(ftrace_caller)
-
-Data Macros
-~~~~~~~~~~~
-Similar to instructions, there is a couple of macros to describe data in the
-assembly.
-
-* ``SYM_DATA_START`` and ``SYM_DATA_START_LOCAL`` mark the start of some data
-  and shall be used in conjunction with either ``SYM_DATA_END``, or
-  ``SYM_DATA_END_LABEL``. The latter adds also a label to the end, so that
-  people can use ``lstack`` and (local) ``lstack_end`` in the following
-  example::
-
-    SYM_DATA_START_LOCAL(lstack)
-        .skip 4096
-    SYM_DATA_END_LABEL(lstack, SYM_L_LOCAL, lstack_end)
-
-* ``SYM_DATA`` and ``SYM_DATA_LOCAL`` are variants for simple, mostly one-line
-  data::
-
-    SYM_DATA(HEAP,     .long rm_heap)
-    SYM_DATA(heap_end, .long rm_stack)
-
-  In the end, they expand to ``SYM_DATA_START`` with ``SYM_DATA_END``
-  internally.
-
-Support Macros
-~~~~~~~~~~~~~~
-All the above reduce themselves to some invocation of ``SYM_START``,
-``SYM_END``, or ``SYM_ENTRY`` at last. Normally, developers should avoid using
-these.
-
-Further, in the above examples, one could see ``SYM_L_LOCAL``. There are also
-``SYM_L_GLOBAL`` and ``SYM_L_WEAK``. All are intended to denote linkage of a
-symbol marked by them. They are used either in ``_LABEL`` variants of the
-earlier macros, or in ``SYM_START``.
-
-
-Overriding Macros
-~~~~~~~~~~~~~~~~~
-Architecture can also override any of the macros in their own
-``asm/linkage.h``, including macros specifying the type of a symbol
-(``SYM_T_FUNC``, ``SYM_T_OBJECT``, and ``SYM_T_NONE``).  As every macro
-described in this file is surrounded by ``#ifdef`` + ``#endif``, it is enough
-to define the macros differently in the aforementioned architecture-dependent
-header.
diff --git a/Documentation/core-api/asm-annotations.rst b/Documentation/core-api/asm-annotations.rst
new file mode 100644
index 000000000000..bc514ed59887
--- /dev/null
+++ b/Documentation/core-api/asm-annotations.rst
@@ -0,0 +1,222 @@
+Assembler Annotations
+=====================
+
+Copyright (c) 2017-2019 Jiri Slaby
+
+This document describes the new macros for annotation of data and code in
+assembly. In particular, it contains information about ``SYM_FUNC_START``,
+``SYM_FUNC_END``, ``SYM_CODE_START``, and similar.
+
+Rationale
+---------
+Some code like entries, trampolines, or boot code needs to be written in
+assembly. The same as in C, such code is grouped into functions and
+accompanied with data. Standard assemblers do not force users into precisely
+marking these pieces as code, data, or even specifying their length.
+Nevertheless, assemblers provide developers with such annotations to aid
+debuggers throughout assembly. On top of that, developers also want to mark
+some functions as *global* in order to be visible outside of their translation
+units.
+
+Over time, the Linux kernel has adopted macros from various projects (like
+``binutils``) to facilitate such annotations. So for historic reasons,
+developers have been using ``ENTRY``, ``END``, ``ENDPROC``, and other
+annotations in assembly.  Due to the lack of their documentation, the macros
+are used in rather wrong contexts at some locations. Clearly, ``ENTRY`` was
+intended to denote the beginning of global symbols (be it data or code).
+``END`` used to mark the end of data or end of special functions with
+*non-standard* calling convention. In contrast, ``ENDPROC`` should annotate
+only ends of *standard* functions.
+
+When these macros are used correctly, they help assemblers generate a nice
+object with both sizes and types set correctly. For example, the result of
+``arch/x86/lib/putuser.S``::
+
+   Num:    Value          Size Type    Bind   Vis      Ndx Name
+    25: 0000000000000000    33 FUNC    GLOBAL DEFAULT    1 __put_user_1
+    29: 0000000000000030    37 FUNC    GLOBAL DEFAULT    1 __put_user_2
+    32: 0000000000000060    36 FUNC    GLOBAL DEFAULT    1 __put_user_4
+    35: 0000000000000090    37 FUNC    GLOBAL DEFAULT    1 __put_user_8
+
+This is not only important for debugging purposes. When there are properly
+annotated objects like this, tools can be run on them to generate more useful
+information. In particular, on properly annotated objects, ``objtool`` can be
+run to check and fix the object if needed. Currently, ``objtool`` can report
+missing frame pointer setup/destruction in functions. It can also
+automatically generate annotations for the ORC unwinder
+(Documentation/x86/orc-unwinder.rst)
+for most code. Both of these are especially important to support reliable
+stack traces which are in turn necessary for kernel live patching
+(Documentation/livepatch/livepatch.rst).
+
+Caveat and Discussion
+---------------------
+As one might realize, there were only three macros previously. That is indeed
+insufficient to cover all the combinations of cases:
+
+* standard/non-standard function
+* code/data
+* global/local symbol
+
+There was a discussion_ and instead of extending the current ``ENTRY/END*``
+macros, it was decided that brand new macros should be introduced instead::
+
+    So how about using macro names that actually show the purpose, instead
+    of importing all the crappy, historic, essentially randomly chosen
+    debug symbol macro names from the binutils and older kernels?
+
+.. _discussion: https://lore.kernel.org/r/20170217104757.28588-1-jslaby@suse.cz
+
+Macros Description
+------------------
+
+The new macros are prefixed with the ``SYM_`` prefix and can be divided into
+three main groups:
+
+1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with
+   standard C calling conventions. For example, on x86, this means that the
+   stack contains a return address at the predefined place and a return from
+   the function can happen in a standard way. When frame pointers are enabled,
+   save/restore of frame pointer shall happen at the start/end of a function,
+   respectively, too.
+
+   Checking tools like ``objtool`` should ensure such marked functions conform
+   to these rules. The tools can also easily annotate these functions with
+   debugging information (like *ORC data*) automatically.
+
+2. ``SYM_CODE_*`` -- special functions called with special stack. Be it
+   interrupt handlers with special stack content, trampolines, or startup
+   functions.
+
+   Checking tools mostly ignore checking of these functions. But some debug
+   information still can be generated automatically. For correct debug data,
+   this code needs hints like ``UNWIND_HINT_REGS`` provided by developers.
+
+3. ``SYM_DATA*`` -- obviously data belonging to ``.data`` sections and not to
+   ``.text``. Data do not contain instructions, so they have to be treated
+   specially by the tools: they should not treat the bytes as instructions,
+   nor assign any debug information to them.
+
+Instruction Macros
+~~~~~~~~~~~~~~~~~~
+This section covers ``SYM_FUNC_*`` and ``SYM_CODE_*`` enumerated above.
+
+``objtool`` requires that all code must be contained in an ELF symbol. Symbol
+names that have a ``.L`` prefix do not emit symbol table entries. ``.L``
+prefixed symbols can be used within a code region, but should be avoided for
+denoting a range of code via ``SYM_*_START/END`` annotations.
+
+* ``SYM_FUNC_START`` and ``SYM_FUNC_START_LOCAL`` are supposed to be **the
+  most frequent markings**. They are used for functions with standard calling
+  conventions -- global and local. Like in C, they both align the functions to
+  architecture specific ``__ALIGN`` bytes. There are also ``_NOALIGN`` variants
+  for special cases where developers do not want this implicit alignment.
+
+  ``SYM_FUNC_START_WEAK`` and ``SYM_FUNC_START_WEAK_NOALIGN`` markings are
+  also offered as an assembler counterpart to the *weak* attribute known from
+  C.
+
+  All of these **shall** be coupled with ``SYM_FUNC_END``. First, it marks
+  the sequence of instructions as a function and computes its size to the
+  generated object file. Second, it also eases checking and processing such
+  object files as the tools can trivially find exact function boundaries.
+
+  So in most cases, developers should write something like in the following
+  example, having some asm instructions in between the macros, of course::
+
+    SYM_FUNC_START(memset)
+        ... asm insns ...
+    SYM_FUNC_END(memset)
+
+  In fact, this kind of annotation corresponds to the now deprecated ``ENTRY``
+  and ``ENDPROC`` macros.
+
+* ``SYM_FUNC_ALIAS``, ``SYM_FUNC_ALIAS_LOCAL``, and ``SYM_FUNC_ALIAS_WEAK`` can
+  be used to define multiple names for a function. The typical use is::
+
+    SYM_FUNC_START(__memset)
+        ... asm insns ...
+    SYN_FUNC_END(__memset)
+    SYM_FUNC_ALIAS(memset, __memset)
+
+  In this example, one can call ``__memset`` or ``memset`` with the same
+  result, except the debug information for the instructions is generated to
+  the object file only once -- for the non-``ALIAS`` case.
+
+* ``SYM_CODE_START`` and ``SYM_CODE_START_LOCAL`` should be used only in
+  special cases -- if you know what you are doing. This is used exclusively
+  for interrupt handlers and similar where the calling convention is not the C
+  one. ``_NOALIGN`` variants exist too. The use is the same as for the ``FUNC``
+  category above::
+
+    SYM_CODE_START_LOCAL(bad_put_user)
+        ... asm insns ...
+    SYM_CODE_END(bad_put_user)
+
+  Again, every ``SYM_CODE_START*`` **shall** be coupled by ``SYM_CODE_END``.
+
+  To some extent, this category corresponds to deprecated ``ENTRY`` and
+  ``END``. Except ``END`` had several other meanings too.
+
+* ``SYM_INNER_LABEL*`` is used to denote a label inside some
+  ``SYM_{CODE,FUNC}_START`` and ``SYM_{CODE,FUNC}_END``.  They are very similar
+  to C labels, except they can be made global. An example of use::
+
+    SYM_CODE_START(ftrace_caller)
+        /* save_mcount_regs fills in first two parameters */
+        ...
+
+    SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
+        /* Load the ftrace_ops into the 3rd parameter */
+        ...
+
+    SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
+        call ftrace_stub
+        ...
+        retq
+    SYM_CODE_END(ftrace_caller)
+
+Data Macros
+~~~~~~~~~~~
+Similar to instructions, there is a couple of macros to describe data in the
+assembly.
+
+* ``SYM_DATA_START`` and ``SYM_DATA_START_LOCAL`` mark the start of some data
+  and shall be used in conjunction with either ``SYM_DATA_END``, or
+  ``SYM_DATA_END_LABEL``. The latter adds also a label to the end, so that
+  people can use ``lstack`` and (local) ``lstack_end`` in the following
+  example::
+
+    SYM_DATA_START_LOCAL(lstack)
+        .skip 4096
+    SYM_DATA_END_LABEL(lstack, SYM_L_LOCAL, lstack_end)
+
+* ``SYM_DATA`` and ``SYM_DATA_LOCAL`` are variants for simple, mostly one-line
+  data::
+
+    SYM_DATA(HEAP,     .long rm_heap)
+    SYM_DATA(heap_end, .long rm_stack)
+
+  In the end, they expand to ``SYM_DATA_START`` with ``SYM_DATA_END``
+  internally.
+
+Support Macros
+~~~~~~~~~~~~~~
+All the above reduce themselves to some invocation of ``SYM_START``,
+``SYM_END``, or ``SYM_ENTRY`` at last. Normally, developers should avoid using
+these.
+
+Further, in the above examples, one could see ``SYM_L_LOCAL``. There are also
+``SYM_L_GLOBAL`` and ``SYM_L_WEAK``. All are intended to denote linkage of a
+symbol marked by them. They are used either in ``_LABEL`` variants of the
+earlier macros, or in ``SYM_START``.
+
+
+Overriding Macros
+~~~~~~~~~~~~~~~~~
+Architecture can also override any of the macros in their own
+``asm/linkage.h``, including macros specifying the type of a symbol
+(``SYM_T_FUNC``, ``SYM_T_OBJECT``, and ``SYM_T_NONE``).  As every macro
+described in this file is surrounded by ``#ifdef`` + ``#endif``, it is enough
+to define the macros differently in the aforementioned architecture-dependent
+header.
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index dc95df462eea..f5d8e3779fe8 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -23,6 +23,7 @@ it.
    printk-formats
    printk-index
    symbol-namespaces
+   asm-annotations
 
 Data structures and low-level utilities
 =======================================
diff --git a/Documentation/index.rst b/Documentation/index.rst
index da80c584133c..5a700548ae82 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -89,14 +89,6 @@ platform firmwares.
    devicetree/index
 
 
-Architecture-agnostic documentation
------------------------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   asm-annotations
-
 Architecture-specific documentation
 -----------------------------------
 
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 79e759aac543..812af52f97d2 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -3751,7 +3751,7 @@ sub process {
 		if ($realfile =~ /\.S$/ &&
 		    $line =~ /^\+\s*(?:[A-Z]+_)?SYM_[A-Z]+_(?:START|END)(?:_[A-Z_]+)?\s*\(\s*\.L/) {
 			WARN("AVOID_L_PREFIX",
-			     "Avoid using '.L' prefixed local symbol names for denoting a range of code via 'SYM_*_START/END' annotations; see Documentation/asm-annotations.rst\n" . $herecurr);
+			     "Avoid using '.L' prefixed local symbol names for denoting a range of code via 'SYM_*_START/END' annotations; see Documentation/core-api/asm-annotations.rst\n" . $herecurr);
 		}
 
 # check we are in a valid source file C or perl if not then ignore this hunk
-- 
cgit v1.2.3


From e40573a43d163a5c9fe14c647bc4c5201d782893 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:58 -0600
Subject: docs: put atomic*.txt and memory-barriers.txt into the core-api book

These files describe part of the core API, but have never been converted to
RST due to ... let's say local oppposition.  So, create a set of
special-purpose wrappers to ..include these files into a separate page so
that they can be a part of the htmldocs build.  Then link them into the
core-api manual and remove them from the "staging" dumping ground.

Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: David Vernet <void@manifault.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-7-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/core-api/index.rst                   |  3 ++
 Documentation/core-api/wrappers/atomic_bitops.rst  | 18 ++++++++++
 Documentation/core-api/wrappers/atomic_t.rst       | 19 ++++++++++
 .../core-api/wrappers/memory-barriers.rst          | 18 ++++++++++
 Documentation/staging/index.rst                    | 42 ----------------------
 5 files changed, 58 insertions(+), 42 deletions(-)
 create mode 100644 Documentation/core-api/wrappers/atomic_bitops.rst
 create mode 100644 Documentation/core-api/wrappers/atomic_t.rst
 create mode 100644 Documentation/core-api/wrappers/memory-barriers.rst

diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index f5d8e3779fe8..b0e7b4771fff 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -45,6 +45,8 @@ Library functionality that is used throughout the kernel.
    this_cpu_ops
    timekeeping
    errseq
+   wrappers/atomic_t
+   wrappers/atomic_bitops
 
 Low level entry and exit
 ========================
@@ -68,6 +70,7 @@ Documentation/locking/index.rst for more related documentation.
    local_ops
    padata
    ../RCU/index
+   wrappers/memory-barriers.rst
 
 Low-level hardware management
 =============================
diff --git a/Documentation/core-api/wrappers/atomic_bitops.rst b/Documentation/core-api/wrappers/atomic_bitops.rst
new file mode 100644
index 000000000000..bf24e4081a8f
--- /dev/null
+++ b/Documentation/core-api/wrappers/atomic_bitops.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+   This is a simple wrapper to bring atomic_bitops.txt into the RST world
+   until such a time as that file can be converted directly.
+
+=============
+Atomic bitops
+=============
+
+.. raw:: latex
+
+    \footnotesize
+
+.. include:: ../../atomic_bitops.txt
+   :literal:
+
+.. raw:: latex
+
+    \normalsize
diff --git a/Documentation/core-api/wrappers/atomic_t.rst b/Documentation/core-api/wrappers/atomic_t.rst
new file mode 100644
index 000000000000..ed109a964c77
--- /dev/null
+++ b/Documentation/core-api/wrappers/atomic_t.rst
@@ -0,0 +1,19 @@
+.. SPDX-License-Identifier: GPL-2.0
+   This is a simple wrapper to bring atomic_t.txt into the RST world
+   until such a time as that file can be converted directly.
+
+============
+Atomic types
+============
+
+.. raw:: latex
+
+    \footnotesize
+
+.. include:: ../../atomic_t.txt
+   :literal:
+
+.. raw:: latex
+
+    \normalsize
+
diff --git a/Documentation/core-api/wrappers/memory-barriers.rst b/Documentation/core-api/wrappers/memory-barriers.rst
new file mode 100644
index 000000000000..532460b5e3eb
--- /dev/null
+++ b/Documentation/core-api/wrappers/memory-barriers.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+   This is a simple wrapper to bring memory-barriers.txt into the RST world
+   until such a time as that file can be converted directly.
+
+============================
+Linux kernel memory barriers
+============================
+
+.. raw:: latex
+
+    \footnotesize
+
+.. include:: ../../memory-barriers.txt
+   :literal:
+
+.. raw:: latex
+
+    \normalsize
diff --git a/Documentation/staging/index.rst b/Documentation/staging/index.rst
index abd0d18254d2..ded8254bc0d7 100644
--- a/Documentation/staging/index.rst
+++ b/Documentation/staging/index.rst
@@ -14,45 +14,3 @@ Unsorted Documentation
    static-keys
    tee
    xz
-
-Atomic Types
-============
-
-.. raw:: latex
-
-    \footnotesize
-
-.. include:: ../atomic_t.txt
-   :literal:
-
-.. raw:: latex
-
-    \normalsize
-
-Atomic bitops
-=============
-
-.. raw:: latex
-
-    \footnotesize
-
-.. include:: ../atomic_bitops.txt
-   :literal:
-
-.. raw:: latex
-
-    \normalsize
-
-Memory Barriers
-===============
-
-.. raw:: latex
-
-    \footnotesize
-
-.. include:: ../memory-barriers.txt
-   :literal:
-
-.. raw:: latex
-
-    \normalsize
-- 
cgit v1.2.3


From 489876063fb14290d2d1b6080c5fdc02b7a481d4 Mon Sep 17 00:00:00 2001
From: Jonathan Corbet <corbet@lwn.net>
Date: Tue, 27 Sep 2022 10:05:59 -0600
Subject: docs: add a man-pages link to the front page

Readers looking for user-oriented information may benefit from it.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
Reviewed-by: David Vernet <void@manifault.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20220927160559.97154-8-corbet@lwn.net
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/index.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/index.rst b/Documentation/index.rst
index 5a700548ae82..85eab6e990ab 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -76,6 +76,8 @@ developers seeking information on the kernel's user-space APIs.
    User-space tools <tools/index>
    userspace-api/index
 
+See also: the `Linux man pages <https://www.kernel.org/doc/man-pages/>`_,
+which are kept separately from the kernel's own documentation.
 
 Firmware-related documentation
 ------------------------------
-- 
cgit v1.2.3


From 8e8bf60a675473a034ecfcb7a842b98868ed74a3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 29 Sep 2022 15:20:16 -0300
Subject: perf build: Fixup disabling of -Wdeprecated-declarations for the
 python scripting engine

A brown paper bag where -Wno-error=deprecated-declarations was added
from compiler output when the right thing is to add
-Wno-deprecated-declarations, fix it.

Fixes: 4ee3c4da8b1b9c22 ("perf scripting python: Do not build fail on deprecation warnings")
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/util/scripting-engines/Build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/scripting-engines/Build b/tools/perf/util/scripting-engines/Build
index c92326c2233a..0f5ba28339cf 100644
--- a/tools/perf/util/scripting-engines/Build
+++ b/tools/perf/util/scripting-engines/Build
@@ -3,4 +3,4 @@ perf-$(CONFIG_LIBPYTHON) += trace-event-python.o
 
 CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-nested-externs -Wno-undef -Wno-switch-default -Wno-bad-function-cast -Wno-declaration-after-statement -Wno-switch-enum
 
-CFLAGS_trace-event-python.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-error=deprecated-declarations
+CFLAGS_trace-event-python.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-deprecated-declarations
-- 
cgit v1.2.3


From a3aded135e84a581ba567c30ecddff47c8b6cfcf Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Thu, 29 Sep 2022 12:24:27 -0400
Subject: drm/amdgpu: Enable VCN DPG for GC11_0_1

Enable VCN DPG on GC11_0_1

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Reviewed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/soc21.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c
index 2e50db3b761e..6e564b549b9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -625,6 +625,7 @@ static int soc21_common_early_init(void *handle)
 			AMD_CG_SUPPORT_JPEG_MGCG;
 		adev->pg_flags =
 			AMD_PG_SUPPORT_GFX_PG |
+			AMD_PG_SUPPORT_VCN_DPG |
 			AMD_PG_SUPPORT_JPEG;
 		adev->external_rev_id = adev->rev_id + 0x1;
 		break;
-- 
cgit v1.2.3


From 541540b9045c2665d6736a77c776ac81d4225eec Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Thu, 29 Sep 2022 12:26:45 -0400
Subject: drm/amdgpu: Enable sram on vcn_4_0_2

Enable sram on vcn_4_0_2

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Reviewed-by: James Zhu <James.Zhu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index f36e4f08db6d..0b52af415b28 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -191,7 +191,7 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)
 		fw_name = FIRMWARE_VCN4_0_2;
 		if ((adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) &&
 			(adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG))
-			adev->vcn.indirect_sram = false;
+			adev->vcn.indirect_sram = true;
 		break;
 	case IP_VERSION(4, 0, 4):
 		fw_name = FIRMWARE_VCN4_0_4;
-- 
cgit v1.2.3


From e94102e5067591aa713e623d7d1226f07d4431cf Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Fri, 16 Sep 2022 17:55:06 +0800
Subject: docs, kprobes: Fix the wrong location of Kprobes

After commit 22471e1313f2 ("kconfig: use a menu in arch/Kconfig to reduce
clutter"), the location of Kprobes is under "General architecture-dependent
options" rather than "General setup".

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Fixes: 22471e1313f2 ("kconfig: use a menu in arch/Kconfig to reduce clutter")
Link: https://lore.kernel.org/r/1663322106-12178-1-git-send-email-yangtiezhu@loongson.cn
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/trace/kprobes.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/trace/kprobes.rst b/Documentation/trace/kprobes.rst
index f318bceda1e6..48cf778a2468 100644
--- a/Documentation/trace/kprobes.rst
+++ b/Documentation/trace/kprobes.rst
@@ -328,8 +328,8 @@ Configuring Kprobes
 ===================
 
 When configuring the kernel using make menuconfig/xconfig/oldconfig,
-ensure that CONFIG_KPROBES is set to "y". Under "General setup", look
-for "Kprobes".
+ensure that CONFIG_KPROBES is set to "y", look for "Kprobes" under
+"General architecture-dependent options".
 
 So that you can load and unload Kprobes-based instrumentation modules,
 make sure "Loadable module support" (CONFIG_MODULES) and "Module
-- 
cgit v1.2.3


From 2f993509a97ed210ed0ada9198d380885265fa0b Mon Sep 17 00:00:00 2001
From: Thorsten Leemhuis <linux@leemhuis.info>
Date: Wed, 28 Sep 2022 15:39:40 +0200
Subject: docs: process/5.Posting.rst: clarify use of Reported-by: tag

Bring the description on when to use the Reported-by: tag found in
Documentation/process/5.Posting.rst more in line with the description in
Documentation/process/submitting-patches.rst: before this change the two
were contradicting each other, as the latter is way more permissive and
only states '[...] if the bug was reported in private, then ask for
permission first before using the Reported-by tag.'

Signed-off-by: Thorsten Leemhuis <linux@leemhuis.info>
Link: https://lore.kernel.org/r/2fc7162dfb76e04da5ea903c9c170d913e735dad.1664372256.git.linux@leemhuis.info
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/5.Posting.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/process/5.Posting.rst b/Documentation/process/5.Posting.rst
index 906235c11c24..d87f1fee4cbc 100644
--- a/Documentation/process/5.Posting.rst
+++ b/Documentation/process/5.Posting.rst
@@ -256,8 +256,10 @@ The tags in common use are:
  - Cc: the named person received a copy of the patch and had the
    opportunity to comment on it.
 
-Be careful in the addition of tags to your patches: only Cc: is appropriate
-for addition without the explicit permission of the person named.
+Be careful in the addition of tags to your patches, as only Cc: is appropriate
+for addition without the explicit permission of the person named; using
+Reported-by: is fine most of the time as well, but ask for permission if
+the bug was reported in private.
 
 
 Sending the patch
-- 
cgit v1.2.3


From 679b4bc25fc7b5b742622eb760211f2b5b1dc4db Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Fri, 16 Sep 2022 19:08:05 +0900
Subject: docs/doc-guide: Add documentation on SPHINX_IMGMATH

Now that building html docs with math expressions does not need texlive
packages, remove the note on the requirement in the "Sphinx Install"
section.

Instead, add sections of "Math Expressions in HTML" and "Choice of Math
Renderer".
Describe the effect of setting SPHINX_IMGMATH in the latter section.

Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/a67e3279-6bc7-ee2c-2b49-9275252460b0@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/doc-guide/sphinx.rst | 57 +++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst
index 1228b85f6f77..c708cec889af 100644
--- a/Documentation/doc-guide/sphinx.rst
+++ b/Documentation/doc-guide/sphinx.rst
@@ -48,10 +48,6 @@ or ``virtualenv``, depending on how your distribution packaged Python 3.
       on the Sphinx version, it should be installed separately,
       with ``pip install sphinx_rtd_theme``.
 
-   #) Some ReST pages contain math expressions. Due to the way Sphinx works,
-      those expressions are written using LaTeX notation. It needs texlive
-      installed with amsfonts and amsmath in order to evaluate them.
-
 In summary, if you want to install Sphinx version 2.4.4, you should do::
 
        $ virtualenv sphinx_2.4.4
@@ -86,6 +82,27 @@ Depending on the distribution, you may also need to install a series of
 ``texlive`` packages that provide the minimal set of functionalities
 required for ``XeLaTeX`` to work.
 
+Math Expressions in HTML
+------------------------
+
+Some ReST pages contain math expressions. Due to the way Sphinx works,
+those expressions are written using LaTeX notation.
+There are two options for Sphinx to render math expressions in html output.
+One is an extension called `imgmath`_ which converts math expressions into
+images and embeds them in html pages.
+The other is an extension called `mathjax`_ which delegates math rendering
+to JavaScript capable web browsers.
+The former was the only option for pre-6.1 kernel documentation and it
+requires quite a few texlive packages including amsfonts and amsmath among
+others.
+
+Since kernel release 6.1, html pages with math expressions can be built
+without installing any texlive packages. See `Choice of Math Renderer`_ for
+further info.
+
+.. _imgmath: https://www.sphinx-doc.org/en/master/usage/extensions/math.html#module-sphinx.ext.imgmath
+.. _mathjax: https://www.sphinx-doc.org/en/master/usage/extensions/math.html#module-sphinx.ext.mathjax
+
 .. _sphinx-pre-install:
 
 Checking for Sphinx dependencies
@@ -164,6 +181,38 @@ To remove the generated documentation, run ``make cleandocs``.
 	  as well would improve the quality of images embedded in PDF
 	  documents, especially for kernel releases 5.18 and later.
 
+Choice of Math Renderer
+-----------------------
+
+Since kernel release 6.1, mathjax works as a fallback math renderer for
+html output.\ [#sph1_8]_
+
+Math renderer is chosen depending on available commands as shown below:
+
+.. table:: Math Renderer Choices for HTML
+
+    ============= ================= ============
+    Math renderer Required commands Image format
+    ============= ================= ============
+    imgmath       latex, dvipng     PNG (raster)
+    mathjax
+    ============= ================= ============
+
+The choice can be overridden by setting an environment variable
+``SPHINX_IMGMATH`` as shown below:
+
+.. table:: Effect of Setting ``SPHINX_IMGMATH``
+
+    ====================== ========
+    Setting                Renderer
+    ====================== ========
+    ``SPHINX_IMGMATH=yes`` imgmath
+    ``SPHINX_IMGMATH=no``  mathjax
+    ====================== ========
+
+.. [#sph1_8] Fallback of math renderer requires Sphinx >=1.8.
+
+
 Writing Documentation
 =====================
 
-- 
cgit v1.2.3


From 26e54448092917edd54c4255382f78b24e07b01f Mon Sep 17 00:00:00 2001
From: Kristen Carlson Accardi <kristen@linux.intel.com>
Date: Mon, 26 Sep 2022 14:11:49 -0700
Subject: Documentation/CoC: Reflect current CoC interpretation and practices

The Code of Conduct interpretation does not reflect the current
practices of the CoC committee or the TAB. Update the documentation
to remove references to initial committees and boot strap periods
since it is past that time, and note that the this document
does serve as the documentation for the CoC committee processes.

Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Reviewed-by: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220926211149.2278214-1-kristen@linux.intel.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 .../process/code-of-conduct-interpretation.rst     | 24 ++++++++++++----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/Documentation/process/code-of-conduct-interpretation.rst b/Documentation/process/code-of-conduct-interpretation.rst
index 4f8a06b00f60..922e0b547bc3 100644
--- a/Documentation/process/code-of-conduct-interpretation.rst
+++ b/Documentation/process/code-of-conduct-interpretation.rst
@@ -127,10 +127,12 @@ are listed at https://kernel.org/code-of-conduct.html.  Members can not
 access reports made before they joined or after they have left the
 committee.
 
-The initial Code of Conduct Committee consists of volunteer members of
-the TAB, as well as a professional mediator acting as a neutral third
-party.  The first task of the committee is to establish documented
-processes, which will be made public.
+The Code of Conduct Committee consists of volunteer community members
+appointed by the TAB, as well as a professional mediator acting as a
+neutral third party.  The processes the Code of Conduct committee will
+use to address reports is varied and will depend on the individual
+circumstance, however, this file serves as documentation for the
+general process used.
 
 Any member of the committee, including the mediator, can be contacted
 directly if a reporter does not wish to include the full committee in a
@@ -141,16 +143,16 @@ processes (see above) and consults with the TAB as needed and
 appropriate, for instance to request and receive information about the
 kernel community.
 
-Any decisions by the committee will be brought to the TAB, for
-implementation of enforcement with the relevant maintainers if needed.
-A decision by the Code of Conduct Committee can be overturned by the TAB
-by a two-thirds vote.
+Any decisions regarding enforcement recommendations will be brought to
+the TAB for implementation of enforcement with the relevant maintainers
+if needed.  A decision by the Code of Conduct Committee can be overturned
+by the TAB by a two-thirds vote.
 
 At quarterly intervals, the Code of Conduct Committee and TAB will
 provide a report summarizing the anonymised reports that the Code of
 Conduct committee has received and their status, as well details of any
 overridden decisions including complete and identifiable voting details.
 
-We expect to establish a different process for Code of Conduct Committee
-staffing beyond the bootstrap period.  This document will be updated
-with that information when this occurs.
+Because how we interpret and enforce the Code of Conduct will evolve over
+time, this document will be updated when necessary to reflect any
+changes.
-- 
cgit v1.2.3


From 9a7d7a80e11025e5e8b4dedf75c97096b67a7b9b Mon Sep 17 00:00:00 2001
From: Vernon Yang <vernon2gm@gmail.com>
Date: Mon, 26 Sep 2022 23:20:32 +0800
Subject: Documentation/mm: modify page_referenced to folio_referenced

Since commit b3ac04132c4b ("mm/rmap: Turn page_referenced() into
folio_referenced()") the page_referenced function name was modified,
so fix it up to use the correct one.

Signed-off-by: Vernon Yang <vernon2gm@gmail.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Link: https://lore.kernel.org/r/20220926152032.74621-1-vernon2gm@gmail.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/mm/unevictable-lru.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst
index b280367d6a44..4a0e158aa9ce 100644
--- a/Documentation/mm/unevictable-lru.rst
+++ b/Documentation/mm/unevictable-lru.rst
@@ -197,7 +197,7 @@ unevictable list for the memory cgroup and node being scanned.
 There may be situations where a page is mapped into a VM_LOCKED VMA, but the
 page is not marked as PG_mlocked.  Such pages will make it all the way to
 shrink_active_list() or shrink_page_list() where they will be detected when
-vmscan walks the reverse map in page_referenced() or try_to_unmap().  The page
+vmscan walks the reverse map in folio_referenced() or try_to_unmap().  The page
 is culled to the unevictable list when it is released by the shrinker.
 
 To "cull" an unevictable page, vmscan simply puts the page back on the LRU list
@@ -267,7 +267,7 @@ the LRU.  Such pages can be "noticed" by memory management in several places:
  (4) in the fault path and when a VM_LOCKED stack segment is expanded; or
 
  (5) as mentioned above, in vmscan:shrink_page_list() when attempting to
-     reclaim a page in a VM_LOCKED VMA by page_referenced() or try_to_unmap().
+     reclaim a page in a VM_LOCKED VMA by folio_referenced() or try_to_unmap().
 
 mlocked pages become unlocked and rescued from the unevictable list when:
 
@@ -547,7 +547,7 @@ vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously
 unevictable pages found on the inactive lists to the appropriate memory cgroup
 and node unevictable list.
 
-rmap's page_referenced_one(), called via vmscan's shrink_active_list() or
+rmap's folio_referenced_one(), called via vmscan's shrink_active_list() or
 shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(),
 check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_page()
 to correct them.  Such pages are culled to the unevictable list when released
-- 
cgit v1.2.3


From 06699e6915536b88ec9457f105a5facf5bc81f88 Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Tue, 27 Sep 2022 16:38:19 +0800
Subject: Documentation: devres: update IRQ helper

devm_irq_sim_init() has been changed to devm_irq_domain_create_sim()
in commit 337cbeb2c13e ("genirq/irq_sim: Simplify the API").

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Link: https://lore.kernel.org/r/20220927083819.12484-1-yangyingliang@huawei.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/driver-api/driver-model/devres.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 6f190967ba2e..79c80ecf6a42 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -334,7 +334,7 @@ IRQ
   devm_irq_alloc_descs_from()
   devm_irq_alloc_generic_chip()
   devm_irq_setup_generic_chip()
-  devm_irq_sim_init()
+  devm_irq_domain_create_sim()
 
 LED
   devm_led_classdev_register()
-- 
cgit v1.2.3


From 657ed9c9bca059660238771dd1fcecb57b59f90a Mon Sep 17 00:00:00 2001
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Tue, 27 Sep 2022 16:02:15 +0800
Subject: Documentation: devres: add missing IO helper

Add missing devm_request_free_mem_region() to devres.rst.
It's introduced by commit 0092908d16c6 ("mm: factor out a
devm_request_free_mem_region helper").

Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Link: https://lore.kernel.org/r/20220927080215.1359979-1-yangyingliang@huawei.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/driver-api/driver-model/devres.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 79c80ecf6a42..dc1b2353cea3 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -301,6 +301,7 @@ IO region
   devm_release_region()
   devm_release_resource()
   devm_request_mem_region()
+  devm_request_free_mem_region()
   devm_request_region()
   devm_request_resource()
 
-- 
cgit v1.2.3


From 1cfd9d7e43d5a1cf739d1420b10b1e65feb02f88 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 23 Sep 2022 13:34:24 +0200
Subject: coding-style.rst: document BUG() and WARN() rules ("do not crash the
 kernel")

Linus notes [1] that the introduction of new code that uses VM_BUG_ON()
is just as bad as BUG_ON(), because it will crash the kernel on
distributions that enable CONFIG_DEBUG_VM (like Fedora):

    VM_BUG_ON() has the exact same semantics as BUG_ON. It is literally
    no different, the only difference is "we can make the code smaller
    because these are less important". [2]

This resulted in a more generic discussion about usage of BUG() and
friends. While there might be corner cases that still deserve a BUG_ON(),
most BUG_ON() cases should simply use WARN_ON_ONCE() and implement a
recovery path if reasonable:

    The only possible case where BUG_ON can validly be used is "I have
    some fundamental data corruption and cannot possibly return an
    error". [2]

As a very good approximation is the general rule:

    "absolutely no new BUG_ON() calls _ever_" [2]

... not even if something really shouldn't ever happen and is merely for
documenting that an invariant always has to hold. However, there are sill
exceptions where BUG_ON() may be used:

    If you have a "this is major internal corruption, there's no way we can
    continue", then BUG_ON() is appropriate. [3]

There is only one good BUG_ON():

    Now, that said, there is one very valid sub-form of BUG_ON():
    BUILD_BUG_ON() is absolutely 100% fine. [2]

While WARN will also crash the machine with panic_on_warn set, that's
exactly to be expected:

    So we have two very different cases: the "virtual machine with good
    logging where a dead machine is fine" - use 'panic_on_warn'. And
    the actual real hardware with real drivers, running real loads by
    users. [4]

The basic idea is that warnings will similarly get reported by users
and be found during testing. However, in contrast to a BUG(), there is a
way to actually influence the expected behavior (e.g., panic_on_warn)
and to eventually keep the machine alive to extract some debug info.

Ingo notes that not all WARN_ON_ONCE cases need recovery. If we don't ever
expect this code to trigger in any case, recovery code is not really
helpful.

    I'd prefer to keep all these warnings 'simple' - i.e. no attempted
    recovery & control flow, unless we ever expect these to trigger.
    [5]

There have been different rules floating around that were never properly
documented. Let's try to clarify.

[1] https://lkml.kernel.org/r/CAHk-=wiEAH+ojSpAgx_Ep=NKPWHU8AdO3V56BXcCsU97oYJ1EA@mail.gmail.com
[2] https://lore.kernel.org/r/CAHk-=wg40EAZofO16Eviaj7mfqDhZ2gVEbvfsMf6gYzspRjYvw@mail.gmail.com
[3] https://lkml.kernel.org/r/CAHk-=wit-DmhMfQErY29JSPjFgebx_Ld+pnerc4J2Ag990WwAA@mail.gmail.com
[4] https://lore.kernel.org/r/CAHk-=wgF7K2gSSpy=m_=K3Nov4zaceUX9puQf1TjkTJLA2XC_g@mail.gmail.com
[5] https://lore.kernel.org/r/YwIW+mVeZoTOxn%2F4@gmail.com

Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20220923113426.52871-2-david@redhat.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 Documentation/process/coding-style.rst | 62 ++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst
index 03eb53fd029a..007e49ef6cec 100644
--- a/Documentation/process/coding-style.rst
+++ b/Documentation/process/coding-style.rst
@@ -1186,6 +1186,68 @@ expression used.  For instance:
 	#endif /* CONFIG_SOMETHING */
 
 
+22) Do not crash the kernel
+---------------------------
+
+In general, the decision to crash the kernel belongs to the user, rather
+than to the kernel developer.
+
+Avoid panic()
+*************
+
+panic() should be used with care and primarily only during system boot.
+panic() is, for example, acceptable when running out of memory during boot and
+not being able to continue.
+
+Use WARN() rather than BUG()
+****************************
+
+Do not add new code that uses any of the BUG() variants, such as BUG(),
+BUG_ON(), or VM_BUG_ON(). Instead, use a WARN*() variant, preferably
+WARN_ON_ONCE(), and possibly with recovery code. Recovery code is not
+required if there is no reasonable way to at least partially recover.
+
+"I'm too lazy to do error handling" is not an excuse for using BUG(). Major
+internal corruptions with no way of continuing may still use BUG(), but need
+good justification.
+
+Use WARN_ON_ONCE() rather than WARN() or WARN_ON()
+**************************************************
+
+WARN_ON_ONCE() is generally preferred over WARN() or WARN_ON(), because it
+is common for a given warning condition, if it occurs at all, to occur
+multiple times. This can fill up and wrap the kernel log, and can even slow
+the system enough that the excessive logging turns into its own, additional
+problem.
+
+Do not WARN lightly
+*******************
+
+WARN*() is intended for unexpected, this-should-never-happen situations.
+WARN*() macros are not to be used for anything that is expected to happen
+during normal operation. These are not pre- or post-condition asserts, for
+example. Again: WARN*() must not be used for a condition that is expected
+to trigger easily, for example, by user space actions. pr_warn_once() is a
+possible alternative, if you need to notify the user of a problem.
+
+Do not worry about panic_on_warn users
+**************************************
+
+A few more words about panic_on_warn: Remember that ``panic_on_warn`` is an
+available kernel option, and that many users set this option. This is why
+there is a "Do not WARN lightly" writeup, above. However, the existence of
+panic_on_warn users is not a valid reason to avoid the judicious use
+WARN*(). That is because, whoever enables panic_on_warn has explicitly
+asked the kernel to crash if a WARN*() fires, and such users must be
+prepared to deal with the consequences of a system that is somewhat more
+likely to crash.
+
+Use BUILD_BUG_ON() for compile-time assertions
+**********************************************
+
+The use of BUILD_BUG_ON() is acceptable and encouraged, because it is a
+compile-time assertion that has no effect at runtime.
+
 Appendix I) References
 ----------------------
 
-- 
cgit v1.2.3


From 69d517e6e21099f81efbd39e47874649ae575804 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Fri, 23 Sep 2022 13:34:25 +0200
Subject: checkpatch: warn on usage of VM_BUG_ON() and other BUG variants

checkpatch does not point out that VM_BUG_ON() and friends should be
avoided, however, Linus notes:

    VM_BUG_ON() has the exact same semantics as BUG_ON. It is literally
    no different, the only difference is "we can make the code smaller
    because these are less important". [1]

So let's warn on VM_BUG_ON() and other BUG variants as well. While at it,
make it clearer that the kernel really shouldn't be crashed.

As there are some subsystem BUG macros that actually don't end up crashing
the kernel -- for example, KVM_BUG_ON() -- exclude these manually.

[1] https://lore.kernel.org/r/CAHk-=wg40EAZofO16Eviaj7mfqDhZ2gVEbvfsMf6gYzspRjYvw@mail.gmail.com

Signed-off-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/20220923113426.52871-3-david@redhat.com
Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 scripts/checkpatch.pl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 812af52f97d2..4aa09e0cb86a 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -4695,12 +4695,12 @@ sub process {
 			}
 		}
 
-# avoid BUG() or BUG_ON()
-		if ($line =~ /\b(?:BUG|BUG_ON)\b/) {
+# do not use BUG() or variants
+		if ($line =~ /\b(?!AA_|BUILD_|DCCP_|IDA_|KVM_|RWLOCK_|snd_|SPIN_)(?:[a-zA-Z_]*_)?BUG(?:_ON)?(?:_[A-Z_]+)?\s*\(/) {
 			my $msg_level = \&WARN;
 			$msg_level = \&CHK if ($file);
 			&{$msg_level}("AVOID_BUG",
-				      "Avoid crashing the kernel - try using WARN_ON & recovery code rather than BUG() or BUG_ON()\n" . $herecurr);
+				      "Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants\n" . $herecurr);
 		}
 
 # avoid LINUX_VERSION_CODE
-- 
cgit v1.2.3


From 35fcbc4243aad7e7d020b7c1dfb14bb888b20a4f Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 29 Sep 2022 13:27:13 -0700
Subject: Bluetooth: L2CAP: Fix user-after-free

This uses l2cap_chan_hold_unless_zero() after calling
__l2cap_get_chan_blah() to prevent the following trace:

Bluetooth: l2cap_core.c:static void l2cap_chan_destroy(struct kref
*kref)
Bluetooth: chan 0000000023c4974d
Bluetooth: parent 00000000ae861c08
==================================================================
BUG: KASAN: use-after-free in __mutex_waiter_is_first
kernel/locking/mutex.c:191 [inline]
BUG: KASAN: use-after-free in __mutex_lock_common
kernel/locking/mutex.c:671 [inline]
BUG: KASAN: use-after-free in __mutex_lock+0x278/0x400
kernel/locking/mutex.c:729
Read of size 8 at addr ffff888006a49b08 by task kworker/u3:2/389

Link: https://lore.kernel.org/lkml/20220622082716.478486-1-lee.jones@linaro.org
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
---
 net/bluetooth/l2cap_core.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 770891f68703..1f34b82ca0ec 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -4309,6 +4309,12 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
 		}
 	}
 
+	chan = l2cap_chan_hold_unless_zero(chan);
+	if (!chan) {
+		err = -EBADSLT;
+		goto unlock;
+	}
+
 	err = 0;
 
 	l2cap_chan_lock(chan);
@@ -4338,6 +4344,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
 	}
 
 	l2cap_chan_unlock(chan);
+	l2cap_chan_put(chan);
 
 unlock:
 	mutex_unlock(&conn->chan_lock);
-- 
cgit v1.2.3


From 9f4beead610c83065cc0410bfe97ff51d8e9578d Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 29 Sep 2022 22:39:03 +0200
Subject: binfmt: remove taso from linux_binprm struct

With commit 987f20a9dcce ("a.out: Remove the a.out implementation"), the
use of the special taso flag for alpha architectures in the linux_binprm
struct is gone.

Remove the definition of taso in the linux_binprm struct.

No functional change.

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220929203903.9475-1-lukas.bulwahn@gmail.com
---
 include/linux/binfmts.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 3dc20c4f394c..8d51f69f9f5e 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -43,9 +43,6 @@ struct linux_binprm {
 		 * original userspace.
 		 */
 		point_of_no_return:1;
-#ifdef __alpha__
-	unsigned int taso:1;
-#endif
 	struct file *executable; /* Executable to pass to the interpreter */
 	struct file *interpreter;
 	struct file *file;
-- 
cgit v1.2.3


From 0e3f72931fc47bb81686020cc643cde5d9cd0bb8 Mon Sep 17 00:00:00 2001
From: Junichi Uekawa <uekawa@chromium.org>
Date: Wed, 28 Sep 2022 15:45:38 +0900
Subject: vhost/vsock: Use kvmalloc/kvfree for larger packets.

When copying a large file over sftp over vsock, data size is usually 32kB,
and kmalloc seems to fail to try to allocate 32 32kB regions.

 vhost-5837: page allocation failure: order:4, mode:0x24040c0
 Call Trace:
  [<ffffffffb6a0df64>] dump_stack+0x97/0xdb
  [<ffffffffb68d6aed>] warn_alloc_failed+0x10f/0x138
  [<ffffffffb68d868a>] ? __alloc_pages_direct_compact+0x38/0xc8
  [<ffffffffb664619f>] __alloc_pages_nodemask+0x84c/0x90d
  [<ffffffffb6646e56>] alloc_kmem_pages+0x17/0x19
  [<ffffffffb6653a26>] kmalloc_order_trace+0x2b/0xdb
  [<ffffffffb66682f3>] __kmalloc+0x177/0x1f7
  [<ffffffffb66e0d94>] ? copy_from_iter+0x8d/0x31d
  [<ffffffffc0689ab7>] vhost_vsock_handle_tx_kick+0x1fa/0x301 [vhost_vsock]
  [<ffffffffc06828d9>] vhost_worker+0xf7/0x157 [vhost]
  [<ffffffffb683ddce>] kthread+0xfd/0x105
  [<ffffffffc06827e2>] ? vhost_dev_set_owner+0x22e/0x22e [vhost]
  [<ffffffffb683dcd1>] ? flush_kthread_worker+0xf3/0xf3
  [<ffffffffb6eb332e>] ret_from_fork+0x4e/0x80
  [<ffffffffb683dcd1>] ? flush_kthread_worker+0xf3/0xf3

Work around by doing kvmalloc instead.

Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko")
Signed-off-by: Junichi Uekawa <uekawa@chromium.org>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Link: https://lore.kernel.org/r/20220928064538.667678-1-uekawa@chromium.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/vhost/vsock.c                   | 2 +-
 net/vmw_vsock/virtio_transport_common.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 368330417bde..5703775af129 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -393,7 +393,7 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
 		return NULL;
 	}
 
-	pkt->buf = kmalloc(pkt->len, GFP_KERNEL);
+	pkt->buf = kvmalloc(pkt->len, GFP_KERNEL);
 	if (!pkt->buf) {
 		kfree(pkt);
 		return NULL;
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index ec2c2afbf0d0..3a12aee33e92 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -1342,7 +1342,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
 
 void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
 {
-	kfree(pkt->buf);
+	kvfree(pkt->buf);
 	kfree(pkt);
 }
 EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
-- 
cgit v1.2.3


From 17006e86a7641fa3c50324cfb602f0e74dac8527 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 30 Aug 2022 13:58:42 -0700
Subject: sparc: Unbreak the build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the following build errors:

arch/sparc/mm/srmmu.c: In function ‘smp_flush_page_for_dma’:
arch/sparc/mm/srmmu.c:1639:13: error: cast between incompatible function types from ‘void (*)(long unsigned int)’ to ‘void (*)(long unsigned int,  long unsigned int,  long unsigned int,  long unsigned int,  long unsigned int)’ [-Werror=cast-function-type]
 1639 |         xc1((smpfunc_t) local_ops->page_for_dma, page);
      |             ^
arch/sparc/mm/srmmu.c: In function ‘smp_flush_cache_mm’:
arch/sparc/mm/srmmu.c:1662:29: error: cast between incompatible function types from ‘void (*)(struct mm_struct *)’ to ‘void (*)(long unsigned int,  long unsigned int,  long unsigned int,  long unsigned int,  long unsigned int)’ [-Werror=cast-function-type]
 1662 |                         xc1((smpfunc_t) local_ops->cache_mm, (unsigned long) mm);
      |
[ ... ]

Compile-tested only.

Fixes: 552a23a0e5d0 ("Makefile: Enable -Wcast-function-type")
Cc: stable@vger.kernel.org
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Andreas Larsson <andreas@gaisler.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220830205854.1918026-1-bvanassche@acm.org
---
 arch/sparc/include/asm/smp_32.h | 15 ++++++---------
 arch/sparc/kernel/leon_smp.c    | 12 +++++++-----
 arch/sparc/kernel/sun4d_smp.c   | 12 +++++++-----
 arch/sparc/kernel/sun4m_smp.c   | 10 ++++++----
 arch/sparc/mm/srmmu.c           | 29 +++++++++++++----------------
 5 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/arch/sparc/include/asm/smp_32.h b/arch/sparc/include/asm/smp_32.h
index 856081761b0f..2cf7971d7f6c 100644
--- a/arch/sparc/include/asm/smp_32.h
+++ b/arch/sparc/include/asm/smp_32.h
@@ -33,9 +33,6 @@ extern volatile unsigned long cpu_callin_map[NR_CPUS];
 extern cpumask_t smp_commenced_mask;
 extern struct linux_prom_registers smp_penguin_ctable;
 
-typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long,
-		       unsigned long, unsigned long);
-
 void cpu_panic(void);
 
 /*
@@ -57,7 +54,7 @@ void smp_bogo(struct seq_file *);
 void smp_info(struct seq_file *);
 
 struct sparc32_ipi_ops {
-	void (*cross_call)(smpfunc_t func, cpumask_t mask, unsigned long arg1,
+	void (*cross_call)(void *func, cpumask_t mask, unsigned long arg1,
 			   unsigned long arg2, unsigned long arg3,
 			   unsigned long arg4);
 	void (*resched)(int cpu);
@@ -66,28 +63,28 @@ struct sparc32_ipi_ops {
 };
 extern const struct sparc32_ipi_ops *sparc32_ipi_ops;
 
-static inline void xc0(smpfunc_t func)
+static inline void xc0(void *func)
 {
 	sparc32_ipi_ops->cross_call(func, *cpu_online_mask, 0, 0, 0, 0);
 }
 
-static inline void xc1(smpfunc_t func, unsigned long arg1)
+static inline void xc1(void *func, unsigned long arg1)
 {
 	sparc32_ipi_ops->cross_call(func, *cpu_online_mask, arg1, 0, 0, 0);
 }
-static inline void xc2(smpfunc_t func, unsigned long arg1, unsigned long arg2)
+static inline void xc2(void *func, unsigned long arg1, unsigned long arg2)
 {
 	sparc32_ipi_ops->cross_call(func, *cpu_online_mask, arg1, arg2, 0, 0);
 }
 
-static inline void xc3(smpfunc_t func, unsigned long arg1, unsigned long arg2,
+static inline void xc3(void *func, unsigned long arg1, unsigned long arg2,
 		       unsigned long arg3)
 {
 	sparc32_ipi_ops->cross_call(func, *cpu_online_mask,
 				    arg1, arg2, arg3, 0);
 }
 
-static inline void xc4(smpfunc_t func, unsigned long arg1, unsigned long arg2,
+static inline void xc4(void *func, unsigned long arg1, unsigned long arg2,
 		       unsigned long arg3, unsigned long arg4)
 {
 	sparc32_ipi_ops->cross_call(func, *cpu_online_mask,
diff --git a/arch/sparc/kernel/leon_smp.c b/arch/sparc/kernel/leon_smp.c
index 1eed26d423fb..991e9ad3d3e8 100644
--- a/arch/sparc/kernel/leon_smp.c
+++ b/arch/sparc/kernel/leon_smp.c
@@ -359,7 +359,7 @@ void leonsmp_ipi_interrupt(void)
 }
 
 static struct smp_funcall {
-	smpfunc_t func;
+	void *func;
 	unsigned long arg1;
 	unsigned long arg2;
 	unsigned long arg3;
@@ -372,7 +372,7 @@ static struct smp_funcall {
 static DEFINE_SPINLOCK(cross_call_lock);
 
 /* Cross calls must be serialized, at least currently. */
-static void leon_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
+static void leon_cross_call(void *func, cpumask_t mask, unsigned long arg1,
 			    unsigned long arg2, unsigned long arg3,
 			    unsigned long arg4)
 {
@@ -384,7 +384,7 @@ static void leon_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
 
 		{
 			/* If you make changes here, make sure gcc generates proper code... */
-			register smpfunc_t f asm("i0") = func;
+			register void *f asm("i0") = func;
 			register unsigned long a1 asm("i1") = arg1;
 			register unsigned long a2 asm("i2") = arg2;
 			register unsigned long a3 asm("i3") = arg3;
@@ -444,11 +444,13 @@ static void leon_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
 /* Running cross calls. */
 void leon_cross_call_irq(void)
 {
+	void (*func)(unsigned long, unsigned long, unsigned long, unsigned long,
+		     unsigned long) = ccall_info.func;
 	int i = smp_processor_id();
 
 	ccall_info.processors_in[i] = 1;
-	ccall_info.func(ccall_info.arg1, ccall_info.arg2, ccall_info.arg3,
-			ccall_info.arg4, ccall_info.arg5);
+	func(ccall_info.arg1, ccall_info.arg2, ccall_info.arg3, ccall_info.arg4,
+	     ccall_info.arg5);
 	ccall_info.processors_out[i] = 1;
 }
 
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index ff30f03beb7c..9a62a5cf3337 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -268,7 +268,7 @@ static void sun4d_ipi_resched(int cpu)
 }
 
 static struct smp_funcall {
-	smpfunc_t func;
+	void *func;
 	unsigned long arg1;
 	unsigned long arg2;
 	unsigned long arg3;
@@ -281,7 +281,7 @@ static struct smp_funcall {
 static DEFINE_SPINLOCK(cross_call_lock);
 
 /* Cross calls must be serialized, at least currently. */
-static void sun4d_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
+static void sun4d_cross_call(void *func, cpumask_t mask, unsigned long arg1,
 			     unsigned long arg2, unsigned long arg3,
 			     unsigned long arg4)
 {
@@ -296,7 +296,7 @@ static void sun4d_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
 			 * If you make changes here, make sure
 			 * gcc generates proper code...
 			 */
-			register smpfunc_t f asm("i0") = func;
+			register void *f asm("i0") = func;
 			register unsigned long a1 asm("i1") = arg1;
 			register unsigned long a2 asm("i2") = arg2;
 			register unsigned long a3 asm("i3") = arg3;
@@ -353,11 +353,13 @@ static void sun4d_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
 /* Running cross calls. */
 void smp4d_cross_call_irq(void)
 {
+	void (*func)(unsigned long, unsigned long, unsigned long, unsigned long,
+		     unsigned long) = ccall_info.func;
 	int i = hard_smp_processor_id();
 
 	ccall_info.processors_in[i] = 1;
-	ccall_info.func(ccall_info.arg1, ccall_info.arg2, ccall_info.arg3,
-			ccall_info.arg4, ccall_info.arg5);
+	func(ccall_info.arg1, ccall_info.arg2, ccall_info.arg3, ccall_info.arg4,
+	     ccall_info.arg5);
 	ccall_info.processors_out[i] = 1;
 }
 
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index 228a6527082d..056df034e79e 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -157,7 +157,7 @@ static void sun4m_ipi_mask_one(int cpu)
 }
 
 static struct smp_funcall {
-	smpfunc_t func;
+	void *func;
 	unsigned long arg1;
 	unsigned long arg2;
 	unsigned long arg3;
@@ -170,7 +170,7 @@ static struct smp_funcall {
 static DEFINE_SPINLOCK(cross_call_lock);
 
 /* Cross calls must be serialized, at least currently. */
-static void sun4m_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
+static void sun4m_cross_call(void *func, cpumask_t mask, unsigned long arg1,
 			     unsigned long arg2, unsigned long arg3,
 			     unsigned long arg4)
 {
@@ -230,11 +230,13 @@ static void sun4m_cross_call(smpfunc_t func, cpumask_t mask, unsigned long arg1,
 /* Running cross calls. */
 void smp4m_cross_call_irq(void)
 {
+	void (*func)(unsigned long, unsigned long, unsigned long, unsigned long,
+		     unsigned long) = ccall_info.func;
 	int i = smp_processor_id();
 
 	ccall_info.processors_in[i] = 1;
-	ccall_info.func(ccall_info.arg1, ccall_info.arg2, ccall_info.arg3,
-			ccall_info.arg4, ccall_info.arg5);
+	func(ccall_info.arg1, ccall_info.arg2, ccall_info.arg3, ccall_info.arg4,
+	     ccall_info.arg5);
 	ccall_info.processors_out[i] = 1;
 }
 
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index a9aa6a92c7fe..13f027afc875 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -1636,19 +1636,19 @@ static void __init get_srmmu_type(void)
 /* Local cross-calls. */
 static void smp_flush_page_for_dma(unsigned long page)
 {
-	xc1((smpfunc_t) local_ops->page_for_dma, page);
+	xc1(local_ops->page_for_dma, page);
 	local_ops->page_for_dma(page);
 }
 
 static void smp_flush_cache_all(void)
 {
-	xc0((smpfunc_t) local_ops->cache_all);
+	xc0(local_ops->cache_all);
 	local_ops->cache_all();
 }
 
 static void smp_flush_tlb_all(void)
 {
-	xc0((smpfunc_t) local_ops->tlb_all);
+	xc0(local_ops->tlb_all);
 	local_ops->tlb_all();
 }
 
@@ -1659,7 +1659,7 @@ static void smp_flush_cache_mm(struct mm_struct *mm)
 		cpumask_copy(&cpu_mask, mm_cpumask(mm));
 		cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 		if (!cpumask_empty(&cpu_mask))
-			xc1((smpfunc_t) local_ops->cache_mm, (unsigned long) mm);
+			xc1(local_ops->cache_mm, (unsigned long)mm);
 		local_ops->cache_mm(mm);
 	}
 }
@@ -1671,7 +1671,7 @@ static void smp_flush_tlb_mm(struct mm_struct *mm)
 		cpumask_copy(&cpu_mask, mm_cpumask(mm));
 		cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 		if (!cpumask_empty(&cpu_mask)) {
-			xc1((smpfunc_t) local_ops->tlb_mm, (unsigned long) mm);
+			xc1(local_ops->tlb_mm, (unsigned long)mm);
 			if (atomic_read(&mm->mm_users) == 1 && current->active_mm == mm)
 				cpumask_copy(mm_cpumask(mm),
 					     cpumask_of(smp_processor_id()));
@@ -1691,8 +1691,8 @@ static void smp_flush_cache_range(struct vm_area_struct *vma,
 		cpumask_copy(&cpu_mask, mm_cpumask(mm));
 		cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 		if (!cpumask_empty(&cpu_mask))
-			xc3((smpfunc_t) local_ops->cache_range,
-			    (unsigned long) vma, start, end);
+			xc3(local_ops->cache_range, (unsigned long)vma, start,
+			    end);
 		local_ops->cache_range(vma, start, end);
 	}
 }
@@ -1708,8 +1708,8 @@ static void smp_flush_tlb_range(struct vm_area_struct *vma,
 		cpumask_copy(&cpu_mask, mm_cpumask(mm));
 		cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 		if (!cpumask_empty(&cpu_mask))
-			xc3((smpfunc_t) local_ops->tlb_range,
-			    (unsigned long) vma, start, end);
+			xc3(local_ops->tlb_range, (unsigned long)vma, start,
+			    end);
 		local_ops->tlb_range(vma, start, end);
 	}
 }
@@ -1723,8 +1723,7 @@ static void smp_flush_cache_page(struct vm_area_struct *vma, unsigned long page)
 		cpumask_copy(&cpu_mask, mm_cpumask(mm));
 		cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 		if (!cpumask_empty(&cpu_mask))
-			xc2((smpfunc_t) local_ops->cache_page,
-			    (unsigned long) vma, page);
+			xc2(local_ops->cache_page, (unsigned long)vma, page);
 		local_ops->cache_page(vma, page);
 	}
 }
@@ -1738,8 +1737,7 @@ static void smp_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 		cpumask_copy(&cpu_mask, mm_cpumask(mm));
 		cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 		if (!cpumask_empty(&cpu_mask))
-			xc2((smpfunc_t) local_ops->tlb_page,
-			    (unsigned long) vma, page);
+			xc2(local_ops->tlb_page, (unsigned long)vma, page);
 		local_ops->tlb_page(vma, page);
 	}
 }
@@ -1753,7 +1751,7 @@ static void smp_flush_page_to_ram(unsigned long page)
 	 * XXX This experiment failed, research further... -DaveM
 	 */
 #if 1
-	xc1((smpfunc_t) local_ops->page_to_ram, page);
+	xc1(local_ops->page_to_ram, page);
 #endif
 	local_ops->page_to_ram(page);
 }
@@ -1764,8 +1762,7 @@ static void smp_flush_sig_insns(struct mm_struct *mm, unsigned long insn_addr)
 	cpumask_copy(&cpu_mask, mm_cpumask(mm));
 	cpumask_clear_cpu(smp_processor_id(), &cpu_mask);
 	if (!cpumask_empty(&cpu_mask))
-		xc2((smpfunc_t) local_ops->sig_insns,
-		    (unsigned long) mm, insn_addr);
+		xc2(local_ops->sig_insns, (unsigned long)mm, insn_addr);
 	local_ops->sig_insns(mm, insn_addr);
 }
 
-- 
cgit v1.2.3


From 6ad1c94e1e7e374d88f0cfd77936dddb8339aaba Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 28 Sep 2022 11:12:36 -0700
Subject: eth: alx: take rtnl_lock on resume

Zbynek reports that alx trips an rtnl assertion on resume:

 RTNL: assertion failed at net/core/dev.c (2891)
 RIP: 0010:netif_set_real_num_tx_queues+0x1ac/0x1c0
 Call Trace:
  <TASK>
  __alx_open+0x230/0x570 [alx]
  alx_resume+0x54/0x80 [alx]
  ? pci_legacy_resume+0x80/0x80
  dpm_run_callback+0x4a/0x150
  device_resume+0x8b/0x190
  async_resume+0x19/0x30
  async_run_entry_fn+0x30/0x130
  process_one_work+0x1e5/0x3b0

indeed the driver does not hold rtnl_lock during its internal close
and re-open functions during suspend/resume. Note that this is not
a huge bug as the driver implements its own locking, and does not
implement changing the number of queues, but we need to silence
the splat.

Fixes: 4a5fe57e7751 ("alx: use fine-grained locking instead of RTNL")
Reported-and-tested-by: Zbynek Michl <zbynek.michl@gmail.com>
Reviewed-by: Niels Dossche <dossche.niels@gmail.com>
Link: https://lore.kernel.org/r/20220928181236.1053043-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/atheros/alx/main.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index a89b93cb4e26..d5939586c82e 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1912,11 +1912,14 @@ static int alx_suspend(struct device *dev)
 
 	if (!netif_running(alx->dev))
 		return 0;
+
+	rtnl_lock();
 	netif_device_detach(alx->dev);
 
 	mutex_lock(&alx->mtx);
 	__alx_stop(alx);
 	mutex_unlock(&alx->mtx);
+	rtnl_unlock();
 
 	return 0;
 }
@@ -1927,6 +1930,7 @@ static int alx_resume(struct device *dev)
 	struct alx_hw *hw = &alx->hw;
 	int err;
 
+	rtnl_lock();
 	mutex_lock(&alx->mtx);
 	alx_reset_phy(hw);
 
@@ -1943,6 +1947,7 @@ static int alx_resume(struct device *dev)
 
 unlock:
 	mutex_unlock(&alx->mtx);
+	rtnl_unlock();
 	return err;
 }
 
-- 
cgit v1.2.3


From 607e57c6c62c00965ae276902c166834ce73014a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Thu, 29 Sep 2022 22:57:43 -0700
Subject: hardening: Remove Clang's enable flag for
 -ftrivial-auto-var-init=zero

Now that Clang's -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
option is no longer required, remove it from the command line. Clang 16
and later will warn when it is used, which will cause Kconfig to think
it can't use -ftrivial-auto-var-init=zero at all. Check for whether it
is required and only use it when so.

Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: linux-kbuild@vger.kernel.org
Cc: llvm@lists.linux.dev
Cc: stable@vger.kernel.org
Fixes: f02003c860d9 ("hardening: Avoid harmless Clang option under CONFIG_INIT_STACK_ALL_ZERO")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 Makefile                   |  4 ++--
 security/Kconfig.hardening | 14 ++++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index c7705f749601..02c857e2243c 100644
--- a/Makefile
+++ b/Makefile
@@ -831,8 +831,8 @@ endif
 # Initialize all stack variables with a zero value.
 ifdef CONFIG_INIT_STACK_ALL_ZERO
 KBUILD_CFLAGS	+= -ftrivial-auto-var-init=zero
-ifdef CONFIG_CC_IS_CLANG
-# https://bugs.llvm.org/show_bug.cgi?id=45497
+ifdef CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO_ENABLER
+# https://github.com/llvm/llvm-project/issues/44842
 KBUILD_CFLAGS	+= -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang
 endif
 endif
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index bd2aabb2c60f..995bc42003e6 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -22,11 +22,17 @@ menu "Memory initialization"
 config CC_HAS_AUTO_VAR_INIT_PATTERN
 	def_bool $(cc-option,-ftrivial-auto-var-init=pattern)
 
-config CC_HAS_AUTO_VAR_INIT_ZERO
-	# GCC ignores the -enable flag, so we can test for the feature with
-	# a single invocation using the flag, but drop it as appropriate in
-	# the Makefile, depending on the presence of Clang.
+config CC_HAS_AUTO_VAR_INIT_ZERO_BARE
+	def_bool $(cc-option,-ftrivial-auto-var-init=zero)
+
+config CC_HAS_AUTO_VAR_INIT_ZERO_ENABLER
+	# Clang 16 and later warn about using the -enable flag, but it
+	# is required before then.
 	def_bool $(cc-option,-ftrivial-auto-var-init=zero -enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang)
+	depends on !CC_HAS_AUTO_VAR_INIT_ZERO_BARE
+
+config CC_HAS_AUTO_VAR_INIT_ZERO
+	def_bool CC_HAS_AUTO_VAR_INIT_ZERO_BARE || CC_HAS_AUTO_VAR_INIT_ZERO_ENABLER
 
 choice
 	prompt "Initialize kernel stack variables at function entry"
-- 
cgit v1.2.3


From 6336a810db5c7e8e48b55b12fbb5e9cbd36a3d19 Mon Sep 17 00:00:00 2001
From: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Date: Mon, 26 Sep 2022 04:29:23 -0400
Subject: KVM: selftests: replace assertion with warning in
 access_tracking_perf_test

Page_idle uses {ptep/pmdp}_clear_young_notify which in turn calls
the mmu notifier callback ->clear_young(), which purposefully
does not flush the TLB.

When running the test in a nested guest, point 1. of the test
doc header is violated, because KVM TLB is unbounded by size
and since no flush is forced, KVM does not update the sptes
accessed/idle bits resulting in guest assertion failure.

More precisely, only the first ACCESS_WRITE in run_test() actually
makes visible changes, because sptes are created and the accessed
bit is set to 1 (or idle bit is 0). Then the first mark_memory_idle()
passes since access bit is still one, and sets all pages as idle
(or not accessed). When the next write is performed, the update
is not flushed therefore idle is still 1 and next mark_memory_idle()
fails.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Message-Id: <20220926082923.299554-1-eesposit@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../selftests/kvm/access_tracking_perf_test.c      | 25 ++++++++++++++--------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 1c2749b1481a..76c583a07ea2 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -31,8 +31,9 @@
  * These limitations are worked around in this test by using a large enough
  * region of memory for each vCPU such that the number of translations cached in
  * the TLB and the number of pages held in pagevecs are a small fraction of the
- * overall workload. And if either of those conditions are not true this test
- * will fail rather than silently passing.
+ * overall workload. And if either of those conditions are not true (for example
+ * in nesting, where TLB size is unlimited) this test will print a warning
+ * rather than silently passing.
  */
 #include <inttypes.h>
 #include <limits.h>
@@ -172,17 +173,23 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
 		    vcpu_idx, no_pfn, pages);
 
 	/*
-	 * Test that at least 90% of memory has been marked idle (the rest might
-	 * not be marked idle because the pages have not yet made it to an LRU
-	 * list or the translations are still cached in the TLB). 90% is
+	 * Check that at least 90% of memory has been marked idle (the rest
+	 * might not be marked idle because the pages have not yet made it to an
+	 * LRU list or the translations are still cached in the TLB). 90% is
 	 * arbitrary; high enough that we ensure most memory access went through
 	 * access tracking but low enough as to not make the test too brittle
 	 * over time and across architectures.
+	 *
+	 * Note that when run in nested virtualization, this check will trigger
+	 * much more frequently because TLB size is unlimited and since no flush
+	 * happens, much more pages are cached there and guest won't see the
+	 * "idle" bit cleared.
 	 */
-	TEST_ASSERT(still_idle < pages / 10,
-		    "vCPU%d: Too many pages still idle (%"PRIu64 " out of %"
-		    PRIu64 ").\n",
-		    vcpu_idx, still_idle, pages);
+	if (still_idle < pages / 10)
+		printf("WARNING: vCPU%d: Too many pages still idle (%" PRIu64
+		       "out of %" PRIu64 "), this will affect performance results"
+		       ".\n",
+		       vcpu_idx, still_idle, pages);
 
 	close(page_idle_fd);
 	close(pagemap_fd);
-- 
cgit v1.2.3


From 09636efd1bd164ac782ff0d3a714db2c53964776 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Thu, 22 Sep 2022 16:17:23 -0700
Subject: KVM: selftests: Gracefully handle empty stack traces

Bail out of test_dump_stack() if the stack trace is empty rather than
invoking addr2line with zero addresses. The problem with the latter is
that addr2line will block waiting for addresses to be passed in via
stdin, e.g. if running a selftest from an interactive terminal.

Opportunistically fix up the comment that mentions skipping 3 frames
since only 2 are skipped in the code.

Cc: Vipin Sharma <vipinsh@google.com>
Cc: Sean Christopherson <seanjc@google.com>
Signed-off-by: David Matlack <dmatlack@google.com>
Message-Id: <20220922231724.3560211-1-dmatlack@google.com>
[Small tweak to keep backtrace() call close to if(). - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/lib/assert.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c
index 71ade6100fd3..2bd25b191d15 100644
--- a/tools/testing/selftests/kvm/lib/assert.c
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -22,7 +22,7 @@ static void test_dump_stack(void)
 	 * Build and run this command:
 	 *
 	 *	addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \
-	 *		grep -v test_dump_stack | cat -n 1>&2
+	 *		cat -n 1>&2
 	 *
 	 * Note that the spacing is different and there's no newline.
 	 */
@@ -36,18 +36,24 @@ static void test_dump_stack(void)
 		 n * (((sizeof(void *)) * 2) + 1) +
 		 /* Null terminator: */
 		 1];
-	char *c;
+	char *c = cmd;
 
 	n = backtrace(stack, n);
-	c = &cmd[0];
-	c += sprintf(c, "%s", addr2line);
 	/*
-	 * Skip the first 3 frames: backtrace, test_dump_stack, and
-	 * test_assert. We hope that backtrace isn't inlined and the other two
-	 * we've declared noinline.
+	 * Skip the first 2 frames, which should be test_dump_stack() and
+	 * test_assert(); both of which are declared noinline.  Bail if the
+	 * resulting stack trace would be empty. Otherwise, addr2line will block
+	 * waiting for addresses to be passed in via stdin.
 	 */
+	if (n <= 2) {
+		fputs("  (stack trace empty)\n", stderr);
+		return;
+	}
+
+	c += sprintf(c, "%s", addr2line);
 	for (i = 2; i < n; i++)
 		c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1);
+
 	c += sprintf(c, "%s", pipeline);
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-result"
-- 
cgit v1.2.3


From aae2e72229cdb21f90df2dbe4244c977e5d3265b Mon Sep 17 00:00:00 2001
From: Jim Mattson <jmattson@google.com>
Date: Thu, 22 Sep 2022 16:18:54 -0700
Subject: KVM: x86: Hide IA32_PLATFORM_DCA_CAP[31:0] from the guest

The only thing reported by CPUID.9 is the value of
IA32_PLATFORM_DCA_CAP[31:0] in EAX. This MSR doesn't even exist in the
guest, since CPUID.1:ECX.DCA[bit 18] is clear in the guest.

Clear CPUID.9 in KVM_GET_SUPPORTED_CPUID.

Fixes: 24c82e576b78 ("KVM: Sanitize cpuid")
Signed-off-by: Jim Mattson <jmattson@google.com>
Message-Id: <20220922231854.249383-1-jmattson@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/cpuid.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 4c1c2c06e96b..2796dde06302 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -902,8 +902,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
 			entry->edx = 0;
 		}
 		break;
-	case 9:
-		break;
 	case 0xa: { /* Architectural Performance Monitoring */
 		union cpuid10_eax eax;
 		union cpuid10_edx edx;
-- 
cgit v1.2.3


From 6b6f71484bf4fbe169fdbd401c829d8981365fd2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 28 Sep 2022 23:36:46 +0000
Subject: KVM: selftests: Implement memcmp(), memcpy(), and memset() for guest
 use

Implement memcmp(), memcpy(), and memset() to override the compiler's
built-in versions in order to guarantee that the compiler won't generate
out-of-line calls to external functions via the PLT.  This allows the
helpers to be safely used in guest code, as KVM selftests don't support
dynamic loading of guest code.

Steal the implementations from the kernel's generic versions, sans the
optimizations in memcmp() for unaligned accesses.

Put the utilities in a separate compilation unit and build with
-ffreestanding to fudge around a gcc "feature" where it will optimize
memset(), memcpy(), etc... by generating a recursive call.  I.e. the
compiler optimizes itself into infinite recursion.  Alternatively, the
individual functions could be tagged with
optimize("no-tree-loop-distribute-patterns"), but using "optimize" for
anything but debug is discouraged, and Linus NAK'd the use of the flag
in the kernel proper[*].

https://lore.kernel.org/lkml/CAHk-=wik-oXnUpfZ6Hw37uLykc-_P0Apyn2XuX-odh-3Nzop8w@mail.gmail.com

Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Anup Patel <anup@brainfault.org>
Cc: Atish Patra <atishp@atishpatra.org>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Message-Id: <20220928233652.783504-2-seanjc@google.com>
Reviewed-by: Andrew Jones <andrew.jones@linux.dev>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 tools/testing/selftests/kvm/Makefile              | 11 ++++++-
 tools/testing/selftests/kvm/lib/string_override.c | 39 +++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/kvm/lib/string_override.c

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 4c122f1b1737..6448cb9f710f 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -48,6 +48,8 @@ LIBKVM += lib/rbtree.c
 LIBKVM += lib/sparsebit.c
 LIBKVM += lib/test_util.c
 
+LIBKVM_STRING += lib/string_override.c
+
 LIBKVM_x86_64 += lib/x86_64/apic.c
 LIBKVM_x86_64 += lib/x86_64/handlers.S
 LIBKVM_x86_64 += lib/x86_64/perf_test_util.c
@@ -220,7 +222,8 @@ LIBKVM_C := $(filter %.c,$(LIBKVM))
 LIBKVM_S := $(filter %.S,$(LIBKVM))
 LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
-LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
+LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ)
 
 EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.*
 
@@ -231,6 +234,12 @@ $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
 $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
+# Compile the string overrides as freestanding to prevent the compiler from
+# generating self-referential code, e.g. without "freestanding" the compiler may
+# "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion.
+$(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c -ffreestanding $< -o $@
+
 x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
 $(TEST_GEN_PROGS): $(LIBKVM_OBJS)
 $(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS)
diff --git a/tools/testing/selftests/kvm/lib/string_override.c b/tools/testing/selftests/kvm/lib/string_override.c
new file mode 100644
index 000000000000..632398adc229
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/string_override.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <stddef.h>
+
+/*
+ * Override the "basic" built-in string helpers so that they can be used in
+ * guest code.  KVM selftests don't support dynamic loading in guest code and
+ * will jump into the weeds if the compiler decides to insert an out-of-line
+ * call via the PLT.
+ */
+int memcmp(const void *cs, const void *ct, size_t count)
+{
+	const unsigned char *su1, *su2;
+	int res = 0;
+
+	for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) {
+		if ((res = *su1 - *su2) != 0)
+			break;
+	}
+	return res;
+}
+
+void *memcpy(void *dest, const void *src, size_t count)
+{
+	char *tmp = dest;
+	const char *s = src;
+
+	while (count--)
+		*tmp++ = *s++;
+	return dest;
+}
+
+void *memset(void *s, int c, size_t count)
+{
+	char *xs = s;
+
+	while (count--)
+		*xs++ = c;
+	return s;
+}
-- 
cgit v1.2.3


From 394265079b6c271fdc191ac31b1ebfbee3dd6d63 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 28 Sep 2022 23:36:47 +0000
Subject: KVM: selftests: Compare insn opcodes directly in fix_hypercall_test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Directly compare the expected versus observed hypercall instructions when
verifying that KVM patched in the native hypercall (FIX_HYPERCALL_INSN
quirk enabled).  gcc rightly complains that doing a 4-byte memcpy() with
an "unsigned char" as the source generates an out-of-bounds accesses.

Alternatively, "exp" and "obs" could be declared as 3-byte arrays, but
there's no known reason to copy locally instead of comparing directly.

In function ‘assert_hypercall_insn’,
    inlined from ‘guest_main’ at x86_64/fix_hypercall_test.c:91:2:
x86_64/fix_hypercall_test.c:63:9: error: array subscript ‘unsigned int[0]’
 is partly outside array bounds of ‘unsigned char[1]’ [-Werror=array-bounds]
   63 |         memcpy(&exp, exp_insn, sizeof(exp));
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
x86_64/fix_hypercall_test.c: In function ‘guest_main’:
x86_64/fix_hypercall_test.c:42:22: note: object ‘vmx_hypercall_insn’ of size 1
   42 | extern unsigned char vmx_hypercall_insn;
      |                      ^~~~~~~~~~~~~~~~~~
x86_64/fix_hypercall_test.c:25:22: note: object ‘svm_hypercall_insn’ of size 1
   25 | extern unsigned char svm_hypercall_insn;
      |                      ^~~~~~~~~~~~~~~~~~
In function ‘assert_hypercall_insn’,
    inlined from ‘guest_main’ at x86_64/fix_hypercall_test.c:91:2:
x86_64/fix_hypercall_test.c:64:9: error: array subscript ‘unsigned int[0]’
 is partly outside array bounds of ‘unsigned char[1]’ [-Werror=array-bounds]
   64 |         memcpy(&obs, obs_insn, sizeof(obs));
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
x86_64/fix_hypercall_test.c: In function ‘guest_main’:
x86_64/fix_hypercall_test.c:25:22: note: object ‘svm_hypercall_insn’ of size 1
   25 | extern unsigned char svm_hypercall_insn;
      |                      ^~~~~~~~~~~~~~~~~~
x86_64/fix_hypercall_test.c:42:22: note: object ‘vmx_hypercall_insn’ of size 1
   42 | extern unsigned char vmx_hypercall_insn;
      |                      ^~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors
make: *** [../lib.mk:135: tools/testing/selftests/kvm/x86_64/fix_hypercall_test] Error 1

Fixes: 6c2fa8b20d0c ("selftests: KVM: Test KVM_X86_QUIRK_FIX_HYPERCALL_INSN")
Cc: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Message-Id: <20220928233652.783504-3-seanjc@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 .../selftests/kvm/x86_64/fix_hypercall_test.c      | 34 ++++++++++------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
index b1905d280ef5..e0004bd26536 100644
--- a/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
+++ b/tools/testing/selftests/kvm/x86_64/fix_hypercall_test.c
@@ -14,6 +14,9 @@
 #include "kvm_util.h"
 #include "processor.h"
 
+/* VMCALL and VMMCALL are both 3-byte opcodes. */
+#define HYPERCALL_INSN_SIZE	3
+
 static bool ud_expected;
 
 static void guest_ud_handler(struct ex_regs *regs)
@@ -22,7 +25,7 @@ static void guest_ud_handler(struct ex_regs *regs)
 	GUEST_DONE();
 }
 
-extern unsigned char svm_hypercall_insn;
+extern uint8_t svm_hypercall_insn[HYPERCALL_INSN_SIZE];
 static uint64_t svm_do_sched_yield(uint8_t apic_id)
 {
 	uint64_t ret;
@@ -39,7 +42,7 @@ static uint64_t svm_do_sched_yield(uint8_t apic_id)
 	return ret;
 }
 
-extern unsigned char vmx_hypercall_insn;
+extern uint8_t vmx_hypercall_insn[HYPERCALL_INSN_SIZE];
 static uint64_t vmx_do_sched_yield(uint8_t apic_id)
 {
 	uint64_t ret;
@@ -56,30 +59,20 @@ static uint64_t vmx_do_sched_yield(uint8_t apic_id)
 	return ret;
 }
 
-static void assert_hypercall_insn(unsigned char *exp_insn, unsigned char *obs_insn)
-{
-	uint32_t exp = 0, obs = 0;
-
-	memcpy(&exp, exp_insn, sizeof(exp));
-	memcpy(&obs, obs_insn, sizeof(obs));
-
-	GUEST_ASSERT_EQ(exp, obs);
-}
-
 static void guest_main(void)
 {
-	unsigned char *native_hypercall_insn, *hypercall_insn;
+	uint8_t *native_hypercall_insn, *hypercall_insn;
 	uint8_t apic_id;
 
 	apic_id = GET_APIC_ID_FIELD(xapic_read_reg(APIC_ID));
 
 	if (is_intel_cpu()) {
-		native_hypercall_insn = &vmx_hypercall_insn;
-		hypercall_insn = &svm_hypercall_insn;
+		native_hypercall_insn = vmx_hypercall_insn;
+		hypercall_insn = svm_hypercall_insn;
 		svm_do_sched_yield(apic_id);
 	} else if (is_amd_cpu()) {
-		native_hypercall_insn = &svm_hypercall_insn;
-		hypercall_insn = &vmx_hypercall_insn;
+		native_hypercall_insn = svm_hypercall_insn;
+		hypercall_insn = vmx_hypercall_insn;
 		vmx_do_sched_yield(apic_id);
 	} else {
 		GUEST_ASSERT(0);
@@ -87,8 +80,13 @@ static void guest_main(void)
 		return;
 	}
 
+	/*
+	 * The hypercall didn't #UD (guest_ud_handler() signals "done" if a #UD
+	 * occurs).  Verify that a #UD is NOT expected and that KVM patched in
+	 * the native hypercall.
+	 */
 	GUEST_ASSERT(!ud_expected);
-	assert_hypercall_insn(native_hypercall_insn, hypercall_insn);
+	GUEST_ASSERT(!memcmp(native_hypercall_insn, hypercall_insn, HYPERCALL_INSN_SIZE));
 	GUEST_DONE();
 }
 
-- 
cgit v1.2.3


From 2568a7e0832ee30b0a351016d03062ab4e0e0a3f Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Wed, 28 Sep 2022 21:39:38 +0800
Subject: mISDN: fix use-after-free bugs in l1oip timer handlers

The l1oip_cleanup() traverses the l1oip_ilist and calls
release_card() to cleanup module and stack. However,
release_card() calls del_timer() to delete the timers
such as keep_tl and timeout_tl. If the timer handler is
running, the del_timer() will not stop it and result in
UAF bugs. One of the processes is shown below:

    (cleanup routine)          |        (timer handler)
release_card()                 | l1oip_timeout()
 ...                           |
 del_timer()                   | ...
 ...                           |
 kfree(hc) //FREE              |
                               | hc->timeout_on = 0 //USE

Fix by calling del_timer_sync() in release_card(), which
makes sure the timer handlers have finished before the
resources, such as l1oip and so on, have been deallocated.

What's more, the hc->workq and hc->socket_thread can kick
those timers right back in. We add a bool flag to show
if card is released. Then, check this flag in hc->workq
and hc->socket_thread.

Fixes: 3712b42d4b1b ("Add layer1 over IP support")
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/isdn/mISDN/l1oip.h      |  1 +
 drivers/isdn/mISDN/l1oip_core.c | 13 +++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/isdn/mISDN/l1oip.h b/drivers/isdn/mISDN/l1oip.h
index 7ea10db20e3a..48133d022812 100644
--- a/drivers/isdn/mISDN/l1oip.h
+++ b/drivers/isdn/mISDN/l1oip.h
@@ -59,6 +59,7 @@ struct l1oip {
 	int			bundle;		/* bundle channels in one frm */
 	int			codec;		/* codec to use for transmis. */
 	int			limit;		/* limit number of bchannels */
+	bool			shutdown;	/* if card is released */
 
 	/* timer */
 	struct timer_list	keep_tl;
diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c
index 2c40412466e6..a77195e378b7 100644
--- a/drivers/isdn/mISDN/l1oip_core.c
+++ b/drivers/isdn/mISDN/l1oip_core.c
@@ -275,7 +275,7 @@ l1oip_socket_send(struct l1oip *hc, u8 localcodec, u8 channel, u32 chanmask,
 	p = frame;
 
 	/* restart timer */
-	if (time_before(hc->keep_tl.expires, jiffies + 5 * HZ))
+	if (time_before(hc->keep_tl.expires, jiffies + 5 * HZ) && !hc->shutdown)
 		mod_timer(&hc->keep_tl, jiffies + L1OIP_KEEPALIVE * HZ);
 	else
 		hc->keep_tl.expires = jiffies + L1OIP_KEEPALIVE * HZ;
@@ -601,7 +601,9 @@ multiframe:
 		goto multiframe;
 
 	/* restart timer */
-	if (time_before(hc->timeout_tl.expires, jiffies + 5 * HZ) || !hc->timeout_on) {
+	if ((time_before(hc->timeout_tl.expires, jiffies + 5 * HZ) ||
+	     !hc->timeout_on) &&
+	    !hc->shutdown) {
 		hc->timeout_on = 1;
 		mod_timer(&hc->timeout_tl, jiffies + L1OIP_TIMEOUT * HZ);
 	} else /* only adjust timer */
@@ -1232,11 +1234,10 @@ release_card(struct l1oip *hc)
 {
 	int	ch;
 
-	if (timer_pending(&hc->keep_tl))
-		del_timer(&hc->keep_tl);
+	hc->shutdown = true;
 
-	if (timer_pending(&hc->timeout_tl))
-		del_timer(&hc->timeout_tl);
+	del_timer_sync(&hc->keep_tl);
+	del_timer_sync(&hc->timeout_tl);
 
 	cancel_work_sync(&hc->workq);
 
-- 
cgit v1.2.3


From 022152aaebe116a25c39818a07e175a8cd3c1e11 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 28 Sep 2022 14:10:13 -0400
Subject: sctp: handle the error returned from sctp_auth_asoc_init_active_key

When it returns an error from sctp_auth_asoc_init_active_key(), the
active_key is actually not updated. The old sh_key will be freeed
while it's still used as active key in asoc. Then an use-after-free
will be triggered when sending patckets, as found by syzbot:

  sctp_auth_shkey_hold+0x22/0xa0 net/sctp/auth.c:112
  sctp_set_owner_w net/sctp/socket.c:132 [inline]
  sctp_sendmsg_to_asoc+0xbd5/0x1a20 net/sctp/socket.c:1863
  sctp_sendmsg+0x1053/0x1d50 net/sctp/socket.c:2025
  inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819
  sock_sendmsg_nosec net/socket.c:714 [inline]
  sock_sendmsg+0xcf/0x120 net/socket.c:734

This patch is to fix it by not replacing the sh_key when it returns
errors from sctp_auth_asoc_init_active_key() in sctp_auth_set_key().
For sctp_auth_set_active_key(), old active_key_id will be set back
to asoc->active_key_id when the same thing happens.

Fixes: 58acd1009226 ("sctp: update active_key for asoc when old key is being replaced")
Reported-by: syzbot+a236dd8e9622ed8954a3@syzkaller.appspotmail.com
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/auth.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index db6b7373d16c..34964145514e 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -863,12 +863,17 @@ int sctp_auth_set_key(struct sctp_endpoint *ep,
 	}
 
 	list_del_init(&shkey->key_list);
-	sctp_auth_shkey_release(shkey);
 	list_add(&cur_key->key_list, sh_keys);
 
-	if (asoc && asoc->active_key_id == auth_key->sca_keynumber)
-		sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL);
+	if (asoc && asoc->active_key_id == auth_key->sca_keynumber &&
+	    sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) {
+		list_del_init(&cur_key->key_list);
+		sctp_auth_shkey_release(cur_key);
+		list_add(&shkey->key_list, sh_keys);
+		return -ENOMEM;
+	}
 
+	sctp_auth_shkey_release(shkey);
 	return 0;
 }
 
@@ -902,8 +907,13 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep,
 		return -EINVAL;
 
 	if (asoc) {
+		__u16  active_key_id = asoc->active_key_id;
+
 		asoc->active_key_id = key_id;
-		sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL);
+		if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) {
+			asoc->active_key_id = active_key_id;
+			return -ENOMEM;
+		}
 	} else
 		ep->active_key_id = key_id;
 
-- 
cgit v1.2.3


From f4ce91ce12a7c6ead19b128ffa8cff6e3ded2a14 Mon Sep 17 00:00:00 2001
From: Neal Cardwell <ncardwell@google.com>
Date: Wed, 28 Sep 2022 16:03:31 -0400
Subject: tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited

This commit fixes a bug in the tracking of max_packets_out and
is_cwnd_limited. This bug can cause the connection to fail to remember
that is_cwnd_limited is true, causing the connection to fail to grow
cwnd when it should, causing throughput to be lower than it should be.

The following event sequence is an example that triggers the bug:

 (a) The connection is cwnd_limited, but packets_out is not at its
     peak due to TSO deferral deciding not to send another skb yet.
     In such cases the connection can advance max_packets_seq and set
     tp->is_cwnd_limited to true and max_packets_out to a small
     number.

(b) Then later in the round trip the connection is pacing-limited (not
     cwnd-limited), and packets_out is larger. In such cases the
     connection would raise max_packets_out to a bigger number but
     (unexpectedly) flip tp->is_cwnd_limited from true to false.

This commit fixes that bug.

One straightforward fix would be to separately track (a) the next
window after max_packets_out reaches a maximum, and (b) the next
window after tp->is_cwnd_limited is set to true. But this would
require consuming an extra u32 sequence number.

Instead, to save space we track only the most important
information. Specifically, we track the strongest available signal of
the degree to which the cwnd is fully utilized:

(1) If the connection is cwnd-limited then we remember that fact for
the current window.

(2) If the connection not cwnd-limited then we track the maximum
number of outstanding packets in the current window.

In particular, note that the new logic cannot trigger the buggy
(a)/(b) sequence above because with the new logic a condition where
tp->packets_out > tp->max_packets_out can only trigger an update of
tp->is_cwnd_limited if tp->is_cwnd_limited is false.

This first showed up in a testing of a BBRv2 dev branch, but this
buggy behavior highlighted a general issue with the
tcp_cwnd_validate() logic that can cause cwnd to fail to increase at
the proper rate for any TCP congestion control, including Reno or
CUBIC.

Fixes: ca8a22634381 ("tcp: make cwnd-limited checks measurement-based, and gentler")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Kevin(Yudong) Yang <yyd@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |  2 +-
 include/net/tcp.h     |  5 ++++-
 net/ipv4/tcp.c        |  2 ++
 net/ipv4/tcp_output.c | 19 ++++++++++++-------
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a9fbe22732c3..4791fd801945 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -295,7 +295,7 @@ struct tcp_sock {
 	u32	packets_out;	/* Packets which are "in flight"	*/
 	u32	retrans_out;	/* Retransmitted packets out		*/
 	u32	max_packets_out;  /* max packets_out in last window */
-	u32	max_packets_seq;  /* right edge of max_packets_out flight */
+	u32	cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */
 
 	u16	urg_data;	/* Saved octet of OOB data and control flags */
 	u8	ecn_flags;	/* ECN status bits.			*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d10962b9f0d0..95c1d51393ac 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1295,11 +1295,14 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
+	if (tp->is_cwnd_limited)
+		return true;
+
 	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
 	if (tcp_in_slow_start(tp))
 		return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out;
 
-	return tp->is_cwnd_limited;
+	return false;
 }
 
 /* BBR congestion control needs pacing.
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e373dde1f46f..997a80ce1e13 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3137,6 +3137,8 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
 	tp->snd_cwnd_cnt = 0;
+	tp->is_cwnd_limited = 0;
+	tp->max_packets_out = 0;
 	tp->window_clamp = 0;
 	tp->delivered = 0;
 	tp->delivered_ce = 0;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 290019de766d..c69f4d966024 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1875,15 +1875,20 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Track the maximum number of outstanding packets in each
-	 * window, and remember whether we were cwnd-limited then.
+	/* Track the strongest available signal of the degree to which the cwnd
+	 * is fully utilized. If cwnd-limited then remember that fact for the
+	 * current window. If not cwnd-limited then track the maximum number of
+	 * outstanding packets in the current window. (If cwnd-limited then we
+	 * chose to not update tp->max_packets_out to avoid an extra else
+	 * clause with no functional impact.)
 	 */
-	if (!before(tp->snd_una, tp->max_packets_seq) ||
-	    tp->packets_out > tp->max_packets_out ||
-	    is_cwnd_limited) {
-		tp->max_packets_out = tp->packets_out;
-		tp->max_packets_seq = tp->snd_nxt;
+	if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
+	    is_cwnd_limited ||
+	    (!tp->is_cwnd_limited &&
+	     tp->packets_out > tp->max_packets_out)) {
 		tp->is_cwnd_limited = is_cwnd_limited;
+		tp->max_packets_out = tp->packets_out;
+		tp->cwnd_usage_seq = tp->snd_nxt;
 	}
 
 	if (tcp_is_cwnd_limited(sk)) {
-- 
cgit v1.2.3


From 84aff0b6905c1101aab1525c2d649127f307218b Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 29 Sep 2022 12:14:51 +0200
Subject: mips: update config files

Clean up config files by:
  - removing configs that were deleted in the past
  - removing configs not in tree and without recently pending patches
  - adding new configs that are replacements for old configs in the file

For some detailed information, see Link.

Link: https://lore.kernel.org/kernel-janitors/20220929090645.1389-1-lukas.bulwahn@gmail.com/

Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/configs/ar7_defconfig               |  4 ----
 arch/mips/configs/ath25_defconfig             |  4 ----
 arch/mips/configs/ath79_defconfig             | 10 ----------
 arch/mips/configs/bcm63xx_defconfig           |  3 ---
 arch/mips/configs/bigsur_defconfig            |  9 ---------
 arch/mips/configs/bmips_be_defconfig          |  3 ---
 arch/mips/configs/bmips_stb_defconfig         | 23 ++---------------------
 arch/mips/configs/cavium_octeon_defconfig     |  1 -
 arch/mips/configs/db1xxx_defconfig            |  1 -
 arch/mips/configs/decstation_64_defconfig     | 10 ----------
 arch/mips/configs/decstation_defconfig        | 10 ----------
 arch/mips/configs/decstation_r4k_defconfig    | 10 ----------
 arch/mips/configs/fuloong2e_defconfig         |  9 ---------
 arch/mips/configs/generic/board-ocelot.config |  1 -
 arch/mips/configs/gpr_defconfig               |  8 --------
 arch/mips/configs/ip22_defconfig              | 10 ----------
 arch/mips/configs/ip27_defconfig              | 19 -------------------
 arch/mips/configs/ip28_defconfig              |  3 ---
 arch/mips/configs/ip32_defconfig              |  2 --
 arch/mips/configs/jazz_defconfig              |  1 -
 arch/mips/configs/lemote2f_defconfig          |  9 ---------
 arch/mips/configs/loongson1b_defconfig        |  4 ----
 arch/mips/configs/loongson1c_defconfig        |  4 ----
 arch/mips/configs/loongson2k_defconfig        |  3 ---
 arch/mips/configs/loongson3_defconfig         |  2 --
 arch/mips/configs/malta_defconfig             |  5 -----
 arch/mips/configs/malta_kvm_defconfig         |  5 -----
 arch/mips/configs/malta_qemu_32r6_defconfig   |  3 ---
 arch/mips/configs/maltaaprp_defconfig         |  3 ---
 arch/mips/configs/maltasmvp_defconfig         |  3 ---
 arch/mips/configs/maltasmvp_eva_defconfig     |  3 ---
 arch/mips/configs/maltaup_defconfig           |  3 ---
 arch/mips/configs/maltaup_xpa_defconfig       |  5 -----
 arch/mips/configs/mtx1_defconfig              | 10 ----------
 arch/mips/configs/omega2p_defconfig           |  3 ---
 arch/mips/configs/pic32mzda_defconfig         |  1 -
 arch/mips/configs/rb532_defconfig             |  4 ----
 arch/mips/configs/rbtx49xx_defconfig          |  7 -------
 arch/mips/configs/rm200_defconfig             |  7 -------
 arch/mips/configs/rt305x_defconfig            |  4 ----
 arch/mips/configs/sb1250_swarm_defconfig      |  2 --
 arch/mips/configs/vocore2_defconfig           |  3 ---
 arch/mips/configs/xway_defconfig              |  4 ----
 43 files changed, 2 insertions(+), 236 deletions(-)

diff --git a/arch/mips/configs/ar7_defconfig b/arch/mips/configs/ar7_defconfig
index cf9c6329b807..ed4a6388791e 100644
--- a/arch/mips/configs/ar7_defconfig
+++ b/arch/mips/configs/ar7_defconfig
@@ -32,9 +32,6 @@ CONFIG_IP_ROUTE_MULTIPATH=y
 CONFIG_IP_ROUTE_VERBOSE=y
 CONFIG_IP_MROUTE=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 CONFIG_TCP_CONG_ADVANCED=y
 # CONFIG_TCP_CONG_BIC is not set
@@ -117,7 +114,6 @@ CONFIG_JFFS2_SUMMARY=y
 CONFIG_JFFS2_COMPRESSION_OPTIONS=y
 CONFIG_SQUASHFS=y
 # CONFIG_CRYPTO_HW is not set
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_CMDLINE_BOOL=y
diff --git a/arch/mips/configs/ath25_defconfig b/arch/mips/configs/ath25_defconfig
index 7143441f5476..afd1c16242e9 100644
--- a/arch/mips/configs/ath25_defconfig
+++ b/arch/mips/configs/ath25_defconfig
@@ -29,9 +29,6 @@ CONFIG_UNIX=y
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_ADVANCED_ROUTER=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
@@ -108,7 +105,6 @@ CONFIG_SQUASHFS_XZ=y
 # CONFIG_XZ_DEC_ARMTHUMB is not set
 # CONFIG_XZ_DEC_SPARC is not set
 CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/mips/configs/ath79_defconfig b/arch/mips/configs/ath79_defconfig
index 96622a2ad333..0b741716c852 100644
--- a/arch/mips/configs/ath79_defconfig
+++ b/arch/mips/configs/ath79_defconfig
@@ -10,12 +10,6 @@ CONFIG_EMBEDDED=y
 # CONFIG_SLUB_DEBUG is not set
 # CONFIG_COMPAT_BRK is not set
 CONFIG_ATH79=y
-CONFIG_ATH79_MACH_AP121=y
-CONFIG_ATH79_MACH_AP136=y
-CONFIG_ATH79_MACH_AP81=y
-CONFIG_ATH79_MACH_DB120=y
-CONFIG_ATH79_MACH_PB44=y
-CONFIG_ATH79_MACH_UBNT_XM=y
 CONFIG_HZ_100=y
 # CONFIG_SECCOMP is not set
 CONFIG_PCI=y
@@ -29,9 +23,6 @@ CONFIG_UNIX=y
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_ADVANCED_ROUTER=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
 CONFIG_CFG80211=m
 CONFIG_MAC80211=m
@@ -92,7 +83,6 @@ CONFIG_LEDS_GPIO=y
 # CONFIG_DNOTIFY is not set
 # CONFIG_PROC_PAGE_MONITOR is not set
 CONFIG_CRC_ITU_T=m
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/mips/configs/bcm63xx_defconfig b/arch/mips/configs/bcm63xx_defconfig
index 861f680184b9..34d0ca638ef0 100644
--- a/arch/mips/configs/bcm63xx_defconfig
+++ b/arch/mips/configs/bcm63xx_defconfig
@@ -24,9 +24,6 @@ CONFIG_PCMCIA_BCM63XX=y
 CONFIG_NET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 CONFIG_CFG80211=y
diff --git a/arch/mips/configs/bigsur_defconfig b/arch/mips/configs/bigsur_defconfig
index d83e7d600b0a..d15961f00ece 100644
--- a/arch/mips/configs/bigsur_defconfig
+++ b/arch/mips/configs/bigsur_defconfig
@@ -49,8 +49,6 @@ CONFIG_IP_PIMSM_V2=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -59,7 +57,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_SIT_6RD=y
 CONFIG_IPV6_TUNNEL=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
@@ -101,7 +98,6 @@ CONFIG_BAYCOM_SER_HDX=m
 CONFIG_YAM=m
 CONFIG_FW_LOADER=m
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_EEPROM_LEGACY=y
 CONFIG_EEPROM_MAX6875=y
@@ -230,12 +226,8 @@ CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
@@ -243,7 +235,6 @@ CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/bmips_be_defconfig b/arch/mips/configs/bmips_be_defconfig
index 032bb51defe8..daef132d000b 100644
--- a/arch/mips/configs/bmips_be_defconfig
+++ b/arch/mips/configs/bmips_be_defconfig
@@ -17,9 +17,6 @@ CONFIG_PACKET=y
 CONFIG_PACKET_DIAG=y
 CONFIG_UNIX=y
 CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 CONFIG_CFG80211=y
 CONFIG_NL80211_TESTMODE=y
diff --git a/arch/mips/configs/bmips_stb_defconfig b/arch/mips/configs/bmips_stb_defconfig
index 5956fb95c19f..cd0dc37c3d84 100644
--- a/arch/mips/configs/bmips_stb_defconfig
+++ b/arch/mips/configs/bmips_stb_defconfig
@@ -12,7 +12,6 @@ CONFIG_HIGHMEM=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_SMP=y
 CONFIG_NR_CPUS=4
-CONFIG_CC_STACKPROTECTOR_STRONG=y
 # CONFIG_SECCOMP is not set
 CONFIG_MIPS_O32_FP64_SUPPORT=y
 # CONFIG_RD_GZIP is not set
@@ -21,8 +20,6 @@ CONFIG_MIPS_O32_FP64_SUPPORT=y
 CONFIG_RD_XZ=y
 # CONFIG_RD_LZO is not set
 # CONFIG_RD_LZ4 is not set
-# CONFIG_IOSCHED_DEADLINE is not set
-# CONFIG_IOSCHED_CFQ is not set
 CONFIG_PCI=y
 CONFIG_PCI_MSI=y
 CONFIG_PCIEASPM_POWERSAVE=y
@@ -30,7 +27,6 @@ CONFIG_PCIEPORTBUS=y
 CONFIG_PCIE_BRCMSTB=y
 CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_STAT=y
-CONFIG_CPU_FREQ_STAT_DETAILS=y
 CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
 CONFIG_CPU_FREQ_GOV_POWERSAVE=y
@@ -44,15 +40,11 @@ CONFIG_PACKET=y
 CONFIG_PACKET_DIAG=y
 CONFIG_UNIX=y
 CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 CONFIG_CFG80211=y
 CONFIG_NL80211_TESTMODE=y
 CONFIG_WIRELESS=y
 CONFIG_MAC80211=y
-CONFIG_NL80211=y
 CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 # CONFIG_STANDALONE is not set
@@ -70,10 +62,6 @@ CONFIG_IP_PNP_RARP=y
 CONFIG_IP_MROUTE=y
 CONFIG_IP_PIMSM_V1=y
 CONFIG_IP_PIMSM_V2=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
-# CONFIG_INET_LRO is not set
 CONFIG_INET_UDP_DIAG=y
 CONFIG_TCP_CONG_ADVANCED=y
 CONFIG_TCP_CONG_BIC=y
@@ -93,7 +81,6 @@ CONFIG_NET_SWITCHDEV=y
 CONFIG_DMA_CMA=y
 CONFIG_CMA_ALIGNMENT=12
 CONFIG_SPI=y
-CONFIG_SPI_BRCMSTB=y
 CONFIG_MTD=y
 CONFIG_MTD_CMDLINE_PARTS=y
 CONFIG_MTD_BLOCK=y
@@ -105,14 +92,11 @@ CONFIG_MTD_CFI_STAA=y
 CONFIG_MTD_ROM=y
 CONFIG_MTD_ABSENT=y
 CONFIG_MTD_PHYSMAP_OF=y
-CONFIG_MTD_M25P80=y
-CONFIG_MTD_NAND=y
 CONFIG_MTD_NAND_BRCMNAND=y
 CONFIG_MTD_SPI_NOR=y
 # CONFIG_MTD_SPI_NOR_USE_4K_SECTORS is not set
 CONFIG_MTD_UBI=y
 CONFIG_MTD_UBI_GLUEBI=y
-CONFIG_PROC_DEVICETREE=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
@@ -120,7 +104,6 @@ CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_MULTI_LUN=y
 # CONFIG_SCSI_LOWLEVEL is not set
 CONFIG_NETDEVICES=y
 CONFIG_VLAN_8021Q=y
@@ -135,7 +118,6 @@ CONFIG_INPUT_UINPUT=y
 # CONFIG_SERIO is not set
 CONFIG_VT=y
 CONFIG_VT_HW_CONSOLE_BINDING=y
-# CONFIG_DEVKMEM is not set
 CONFIG_SERIAL_8250=y
 # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
 CONFIG_SERIAL_8250_CONSOLE=y
@@ -203,17 +185,14 @@ CONFIG_CMDLINE="earlycon"
 CONFIG_MIPS_CMDLINE_DTB_EXTEND=y
 # CONFIG_MIPS_CMDLINE_FROM_BOOTLOADER is not set
 # CONFIG_CRYPTO_HW is not set
-CONFIG_DT_BCM974XX=y
 CONFIG_FW_CFE=y
 CONFIG_ATA=y
 CONFIG_SATA_AHCI_PLATFORM=y
-CONFIG_AHCI_BRCMSTB=y
 CONFIG_GENERIC_PHY=y
 CONFIG_GPIOLIB=y
 CONFIG_GPIO_SYSFS=y
 CONFIG_PHY_BRCM_USB=y
 CONFIG_PHY_BRCM_SATA=y
-CONFIG_PM_RUNTIME=y
 CONFIG_PM_DEBUG=y
 CONFIG_SYSVIPC=y
 CONFIG_FUNCTION_GRAPH_TRACER=y
@@ -227,3 +206,5 @@ CONFIG_FTRACE_SYSCALLS=y
 CONFIG_TRACER_SNAPSHOT=y
 CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y
 CONFIG_STACK_TRACER=y
+CONFIG_AHCI_BRCM=y
+CONFIG_MTD_RAW_NAND=y
diff --git a/arch/mips/configs/cavium_octeon_defconfig b/arch/mips/configs/cavium_octeon_defconfig
index 97ceaf080c0c..a2311495af79 100644
--- a/arch/mips/configs/cavium_octeon_defconfig
+++ b/arch/mips/configs/cavium_octeon_defconfig
@@ -71,7 +71,6 @@ CONFIG_NETDEVICES=y
 # CONFIG_NET_VENDOR_DEC is not set
 # CONFIG_NET_VENDOR_DLINK is not set
 # CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MELLANOX is not set
diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig
index b8bd66300996..a8b62df3c021 100644
--- a/arch/mips/configs/db1xxx_defconfig
+++ b/arch/mips/configs/db1xxx_defconfig
@@ -61,7 +61,6 @@ CONFIG_INET6_AH=y
 CONFIG_INET6_ESP=y
 CONFIG_INET6_IPCOMP=y
 CONFIG_IPV6_MIP6=y
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=y
 CONFIG_IPV6_VTI=y
 CONFIG_IPV6_SIT_6RD=y
 CONFIG_IPV6_GRE=y
diff --git a/arch/mips/configs/decstation_64_defconfig b/arch/mips/configs/decstation_64_defconfig
index 0021427a1bbe..4f74c4bde9f6 100644
--- a/arch/mips/configs/decstation_64_defconfig
+++ b/arch/mips/configs/decstation_64_defconfig
@@ -37,9 +37,6 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -47,7 +44,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_SUBTREES=y
 CONFIG_NETWORK_SECMARK=y
@@ -79,7 +75,6 @@ CONFIG_NETDEVICES=y
 CONFIG_DECLANCE=y
 # CONFIG_NET_VENDOR_AQUANTIA is not set
 # CONFIG_NET_VENDOR_ARC is not set
-# CONFIG_NET_VENDOR_AURORA is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_CADENCE is not set
 # CONFIG_NET_VENDOR_CAVIUM is not set
@@ -193,12 +188,8 @@ CONFIG_CRYPTO_CRC32=m
 CONFIG_CRYPTO_CRCT10DIF=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_ARC4=m
@@ -208,7 +199,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/decstation_defconfig b/arch/mips/configs/decstation_defconfig
index 7a97a0818ce4..f0d0546ea1e5 100644
--- a/arch/mips/configs/decstation_defconfig
+++ b/arch/mips/configs/decstation_defconfig
@@ -33,9 +33,6 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -43,7 +40,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_SUBTREES=y
 CONFIG_NETWORK_SECMARK=y
@@ -75,7 +71,6 @@ CONFIG_NETDEVICES=y
 CONFIG_DECLANCE=y
 # CONFIG_NET_VENDOR_AQUANTIA is not set
 # CONFIG_NET_VENDOR_ARC is not set
-# CONFIG_NET_VENDOR_AURORA is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_CADENCE is not set
 # CONFIG_NET_VENDOR_CAVIUM is not set
@@ -188,12 +183,8 @@ CONFIG_CRYPTO_CRC32=m
 CONFIG_CRYPTO_CRCT10DIF=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_ARC4=m
@@ -203,7 +194,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/decstation_r4k_defconfig b/arch/mips/configs/decstation_r4k_defconfig
index a0643363526d..df5ff9ddf8f4 100644
--- a/arch/mips/configs/decstation_r4k_defconfig
+++ b/arch/mips/configs/decstation_r4k_defconfig
@@ -32,9 +32,6 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -42,7 +39,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_SUBTREES=y
 CONFIG_NETWORK_SECMARK=y
@@ -74,7 +70,6 @@ CONFIG_NETDEVICES=y
 CONFIG_DECLANCE=y
 # CONFIG_NET_VENDOR_AQUANTIA is not set
 # CONFIG_NET_VENDOR_ARC is not set
-# CONFIG_NET_VENDOR_AURORA is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_CADENCE is not set
 # CONFIG_NET_VENDOR_CAVIUM is not set
@@ -188,12 +183,8 @@ CONFIG_CRYPTO_CRC32=m
 CONFIG_CRYPTO_CRCT10DIF=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_ARC4=m
@@ -203,7 +194,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig
index ba47c5e929b7..843d6a5a4f61 100644
--- a/arch/mips/configs/fuloong2e_defconfig
+++ b/arch/mips/configs/fuloong2e_defconfig
@@ -35,8 +35,6 @@ CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_NET_IPIP=m
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 CONFIG_NETFILTER=y
@@ -92,7 +90,6 @@ CONFIG_MTD_CFI_AMDSTD=m
 CONFIG_MTD_CFI_STAA=m
 CONFIG_MTD_PHYSMAP=m
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_RAM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
@@ -159,7 +156,6 @@ CONFIG_USB_MOUSE=y
 CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_OTG_PRODUCTLIST=y
-CONFIG_USB_WUSB_CBAF=m
 CONFIG_USB_C67X00_HCD=m
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_EHCI_ROOT_HUB_TT=y
@@ -219,15 +215,10 @@ CONFIG_CRYPTO_CTS=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_DEFLATE=m
 CONFIG_CRYPTO_LZO=m
 # CONFIG_CRYPTO_HW is not set
 CONFIG_CRC_CCITT=y
 CONFIG_CRC7=m
-# CONFIG_ENABLE_MUST_CHECK is not set
diff --git a/arch/mips/configs/generic/board-ocelot.config b/arch/mips/configs/generic/board-ocelot.config
index 510709565404..8cfbafa532e0 100644
--- a/arch/mips/configs/generic/board-ocelot.config
+++ b/arch/mips/configs/generic/board-ocelot.config
@@ -25,7 +25,6 @@ CONFIG_NETDEVICES=y
 CONFIG_NET_SWITCHDEV=y
 CONFIG_NET_DSA=y
 CONFIG_MSCC_OCELOT_SWITCH=y
-CONFIG_MSCC_OCELOT_SWITCH_OCELOT=y
 CONFIG_MDIO_MSCC_MIIM=y
 CONFIG_MICROSEMI_PHY=y
 
diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig
index d82f4ebf687f..7cd321b47d01 100644
--- a/arch/mips/configs/gpr_defconfig
+++ b/arch/mips/configs/gpr_defconfig
@@ -29,9 +29,6 @@ CONFIG_IP_ROUTE_VERBOSE=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_BOOTP=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
 CONFIG_NETWORK_SECMARK=y
 CONFIG_NETFILTER=y
@@ -220,9 +217,6 @@ CONFIG_HDLC_X25=m
 CONFIG_PCI200SYN=m
 CONFIG_WANXL=m
 CONFIG_FARSYNC=m
-CONFIG_DSCC4=m
-CONFIG_DSCC4_PCISYNC=y
-CONFIG_DSCC4_PCI_RST=y
 CONFIG_LAPBETHER=m
 # CONFIG_INPUT_KEYBOARD is not set
 # CONFIG_INPUT_MOUSE is not set
@@ -288,7 +282,6 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
@@ -299,7 +292,6 @@ CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_DEFLATE=m
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_CMDLINE_BOOL=y
 CONFIG_CMDLINE="console=ttyS0,115200 root=/dev/nfs rw ip=auto"
diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig
index 70a4ba90f491..13df29901237 100644
--- a/arch/mips/configs/ip22_defconfig
+++ b/arch/mips/configs/ip22_defconfig
@@ -36,9 +36,6 @@ CONFIG_IP_PNP_BOOTP=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -47,7 +44,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_TUNNEL=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_SUBTREES=y
@@ -226,7 +222,6 @@ CONFIG_SERIO_RAW=m
 CONFIG_VT_HW_CONSOLE_BINDING=y
 CONFIG_SERIAL_IP22_ZILOG=m
 # CONFIG_HW_RANDOM is not set
-CONFIG_RAW_DRIVER=m
 # CONFIG_HWMON is not set
 CONFIG_THERMAL=y
 CONFIG_WATCHDOG=y
@@ -320,11 +315,7 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
@@ -333,7 +324,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/ip27_defconfig b/arch/mips/configs/ip27_defconfig
index 821630ac1be7..3e86f8106ba0 100644
--- a/arch/mips/configs/ip27_defconfig
+++ b/arch/mips/configs/ip27_defconfig
@@ -33,9 +33,6 @@ CONFIG_NET_KEY_MIGRATE=y
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -44,10 +41,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_TRANSPORT=m
-CONFIG_INET6_XFRM_MODE_TUNNEL=m
-CONFIG_INET6_XFRM_MODE_BEET=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_SIT=m
 CONFIG_IPV6_SIT_6RD=y
 CONFIG_IPV6_TUNNEL=m
@@ -92,7 +85,6 @@ CONFIG_CFG80211=m
 CONFIG_MAC80211=m
 CONFIG_RFKILL=m
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_SCSI=y
@@ -115,7 +107,6 @@ CONFIG_SCSI_AIC94XX=m
 # CONFIG_AIC94XX_DEBUG is not set
 CONFIG_SCSI_MVSAS=m
 # CONFIG_SCSI_MVSAS_DEBUG is not set
-CONFIG_SCSI_DPT_I2O=m
 CONFIG_SCSI_MPT2SAS=m
 CONFIG_LIBFC=m
 CONFIG_SCSI_QLOGIC_1280=y
@@ -126,8 +117,6 @@ CONFIG_SCSI_DH_RDAC=m
 CONFIG_SCSI_DH_HP_SW=m
 CONFIG_SCSI_DH_EMC=m
 CONFIG_SCSI_DH_ALUA=m
-CONFIG_SCSI_OSD_INITIATOR=m
-CONFIG_SCSI_OSD_ULD=m
 CONFIG_MD=y
 CONFIG_BLK_DEV_MD=y
 CONFIG_MD_LINEAR=m
@@ -166,7 +155,6 @@ CONFIG_JME=m
 CONFIG_MLX4_EN=m
 # CONFIG_MLX4_DEBUG is not set
 CONFIG_KS8851_MLL=m
-CONFIG_VXGE=m
 CONFIG_AX88796=m
 CONFIG_AX88796_93CX6=y
 CONFIG_ETHOC=m
@@ -264,7 +252,6 @@ CONFIG_I2C_VIAPRO=m
 CONFIG_I2C_OCORES=m
 CONFIG_I2C_PCA_PLATFORM=m
 CONFIG_I2C_SIMTEC=m
-CONFIG_I2C_PARPORT_LIGHT=m
 CONFIG_I2C_TAOS_EVM=m
 CONFIG_I2C_STUB=m
 # CONFIG_HWMON is not set
@@ -309,7 +296,6 @@ CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
 CONFIG_SQUASHFS=m
 CONFIG_OMFS_FS=m
-CONFIG_EXOFS_FS=m
 CONFIG_NFS_FS=y
 CONFIG_SECURITYFS=y
 CONFIG_CRYPTO_CRYPTD=m
@@ -321,12 +307,8 @@ CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_VMAC=m
 CONFIG_CRYPTO_MD4=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
@@ -335,7 +317,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig
index 0921ef38e9fb..ba13eea0509f 100644
--- a/arch/mips/configs/ip28_defconfig
+++ b/arch/mips/configs/ip28_defconfig
@@ -29,9 +29,6 @@ CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_IP_PNP_BOOTP=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 CONFIG_TCP_MD5SIG=y
 # CONFIG_IPV6 is not set
 CONFIG_SCSI=y
diff --git a/arch/mips/configs/ip32_defconfig b/arch/mips/configs/ip32_defconfig
index 74020aa3440b..8ced2224c328 100644
--- a/arch/mips/configs/ip32_defconfig
+++ b/arch/mips/configs/ip32_defconfig
@@ -43,7 +43,6 @@ CONFIG_IPV6_TUNNEL=m
 CONFIG_NETWORK_SECMARK=y
 CONFIG_CONNECTOR=y
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_RAID_ATTRS=y
 CONFIG_SCSI=y
@@ -165,7 +164,6 @@ CONFIG_CRYPTO_MICHAEL_MIC=y
 CONFIG_CRYPTO_SHA1=y
 CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_SHA512=y
-CONFIG_CRYPTO_TGR192=y
 CONFIG_CRYPTO_WP512=y
 CONFIG_CRYPTO_ANUBIS=y
 CONFIG_CRYPTO_ARC4=y
diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig
index 843f360da5f2..106b21cb677f 100644
--- a/arch/mips/configs/jazz_defconfig
+++ b/arch/mips/configs/jazz_defconfig
@@ -32,7 +32,6 @@ CONFIG_PARPORT_1284=y
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_FD=m
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=m
 CONFIG_CDROM_PKTCDVD=m
diff --git a/arch/mips/configs/lemote2f_defconfig b/arch/mips/configs/lemote2f_defconfig
index 791894c4d8fb..7e598d338979 100644
--- a/arch/mips/configs/lemote2f_defconfig
+++ b/arch/mips/configs/lemote2f_defconfig
@@ -43,9 +43,6 @@ CONFIG_IP_MROUTE=y
 CONFIG_IP_PIMSM_V1=y
 CONFIG_IP_PIMSM_V2=y
 CONFIG_SYN_COOKIES=y
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_CONG_ADVANCED=y
 CONFIG_TCP_CONG_BIC=y
 CONFIG_DEFAULT_BIC=y
@@ -77,7 +74,6 @@ CONFIG_MAC80211_LEDS=y
 CONFIG_RFKILL=m
 CONFIG_RFKILL_INPUT=y
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_BLK_DEV_SD=y
@@ -312,12 +308,8 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_XTS=m
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_RMD128=m
 CONFIG_CRYPTO_RMD160=m
-CONFIG_CRYPTO_RMD256=m
-CONFIG_CRYPTO_RMD320=m
 CONFIG_CRYPTO_SHA1=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
@@ -326,7 +318,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SEED=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
diff --git a/arch/mips/configs/loongson1b_defconfig b/arch/mips/configs/loongson1b_defconfig
index 25e70423e17d..68207b31dc20 100644
--- a/arch/mips/configs/loongson1b_defconfig
+++ b/arch/mips/configs/loongson1b_defconfig
@@ -28,9 +28,6 @@ CONFIG_INET=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
@@ -115,7 +112,6 @@ CONFIG_NLS_ISO8859_1=m
 # CONFIG_CRYPTO_ECHAINIV is not set
 # CONFIG_CRYPTO_HW is not set
 CONFIG_DYNAMIC_DEBUG=y
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/mips/configs/loongson1c_defconfig b/arch/mips/configs/loongson1c_defconfig
index 3a158d4d2fab..c3910a9dee9e 100644
--- a/arch/mips/configs/loongson1c_defconfig
+++ b/arch/mips/configs/loongson1c_defconfig
@@ -29,9 +29,6 @@ CONFIG_INET=y
 CONFIG_IP_PNP=y
 CONFIG_IP_PNP_DHCP=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
@@ -116,7 +113,6 @@ CONFIG_NLS_ISO8859_1=m
 # CONFIG_CRYPTO_ECHAINIV is not set
 # CONFIG_CRYPTO_HW is not set
 CONFIG_DYNAMIC_DEBUG=y
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
 # CONFIG_SCHED_DEBUG is not set
diff --git a/arch/mips/configs/loongson2k_defconfig b/arch/mips/configs/loongson2k_defconfig
index e948ca487e2d..728bef666f7a 100644
--- a/arch/mips/configs/loongson2k_defconfig
+++ b/arch/mips/configs/loongson2k_defconfig
@@ -95,7 +95,6 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 CONFIG_MTD=m
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_RAID_ATTRS=m
@@ -229,7 +228,6 @@ CONFIG_SERIAL_8250_RSA=y
 CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_HW_RANDOM=y
-CONFIG_RAW_DRIVER=m
 CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_PIIX4=y
 CONFIG_GPIO_LOONGSON=y
@@ -336,7 +334,6 @@ CONFIG_DEFAULT_SECURITY_DAC=y
 CONFIG_CRYPTO_SEQIV=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5=y
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST5=m
diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig
index 25ecd15bc952..aca66a5f330d 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -143,7 +143,6 @@ CONFIG_DEVTMPFS=y
 CONFIG_DEVTMPFS_MOUNT=y
 CONFIG_MTD=m
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
 CONFIG_VIRTIO_BLK=y
@@ -268,7 +267,6 @@ CONFIG_SERIAL_OF_PLATFORM=y
 CONFIG_SERIAL_NONSTANDARD=y
 CONFIG_VIRTIO_CONSOLE=y
 CONFIG_HW_RANDOM=y
-CONFIG_RAW_DRIVER=m
 CONFIG_I2C_CHARDEV=y
 CONFIG_I2C_PIIX4=y
 CONFIG_GPIO_LOONGSON=y
diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig
index 7a5bdd236a2a..265d38dffbf6 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -42,8 +42,6 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -229,7 +227,6 @@ CONFIG_MTD_UBI=m
 CONFIG_MTD_UBI_GLUEBI=m
 CONFIG_BLK_DEV_FD=m
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
@@ -237,7 +234,6 @@ CONFIG_ATA_OVER_ETH=m
 CONFIG_RAID_ATTRS=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
-CONFIG_CHR_DEV_OSST=m
 CONFIG_BLK_DEV_SR=y
 CONFIG_CHR_DEV_SG=m
 CONFIG_SCSI_CONSTANTS=y
@@ -408,7 +404,6 @@ CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig
index b5ba08d7ab57..1d2b248c7cd3 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -46,8 +46,6 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -233,7 +231,6 @@ CONFIG_MTD_UBI=m
 CONFIG_MTD_UBI_GLUEBI=m
 CONFIG_BLK_DEV_FD=m
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
@@ -241,7 +238,6 @@ CONFIG_ATA_OVER_ETH=m
 CONFIG_RAID_ATTRS=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
-CONFIG_CHR_DEV_OSST=m
 CONFIG_BLK_DEV_SR=y
 CONFIG_CHR_DEV_SG=m
 CONFIG_SCSI_CONSTANTS=y
@@ -415,7 +411,6 @@ CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/malta_qemu_32r6_defconfig b/arch/mips/configs/malta_qemu_32r6_defconfig
index 6fb9bc29f4a0..fd63a2b152f6 100644
--- a/arch/mips/configs/malta_qemu_32r6_defconfig
+++ b/arch/mips/configs/malta_qemu_32r6_defconfig
@@ -76,7 +76,6 @@ CONFIG_NET_ACT_POLICE=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 # CONFIG_SCSI_LOWLEVEL is not set
@@ -98,7 +97,6 @@ CONFIG_PCNET32=y
 # CONFIG_NET_VENDOR_DEC is not set
 # CONFIG_NET_VENDOR_DLINK is not set
 # CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MELLANOX is not set
@@ -172,7 +170,6 @@ CONFIG_NLS_ISO8859_1=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/maltaaprp_defconfig b/arch/mips/configs/maltaaprp_defconfig
index eb72df528243..1f07e354c954 100644
--- a/arch/mips/configs/maltaaprp_defconfig
+++ b/arch/mips/configs/maltaaprp_defconfig
@@ -78,7 +78,6 @@ CONFIG_NET_ACT_POLICE=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 # CONFIG_SCSI_LOWLEVEL is not set
@@ -100,7 +99,6 @@ CONFIG_PCNET32=y
 # CONFIG_NET_VENDOR_DEC is not set
 # CONFIG_NET_VENDOR_DLINK is not set
 # CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MELLANOX is not set
@@ -173,7 +171,6 @@ CONFIG_NLS_ISO8859_1=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/maltasmvp_defconfig b/arch/mips/configs/maltasmvp_defconfig
index 1fb40d310f49..5cd3eca236de 100644
--- a/arch/mips/configs/maltasmvp_defconfig
+++ b/arch/mips/configs/maltasmvp_defconfig
@@ -79,7 +79,6 @@ CONFIG_NET_ACT_POLICE=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 # CONFIG_SCSI_LOWLEVEL is not set
@@ -99,7 +98,6 @@ CONFIG_PCNET32=y
 # CONFIG_NET_VENDOR_DEC is not set
 # CONFIG_NET_VENDOR_DLINK is not set
 # CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MELLANOX is not set
@@ -174,7 +172,6 @@ CONFIG_NLS_ISO8859_1=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/maltasmvp_eva_defconfig b/arch/mips/configs/maltasmvp_eva_defconfig
index 75cb778c6149..45688e742a15 100644
--- a/arch/mips/configs/maltasmvp_eva_defconfig
+++ b/arch/mips/configs/maltasmvp_eva_defconfig
@@ -80,7 +80,6 @@ CONFIG_NET_ACT_POLICE=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 # CONFIG_SCSI_LOWLEVEL is not set
@@ -102,7 +101,6 @@ CONFIG_PCNET32=y
 # CONFIG_NET_VENDOR_DEC is not set
 # CONFIG_NET_VENDOR_DLINK is not set
 # CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MELLANOX is not set
@@ -176,7 +174,6 @@ CONFIG_NLS_ISO8859_1=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/maltaup_defconfig b/arch/mips/configs/maltaup_defconfig
index 7b4f247dc60c..136f965784db 100644
--- a/arch/mips/configs/maltaup_defconfig
+++ b/arch/mips/configs/maltaup_defconfig
@@ -77,7 +77,6 @@ CONFIG_NET_ACT_POLICE=y
 # CONFIG_WIRELESS is not set
 CONFIG_DEVTMPFS=y
 CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_SG=y
 # CONFIG_SCSI_LOWLEVEL is not set
@@ -99,7 +98,6 @@ CONFIG_PCNET32=y
 # CONFIG_NET_VENDOR_DEC is not set
 # CONFIG_NET_VENDOR_DLINK is not set
 # CONFIG_NET_VENDOR_EMULEX is not set
-# CONFIG_NET_VENDOR_HP is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MELLANOX is not set
@@ -172,7 +170,6 @@ CONFIG_NLS_ISO8859_1=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig
index 8d58653f1b4e..75b8da8d9927 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -45,8 +45,6 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -231,7 +229,6 @@ CONFIG_MTD_UBI=m
 CONFIG_MTD_UBI_GLUEBI=m
 CONFIG_BLK_DEV_FD=m
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
@@ -239,7 +236,6 @@ CONFIG_ATA_OVER_ETH=m
 CONFIG_RAID_ATTRS=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
-CONFIG_CHR_DEV_OSST=m
 CONFIG_BLK_DEV_SR=y
 CONFIG_CHR_DEV_SG=m
 CONFIG_SCSI_CONSTANTS=y
@@ -414,7 +410,6 @@ CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig
index 4194e79b435c..efbfaa539938 100644
--- a/arch/mips/configs/mtx1_defconfig
+++ b/arch/mips/configs/mtx1_defconfig
@@ -58,13 +58,9 @@ CONFIG_SYN_COOKIES=y
 CONFIG_INET_AH=m
 CONFIG_INET_ESP=m
 CONFIG_INET_IPCOMP=m
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_TUNNEL=m
 CONFIG_NETWORK_SECMARK=y
 CONFIG_NETFILTER=y
@@ -284,7 +280,6 @@ CONFIG_PCMCIA_XIRCOM=m
 CONFIG_DL2K=m
 CONFIG_SUNDANCE=m
 CONFIG_PCMCIA_FMVJ18X=m
-CONFIG_HP100=m
 CONFIG_E100=m
 CONFIG_E1000=m
 CONFIG_IXGB=m
@@ -368,9 +363,6 @@ CONFIG_HDLC_X25=m
 CONFIG_PCI200SYN=m
 CONFIG_WANXL=m
 CONFIG_FARSYNC=m
-CONFIG_DSCC4=m
-CONFIG_DSCC4_PCISYNC=y
-CONFIG_DSCC4_PCI_RST=y
 CONFIG_LAPBETHER=m
 # CONFIG_KEYBOARD_ATKBD is not set
 CONFIG_KEYBOARD_GPIO=y
@@ -683,7 +675,6 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_MD5=y
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
@@ -692,5 +683,4 @@ CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/mips/configs/omega2p_defconfig b/arch/mips/configs/omega2p_defconfig
index fc39ddf610a9..9c34daf83563 100644
--- a/arch/mips/configs/omega2p_defconfig
+++ b/arch/mips/configs/omega2p_defconfig
@@ -35,9 +35,6 @@ CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig
index fd567247adc7..48dd02d01ac1 100644
--- a/arch/mips/configs/pic32mzda_defconfig
+++ b/arch/mips/configs/pic32mzda_defconfig
@@ -45,7 +45,6 @@ CONFIG_KEYBOARD_GPIO_POLLED=m
 CONFIG_SERIAL_PIC32=y
 CONFIG_SERIAL_PIC32_CONSOLE=y
 CONFIG_HW_RANDOM=y
-CONFIG_RAW_DRIVER=m
 CONFIG_GPIO_SYSFS=y
 # CONFIG_HWMON is not set
 CONFIG_HIDRAW=y
diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig
index 252d472387aa..93306f5e045b 100644
--- a/arch/mips/configs/rb532_defconfig
+++ b/arch/mips/configs/rb532_defconfig
@@ -33,9 +33,6 @@ CONFIG_IP_MULTIPLE_TABLES=y
 CONFIG_IP_ROUTE_MULTIPATH=y
 CONFIG_IP_ROUTE_VERBOSE=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 CONFIG_INET_DIAG=m
 CONFIG_TCP_CONG_ADVANCED=y
 CONFIG_TCP_CONG_CUBIC=m
@@ -162,5 +159,4 @@ CONFIG_SQUASHFS=y
 CONFIG_CRYPTO_TEST=m
 # CONFIG_CRYPTO_HW is not set
 CONFIG_CRC16=m
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
diff --git a/arch/mips/configs/rbtx49xx_defconfig b/arch/mips/configs/rbtx49xx_defconfig
index f8212a813be7..30c195f28278 100644
--- a/arch/mips/configs/rbtx49xx_defconfig
+++ b/arch/mips/configs/rbtx49xx_defconfig
@@ -21,9 +21,6 @@ CONFIG_UNIX=y
 CONFIG_INET=y
 CONFIG_IP_MULTICAST=y
 CONFIG_IP_PNP=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
 CONFIG_MTD=y
@@ -51,7 +48,6 @@ CONFIG_TC35815=y
 CONFIG_SERIAL_TXX9_CONSOLE=y
 CONFIG_SERIAL_TXX9_STDSERIAL=y
 CONFIG_SPI=y
-CONFIG_SPI_TXX9=y
 # CONFIG_HWMON is not set
 CONFIG_WATCHDOG=y
 CONFIG_TXX9_WDT=m
@@ -65,8 +61,6 @@ CONFIG_SND=m
 # CONFIG_SND_SPI is not set
 # CONFIG_SND_MIPS is not set
 CONFIG_SND_SOC=m
-CONFIG_SND_SOC_TXX9ACLC=m
-CONFIG_SND_SOC_TXX9ACLC_GENERIC=m
 # CONFIG_USB_SUPPORT is not set
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -78,7 +72,6 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_INTF_DEV_UIE_EMUL=y
 CONFIG_RTC_DRV_RS5C348=y
 CONFIG_RTC_DRV_DS1742=y
-CONFIG_RTC_DRV_TX4939=y
 CONFIG_DMADEVICES=y
 CONFIG_TXX9_DMAC=m
 # CONFIG_DNOTIFY is not set
diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig
index 7d6f235e8ccb..d6981855221b 100644
--- a/arch/mips/configs/rm200_defconfig
+++ b/arch/mips/configs/rm200_defconfig
@@ -29,9 +29,6 @@ CONFIG_NET_IPIP=m
 CONFIG_IP_MROUTE=y
 CONFIG_IP_PIMSM_V1=y
 CONFIG_IP_PIMSM_V2=y
-CONFIG_INET_XFRM_MODE_TRANSPORT=m
-CONFIG_INET_XFRM_MODE_TUNNEL=m
-CONFIG_INET_XFRM_MODE_BEET=m
 CONFIG_TCP_MD5SIG=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
@@ -39,7 +36,6 @@ CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
 CONFIG_IPV6_MIP6=m
-CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m
 CONFIG_IPV6_TUNNEL=m
 CONFIG_IPV6_MULTIPLE_TABLES=y
 CONFIG_IPV6_SUBTREES=y
@@ -192,9 +188,7 @@ CONFIG_PARIDE_KTTI=m
 CONFIG_PARIDE_ON20=m
 CONFIG_PARIDE_ON26=m
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_SX8=m
 CONFIG_BLK_DEV_RAM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
@@ -400,7 +394,6 @@ CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
 CONFIG_CRYPTO_BLOWFISH=m
diff --git a/arch/mips/configs/rt305x_defconfig b/arch/mips/configs/rt305x_defconfig
index eb359db15dba..bf017d493002 100644
--- a/arch/mips/configs/rt305x_defconfig
+++ b/arch/mips/configs/rt305x_defconfig
@@ -35,9 +35,6 @@ CONFIG_IP_ROUTE_VERBOSE=y
 CONFIG_IP_MROUTE=y
 CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 CONFIG_TCP_CONG_ADVANCED=y
 # CONFIG_TCP_CONG_BIC is not set
@@ -140,7 +137,6 @@ CONFIG_CRC32_SARWATE=y
 # CONFIG_XZ_DEC_ARMTHUMB is not set
 # CONFIG_XZ_DEC_SPARC is not set
 CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
diff --git a/arch/mips/configs/sb1250_swarm_defconfig b/arch/mips/configs/sb1250_swarm_defconfig
index de94bf756a93..030186f89501 100644
--- a/arch/mips/configs/sb1250_swarm_defconfig
+++ b/arch/mips/configs/sb1250_swarm_defconfig
@@ -88,7 +88,6 @@ CONFIG_CRYPTO_XCBC=m
 CONFIG_CRYPTO_MD4=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
@@ -96,7 +95,6 @@ CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
-CONFIG_CRYPTO_SALSA20=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_DEFLATE=m
diff --git a/arch/mips/configs/vocore2_defconfig b/arch/mips/configs/vocore2_defconfig
index a14f8ea5c386..0722a3bf03c0 100644
--- a/arch/mips/configs/vocore2_defconfig
+++ b/arch/mips/configs/vocore2_defconfig
@@ -35,9 +35,6 @@ CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 # CONFIG_IPV6 is not set
 # CONFIG_WIRELESS is not set
diff --git a/arch/mips/configs/xway_defconfig b/arch/mips/configs/xway_defconfig
index eeb689f715cb..eb5acf1f24ae 100644
--- a/arch/mips/configs/xway_defconfig
+++ b/arch/mips/configs/xway_defconfig
@@ -37,9 +37,6 @@ CONFIG_IP_ROUTE_VERBOSE=y
 CONFIG_IP_MROUTE=y
 CONFIG_IP_MROUTE_MULTIPLE_TABLES=y
 CONFIG_SYN_COOKIES=y
-# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
-# CONFIG_INET_XFRM_MODE_TUNNEL is not set
-# CONFIG_INET_XFRM_MODE_BEET is not set
 # CONFIG_INET_DIAG is not set
 CONFIG_TCP_CONG_ADVANCED=y
 # CONFIG_TCP_CONG_BIC is not set
@@ -146,7 +143,6 @@ CONFIG_CRYPTO_ARC4=m
 CONFIG_CRC_ITU_T=m
 CONFIG_CRC32_SARWATE=y
 CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_MUST_CHECK is not set
 CONFIG_STRIP_ASM_SYMS=y
 CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
-- 
cgit v1.2.3


From 0668951705d356d77c8263010c7ae59e78fdb2c6 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 29 Sep 2022 15:39:57 +0800
Subject: MIPS: Silence missing prototype warning

Silence the following two warnings when make W=1:

  CC      arch/mips/lib/bswapsi.o
arch/mips/lib/bswapsi.c:5:22: warning: no previous prototype for '__bswapsi2' [-Wmissing-prototypes]
 unsigned int notrace __bswapsi2(unsigned int u)
                      ^~~~~~~~~~
  CC      arch/mips/lib/bswapdi.o
arch/mips/lib/bswapdi.c:5:28: warning: no previous prototype for '__bswapdi2' [-Wmissing-prototypes]
 unsigned long long notrace __bswapdi2(unsigned long long u)
                            ^~~~~~~~~~
  AR      arch/mips/lib/built-in.a

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/lib/bswapdi.c | 4 +++-
 arch/mips/lib/bswapsi.c | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/mips/lib/bswapdi.c b/arch/mips/lib/bswapdi.c
index fcef74084492..1d020e1c96a0 100644
--- a/arch/mips/lib/bswapdi.c
+++ b/arch/mips/lib/bswapdi.c
@@ -2,6 +2,9 @@
 #include <linux/export.h>
 #include <linux/compiler.h>
 
+/* To silence -Wmissing-prototypes. */
+unsigned long long __bswapdi2(unsigned long long u);
+
 unsigned long long notrace __bswapdi2(unsigned long long u)
 {
 	return (((u) & 0xff00000000000000ull) >> 56) |
@@ -13,5 +16,4 @@ unsigned long long notrace __bswapdi2(unsigned long long u)
 	       (((u) & 0x000000000000ff00ull) << 40) |
 	       (((u) & 0x00000000000000ffull) << 56);
 }
-
 EXPORT_SYMBOL(__bswapdi2);
diff --git a/arch/mips/lib/bswapsi.c b/arch/mips/lib/bswapsi.c
index 22d8e4f6d66e..02d9df489197 100644
--- a/arch/mips/lib/bswapsi.c
+++ b/arch/mips/lib/bswapsi.c
@@ -2,6 +2,9 @@
 #include <linux/export.h>
 #include <linux/compiler.h>
 
+/* To silence -Wmissing-prototypes. */
+unsigned int __bswapsi2(unsigned int u);
+
 unsigned int notrace __bswapsi2(unsigned int u)
 {
 	return (((u) & 0xff000000) >> 24) |
@@ -9,5 +12,4 @@ unsigned int notrace __bswapsi2(unsigned int u)
 	       (((u) & 0x0000ff00) <<  8) |
 	       (((u) & 0x000000ff) << 24);
 }
-
 EXPORT_SYMBOL(__bswapsi2);
-- 
cgit v1.2.3


From 8e6ec6ce02b5a58f01099118ddfed71d3f657b1c Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 29 Sep 2022 15:39:58 +0800
Subject: MIPS: Simplify __bswapdi2() and __bswapsi2()

Use macro definitions ___constant_swab64 and ___constant_swab32
to simplify __bswapdi2() and __bswapsi2().

Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/lib/bswapdi.c | 10 ++--------
 arch/mips/lib/bswapsi.c |  6 ++----
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/arch/mips/lib/bswapdi.c b/arch/mips/lib/bswapdi.c
index 1d020e1c96a0..88242dc7de17 100644
--- a/arch/mips/lib/bswapdi.c
+++ b/arch/mips/lib/bswapdi.c
@@ -1,19 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <uapi/linux/swab.h>
 
 /* To silence -Wmissing-prototypes. */
 unsigned long long __bswapdi2(unsigned long long u);
 
 unsigned long long notrace __bswapdi2(unsigned long long u)
 {
-	return (((u) & 0xff00000000000000ull) >> 56) |
-	       (((u) & 0x00ff000000000000ull) >> 40) |
-	       (((u) & 0x0000ff0000000000ull) >> 24) |
-	       (((u) & 0x000000ff00000000ull) >>  8) |
-	       (((u) & 0x00000000ff000000ull) <<  8) |
-	       (((u) & 0x0000000000ff0000ull) << 24) |
-	       (((u) & 0x000000000000ff00ull) << 40) |
-	       (((u) & 0x00000000000000ffull) << 56);
+	return ___constant_swab64(u);
 }
 EXPORT_SYMBOL(__bswapdi2);
diff --git a/arch/mips/lib/bswapsi.c b/arch/mips/lib/bswapsi.c
index 02d9df489197..2ed655497de5 100644
--- a/arch/mips/lib/bswapsi.c
+++ b/arch/mips/lib/bswapsi.c
@@ -1,15 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <uapi/linux/swab.h>
 
 /* To silence -Wmissing-prototypes. */
 unsigned int __bswapsi2(unsigned int u);
 
 unsigned int notrace __bswapsi2(unsigned int u)
 {
-	return (((u) & 0xff000000) >> 24) |
-	       (((u) & 0x00ff0000) >>  8) |
-	       (((u) & 0x0000ff00) <<  8) |
-	       (((u) & 0x000000ff) << 24);
+	return ___constant_swab32(u);
 }
 EXPORT_SYMBOL(__bswapsi2);
-- 
cgit v1.2.3


From 40158dbf7eb2b13d8851fe0b875b4c3170ea15db Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@igalia.com>
Date: Thu, 29 Sep 2022 18:55:15 -0300
Subject: Revert "pstore: migrate to crypto acomp interface"

This reverts commit e4f0a7ec586b7644107839f5394fb685cf1aadcc.

When using this new interface, both efi_pstore and ramoops
backends are unable to properly decompress dmesg if using
zstd, lz4 and lzo algorithms (and maybe more). It does succeed
with deflate though.

The message observed in the kernel log is:

[2.328828] pstore: crypto_acomp_decompress failed, ret = -22!

The pstore infrastructure is able to collect the dmesg with
both backends tested, but since decompression fails it's
unreadable. With this revert everything is back to normal.

Fixes: e4f0a7ec586b ("pstore: migrate to crypto acomp interface")
Cc: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220929215515.276486-1-gpiccoli@igalia.com
---
 fs/pstore/platform.c | 63 ++++++++++------------------------------------------
 1 file changed, 12 insertions(+), 51 deletions(-)

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b2fd3c20e7c2..0c034ea39954 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -28,14 +28,11 @@
 #include <linux/crypto.h>
 #include <linux/string.h>
 #include <linux/timer.h>
-#include <linux/scatterlist.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/jiffies.h>
 #include <linux/workqueue.h>
 
-#include <crypto/acompress.h>
-
 #include "internal.h"
 
 /*
@@ -93,8 +90,7 @@ module_param(compress, charp, 0444);
 MODULE_PARM_DESC(compress, "compression to use");
 
 /* Compression parameters */
-static struct crypto_acomp *tfm;
-static struct acomp_req *creq;
+static struct crypto_comp *tfm;
 
 struct pstore_zbackend {
 	int (*zbufsize)(size_t size);
@@ -272,21 +268,12 @@ static const struct pstore_zbackend zbackends[] = {
 static int pstore_compress(const void *in, void *out,
 			   unsigned int inlen, unsigned int outlen)
 {
-	struct scatterlist src, dst;
 	int ret;
 
 	if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
 		return -EINVAL;
 
-	sg_init_table(&src, 1);
-	sg_set_buf(&src, in, inlen);
-
-	sg_init_table(&dst, 1);
-	sg_set_buf(&dst, out, outlen);
-
-	acomp_request_set_params(creq, &src, &dst, inlen, outlen);
-
-	ret = crypto_acomp_compress(creq);
+	ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
 	if (ret) {
 		pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
 		return ret;
@@ -297,7 +284,7 @@ static int pstore_compress(const void *in, void *out,
 
 static void allocate_buf_for_compression(void)
 {
-	struct crypto_acomp *acomp;
+	struct crypto_comp *ctx;
 	int size;
 	char *buf;
 
@@ -309,7 +296,7 @@ static void allocate_buf_for_compression(void)
 	if (!psinfo || tfm)
 		return;
 
-	if (!crypto_has_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC)) {
+	if (!crypto_has_comp(zbackend->name, 0, 0)) {
 		pr_err("Unknown compression: %s\n", zbackend->name);
 		return;
 	}
@@ -328,24 +315,16 @@ static void allocate_buf_for_compression(void)
 		return;
 	}
 
-	acomp = crypto_alloc_acomp(zbackend->name, 0, CRYPTO_ALG_ASYNC);
-	if (IS_ERR_OR_NULL(acomp)) {
+	ctx = crypto_alloc_comp(zbackend->name, 0, 0);
+	if (IS_ERR_OR_NULL(ctx)) {
 		kfree(buf);
 		pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
-		       PTR_ERR(acomp));
-		return;
-	}
-
-	creq = acomp_request_alloc(acomp);
-	if (!creq) {
-		crypto_free_acomp(acomp);
-		kfree(buf);
-		pr_err("acomp_request_alloc('%s') failed\n", zbackend->name);
+		       PTR_ERR(ctx));
 		return;
 	}
 
 	/* A non-NULL big_oops_buf indicates compression is available. */
-	tfm = acomp;
+	tfm = ctx;
 	big_oops_buf_sz = size;
 	big_oops_buf = buf;
 
@@ -355,8 +334,7 @@ static void allocate_buf_for_compression(void)
 static void free_buf_for_compression(void)
 {
 	if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
-		acomp_request_free(creq);
-		crypto_free_acomp(tfm);
+		crypto_free_comp(tfm);
 		tfm = NULL;
 	}
 	kfree(big_oops_buf);
@@ -693,8 +671,6 @@ static void decompress_record(struct pstore_record *record)
 	int ret;
 	int unzipped_len;
 	char *unzipped, *workspace;
-	struct acomp_req *dreq;
-	struct scatterlist src, dst;
 
 	if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
 		return;
@@ -718,30 +694,16 @@ static void decompress_record(struct pstore_record *record)
 	if (!workspace)
 		return;
 
-	dreq = acomp_request_alloc(tfm);
-	if (!dreq) {
-		kfree(workspace);
-		return;
-	}
-
-	sg_init_table(&src, 1);
-	sg_set_buf(&src, record->buf, record->size);
-
-	sg_init_table(&dst, 1);
-	sg_set_buf(&dst, workspace, unzipped_len);
-
-	acomp_request_set_params(dreq, &src, &dst, record->size, unzipped_len);
-
 	/* After decompression "unzipped_len" is almost certainly smaller. */
-	ret = crypto_acomp_decompress(dreq);
+	ret = crypto_comp_decompress(tfm, record->buf, record->size,
+					  workspace, &unzipped_len);
 	if (ret) {
-		pr_err("crypto_acomp_decompress failed, ret = %d!\n", ret);
+		pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
 		kfree(workspace);
 		return;
 	}
 
 	/* Append ECC notice to decompressed buffer. */
-	unzipped_len = dreq->dlen;
 	memcpy(workspace + unzipped_len, record->buf + record->size,
 	       record->ecc_notice_size);
 
@@ -749,7 +711,6 @@ static void decompress_record(struct pstore_record *record)
 	unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
 			   GFP_KERNEL);
 	kfree(workspace);
-	acomp_request_free(dreq);
 	if (!unzipped)
 		return;
 
-- 
cgit v1.2.3


From bab715bdaa9ebf28d99a6d1efb2704a30125e96d Mon Sep 17 00:00:00 2001
From: Orlando Chamberlain <redecorating@protonmail.com>
Date: Thu, 29 Sep 2022 11:49:56 +0000
Subject: efi: Correct Macmini DMI match in uefi cert quirk

It turns out Apple doesn't capitalise the "mini" in "Macmini" in DMI, which
is inconsistent with other model line names.

Correct the capitalisation of Macmini in the quirk for skipping loading
platform certs on T2 Macs.

Currently users get:

------------[ cut here ]------------
[Firmware Bug]: Page fault caused by firmware at PA: 0xffffa30640054000
WARNING: CPU: 1 PID: 8 at arch/x86/platform/efi/quirks.c:735 efi_crash_gracefully_on_page_fault+0x55/0xe0
Modules linked in:
CPU: 1 PID: 8 Comm: kworker/u12:0 Not tainted 5.18.14-arch1-2-t2 #1 4535eb3fc40fd08edab32a509fbf4c9bc52d111e
Hardware name: Apple Inc. Macmini8,1/Mac-7BA5B2DFE22DDD8C, BIOS 1731.120.10.0.0 (iBridge: 19.16.15071.0.0,0) 04/24/2022
Workqueue: efi_rts_wq efi_call_rts
...
---[ end trace 0000000000000000 ]---
efi: Froze efi_rts_wq and disabled EFI Runtime Services
integrity: Couldn't get size: 0x8000000000000015
integrity: MODSIGN: Couldn't get UEFI db list
efi: EFI Runtime Services are disabled!
integrity: Couldn't get size: 0x8000000000000015
integrity: Couldn't get UEFI dbx list

Fixes: 155ca952c7ca ("efi: Do not import certificates from UEFI Secure Boot for T2 Macs")
Cc: stable@vger.kernel.org
Cc: Aditya Garg <gargaditya08@live.com>
Tested-by: Samuel Jiang <chyishian.jiang@gmail.com>
Signed-off-by: Orlando Chamberlain <redecorating@protonmail.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 security/integrity/platform_certs/load_uefi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/integrity/platform_certs/load_uefi.c b/security/integrity/platform_certs/load_uefi.c
index 093894a640dc..b78753d27d8e 100644
--- a/security/integrity/platform_certs/load_uefi.c
+++ b/security/integrity/platform_certs/load_uefi.c
@@ -31,7 +31,7 @@ static const struct dmi_system_id uefi_skip_cert[] = {
 	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "MacBookAir8,1") },
 	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "MacBookAir8,2") },
 	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "MacBookAir9,1") },
-	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "MacMini8,1") },
+	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "Macmini8,1") },
 	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "MacPro7,1") },
 	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "iMac20,1") },
 	{ UEFI_QUIRK_SKIP_CERT("Apple Inc.", "iMac20,2") },
-- 
cgit v1.2.3


From 82b1ec794d701478381482264f3bfada3a7bf2d9 Mon Sep 17 00:00:00 2001
From: Sumeet Pawnikar <sumeet.r.pawnikar@intel.com>
Date: Tue, 27 Sep 2022 21:17:09 +0530
Subject: thermal: core: Increase maximum number of trip points

On one of the Chrome system, if we define more than 12 trip points,
probe for thermal sensor fails with
"int3403 thermal: probe of INTC1046:03 failed with error -22"
and throws an error as
"thermal_sys: Error: Incorrect number of thermal trips".

The thermal_zone_device_register() interface needs maximum
number of trip points supported in a zone as an argument.
This number can't exceed THERMAL_MAX_TRIPS, which is currently
set to 12. To address this issue, THERMAL_MAX_TRIPS value
has to be increased.

This interface also has an argument to specify a mask of trips
which are writable. This mask is defined as an int.
This mask sets the ceiling for increasing maximum number of
supported trips. With the current implementation, maximum number
of trips can be supported is 31.

Also, THERMAL_MAX_TRIPS macro is used in one place only.
So, remove THERMAL_MAX_TRIPS macro and compare num_trips
directly with using a macro BITS_PER_TYPE(int)-1.

Signed-off-by: Sumeet Pawnikar <sumeet.r.pawnikar@intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/thermal_core.c | 15 ++++++++++++++-
 include/linux/thermal.h        |  2 --
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 7dc7cb53ce6f..7e669b60a065 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -1192,7 +1192,20 @@ thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *t
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (num_trips > THERMAL_MAX_TRIPS || num_trips < 0 || mask >> num_trips) {
+	/*
+	 * Max trip count can't exceed 31 as the "mask >> num_trips" condition.
+	 * For example, shifting by 32 will result in compiler warning:
+	 * warning: right shift count >= width of type [-Wshift-count- overflow]
+	 *
+	 * Also "mask >> num_trips" will always be true with 32 bit shift.
+	 * E.g. mask = 0x80000000 for trip id 31 to be RW. Then
+	 * mask >> 32 = 0x80000000
+	 * This will result in failure for the below condition.
+	 *
+	 * Check will be true when the bit 31 of the mask is set.
+	 * 32 bit shift will cause overflow of 4 byte integer.
+	 */
+	if (num_trips > (BITS_PER_TYPE(int) - 1) || num_trips < 0 || mask >> num_trips) {
 		pr_err("Incorrect number of thermal trips\n");
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 86c24ddd5985..6f1ec4fb7ef8 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -17,8 +17,6 @@
 #include <linux/workqueue.h>
 #include <uapi/linux/thermal.h>
 
-#define THERMAL_MAX_TRIPS	12
-
 /* invalid cooling state */
 #define THERMAL_CSTATE_INVALID -1UL
 
-- 
cgit v1.2.3


From da13b3361bb609f5e3fde3f57b8e2b42001513a3 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 29 Aug 2022 17:11:28 +0300
Subject: ACPI: LPSS: Replace loop with first entry retrieval

After the commit 6505e452371d ("ACPI: LPSS: Use the helper
acpi_dev_get_memory_resources()") the list is empty or
contains only resource of IORESOURCE_MEM type. Hence, no
need to check for the type, and since we break after the
first found, no need to iterate over full list. That said,
replace loop with first entry retrieval.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_lpss.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index 4f6cba8fe8de..c39a0a88f3a3 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -656,16 +656,14 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 	if (ret < 0)
 		goto err_out;
 
-	list_for_each_entry(rentry, &resource_list, node)
-		if (resource_type(rentry->res) == IORESOURCE_MEM) {
-			if (dev_desc->prv_size_override)
-				pdata->mmio_size = dev_desc->prv_size_override;
-			else
-				pdata->mmio_size = resource_size(rentry->res);
-			pdata->mmio_base = ioremap(rentry->res->start,
-						   pdata->mmio_size);
-			break;
-		}
+	rentry = list_first_entry_or_null(&resource_list, struct resource_entry, node);
+	if (rentry) {
+		if (dev_desc->prv_size_override)
+			pdata->mmio_size = dev_desc->prv_size_override;
+		else
+			pdata->mmio_size = resource_size(rentry->res);
+		pdata->mmio_base = ioremap(rentry->res->start, pdata->mmio_size);
+	}
 
 	acpi_dev_free_resource_list(&resource_list);
 
-- 
cgit v1.2.3


From 6cc401be16482bf35522c328be395507e331f9df Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 29 Aug 2022 17:11:29 +0300
Subject: ACPI: LPSS: Deduplicate skipping device in acpi_lpss_create_device()

Add a new label to deduplicate skipping device code in the
acpi_lpss_create_device(). No functional change intended.

While at it, convert the last conditional to use the classical
pattern, i.e.

	if (err)
		...handle err...

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/acpi_lpss.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
index c39a0a88f3a3..7a73528aa9c2 100644
--- a/drivers/acpi/acpi_lpss.c
+++ b/drivers/acpi/acpi_lpss.c
@@ -670,9 +670,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 	if (!pdata->mmio_base) {
 		/* Avoid acpi_bus_attach() instantiating a pdev for this dev. */
 		adev->pnp.type.platform_id = 0;
-		/* Skip the device, but continue the namespace scan. */
-		ret = 0;
-		goto err_out;
+		goto out_free;
 	}
 
 	pdata->adev = adev;
@@ -683,11 +681,8 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 
 	if (dev_desc->flags & LPSS_CLK) {
 		ret = register_device_clock(adev, pdata);
-		if (ret) {
-			/* Skip the device, but continue the namespace scan. */
-			ret = 0;
-			goto err_out;
-		}
+		if (ret)
+			goto out_free;
 	}
 
 	/*
@@ -699,15 +694,19 @@ static int acpi_lpss_create_device(struct acpi_device *adev,
 
 	adev->driver_data = pdata;
 	pdev = acpi_create_platform_device(adev, dev_desc->properties);
-	if (!IS_ERR_OR_NULL(pdev)) {
-		acpi_lpss_create_device_links(adev, pdev);
-		return 1;
+	if (IS_ERR_OR_NULL(pdev)) {
+		adev->driver_data = NULL;
+		ret = PTR_ERR(pdev);
+		goto err_out;
 	}
 
-	ret = PTR_ERR(pdev);
-	adev->driver_data = NULL;
+	acpi_lpss_create_device_links(adev, pdev);
+	return 1;
 
- err_out:
+out_free:
+	/* Skip the device, but continue the namespace scan */
+	ret = 0;
+err_out:
 	kfree(pdata);
 	return ret;
 }
-- 
cgit v1.2.3


From 6166da0a02cde26c065692d0c05eb685178fee75 Mon Sep 17 00:00:00 2001
From: Dave Thaler <dthaler@microsoft.com>
Date: Tue, 27 Sep 2022 18:59:44 +0000
Subject: bpf, docs: Move legacy packet instructions to a separate file

Move legacy packet instructions to a separate file.

Signed-off-by: Dave Thaler <dthaler@microsoft.com>
Link: https://lore.kernel.org/r/20220927185958.14995-1-dthaler1968@googlemail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/instruction-set.rst | 38 ++------------------
 Documentation/bpf/linux-notes.rst     | 65 +++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 35 deletions(-)
 create mode 100644 Documentation/bpf/linux-notes.rst

diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst
index 1b0e6711dec9..352f25a1eb17 100644
--- a/Documentation/bpf/instruction-set.rst
+++ b/Documentation/bpf/instruction-set.rst
@@ -282,8 +282,6 @@ arithmetic operations in the imm field to encode the atomic operation:
 
   *(u64 *)(dst_reg + off16) += src_reg
 
-``BPF_XADD`` is a deprecated name for ``BPF_ATOMIC | BPF_ADD``.
-
 In addition to the simple atomic operations, there also is a modifier and
 two complex atomic operations:
 
@@ -331,36 +329,6 @@ There is currently only one such instruction.
 Legacy BPF Packet access instructions
 -------------------------------------
 
-eBPF has special instructions for access to packet data that have been
-carried over from classic BPF to retain the performance of legacy socket
-filters running in the eBPF interpreter.
-
-The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
-``BPF_IND | <size> | BPF_LD``.
-
-These instructions are used to access packet data and can only be used when
-the program context is a pointer to networking packet.  ``BPF_ABS``
-accesses packet data at an absolute offset specified by the immediate data
-and ``BPF_IND`` access packet data at an offset that includes the value of
-a register in addition to the immediate data.
-
-These instructions have seven implicit operands:
-
- * Register R6 is an implicit input that must contain pointer to a
-   struct sk_buff.
- * Register R0 is an implicit output which contains the data fetched from
-   the packet.
- * Registers R1-R5 are scratch registers that are clobbered after a call to
-   ``BPF_ABS | BPF_LD`` or ``BPF_IND | BPF_LD`` instructions.
-
-These instructions have an implicit program exit condition as well. When an
-eBPF program is trying to access the data beyond the packet boundary, the
-program execution will be aborted.
-
-``BPF_ABS | BPF_W | BPF_LD`` means::
-
-  R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + imm32))
-
-``BPF_IND | BPF_W | BPF_LD`` means::
-
-  R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
+eBPF previously introduced special instructions for access to packet data that were
+carried over from classic BPF. However, these instructions are
+deprecated and should no longer be used.
diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst
new file mode 100644
index 000000000000..93c01386d92c
--- /dev/null
+++ b/Documentation/bpf/linux-notes.rst
@@ -0,0 +1,65 @@
+.. contents::
+.. sectnum::
+
+==========================
+Linux implementation notes
+==========================
+
+This document provides more details specific to the Linux kernel implementation of the eBPF instruction set.
+
+Legacy BPF Packet access instructions
+=====================================
+
+As mentioned in the `ISA standard documentation <instruction-set.rst#legacy-bpf-packet-access-instructions>`_,
+Linux has special eBPF instructions for access to packet data that have been
+carried over from classic BPF to retain the performance of legacy socket
+filters running in the eBPF interpreter.
+
+The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
+``BPF_IND | <size> | BPF_LD``.
+
+These instructions are used to access packet data and can only be used when
+the program context is a pointer to a networking packet.  ``BPF_ABS``
+accesses packet data at an absolute offset specified by the immediate data
+and ``BPF_IND`` access packet data at an offset that includes the value of
+a register in addition to the immediate data.
+
+These instructions have seven implicit operands:
+
+* Register R6 is an implicit input that must contain a pointer to a
+  struct sk_buff.
+* Register R0 is an implicit output which contains the data fetched from
+  the packet.
+* Registers R1-R5 are scratch registers that are clobbered by the
+  instruction.
+
+These instructions have an implicit program exit condition as well. If an
+eBPF program attempts access data beyond the packet boundary, the
+program execution will be aborted.
+
+``BPF_ABS | BPF_W | BPF_LD`` (0x20) means::
+
+  R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + imm))
+
+where ``ntohl()`` converts a 32-bit value from network byte order to host byte order.
+
+``BPF_IND | BPF_W | BPF_LD`` (0x40) means::
+
+  R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm))
+
+Appendix
+========
+
+For reference, the following table lists legacy Linux-specific opcodes in order by value.
+
+======  ====  ===================================================  =============
+opcode  imm   description                                          reference
+======  ====  ===================================================  =============
+0x20    any   dst = ntohl(\*(uint32_t \*)(R6->data + imm))         `Legacy BPF Packet access instructions`_
+0x28    any   dst = ntohs(\*(uint16_t \*)(R6->data + imm))         `Legacy BPF Packet access instructions`_
+0x30    any   dst = (\*(uint8_t \*)(R6->data + imm))               `Legacy BPF Packet access instructions`_
+0x38    any   dst = ntohll(\*(uint64_t \*)(R6->data + imm))        `Legacy BPF Packet access instructions`_
+0x40    any   dst = ntohl(\*(uint32_t \*)(R6->data + src + imm))   `Legacy BPF Packet access instructions`_
+0x48    any   dst = ntohs(\*(uint16_t \*)(R6->data + src + imm))   `Legacy BPF Packet access instructions`_
+0x50    any   dst = \*(uint8_t \*)(R6->data + src + imm))          `Legacy BPF Packet access instructions`_
+0x58    any   dst = ntohll(\*(uint64_t \*)(R6->data + src + imm))  `Legacy BPF Packet access instructions`_
-- 
cgit v1.2.3


From 9a0bf21337c667375d918adc41239ce54304a12c Mon Sep 17 00:00:00 2001
From: Dave Thaler <dthaler@microsoft.com>
Date: Tue, 27 Sep 2022 18:59:45 +0000
Subject: bpf, docs: Linux byteswap note

Add Linux byteswap note.

Signed-off-by: Dave Thaler <dthaler@microsoft.com>
Link: https://lore.kernel.org/r/20220927185958.14995-2-dthaler1968@googlemail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/instruction-set.rst | 4 ----
 Documentation/bpf/linux-notes.rst     | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst
index 352f25a1eb17..1735b91ec4c7 100644
--- a/Documentation/bpf/instruction-set.rst
+++ b/Documentation/bpf/instruction-set.rst
@@ -156,10 +156,6 @@ Examples:
 
   dst_reg = htobe64(dst_reg)
 
-``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and
-``BPF_TO_BE`` respectively.
-
-
 Jump instructions
 -----------------
 
diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst
index 93c01386d92c..1c31379b469f 100644
--- a/Documentation/bpf/linux-notes.rst
+++ b/Documentation/bpf/linux-notes.rst
@@ -7,6 +7,11 @@ Linux implementation notes
 
 This document provides more details specific to the Linux kernel implementation of the eBPF instruction set.
 
+Byte swap instructions
+======================
+
+``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and ``BPF_TO_BE`` respectively.
+
 Legacy BPF Packet access instructions
 =====================================
 
-- 
cgit v1.2.3


From 6c7aaffb24efbd5d1ae067b2b629b3ffcc37e18e Mon Sep 17 00:00:00 2001
From: Dave Thaler <dthaler@microsoft.com>
Date: Tue, 27 Sep 2022 18:59:46 +0000
Subject: bpf, docs: Move Clang notes to a separate file

Move Clang notes to a separate file.

Signed-off-by: Dave Thaler <dthaler@microsoft.com>
Link: https://lore.kernel.org/r/20220927185958.14995-3-dthaler1968@googlemail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/clang-notes.rst     | 24 ++++++++++++++++++++++++
 Documentation/bpf/instruction-set.rst |  6 ------
 2 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/bpf/clang-notes.rst

diff --git a/Documentation/bpf/clang-notes.rst b/Documentation/bpf/clang-notes.rst
new file mode 100644
index 000000000000..b15179cb5117
--- /dev/null
+++ b/Documentation/bpf/clang-notes.rst
@@ -0,0 +1,24 @@
+.. contents::
+.. sectnum::
+
+==========================
+Clang implementation notes
+==========================
+
+This document provides more details specific to the Clang/LLVM implementation of the eBPF instruction set.
+
+Versions
+========
+
+Clang defined "CPU" versions, where a CPU version of 3 corresponds to the current eBPF ISA.
+
+Clang can select the eBPF ISA version using ``-mcpu=v3`` for example to select version 3.
+
+Atomic operations
+=================
+
+Clang can generate atomic instructions by default when ``-mcpu=v3`` is
+enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
+Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
+the atomics features, while keeping a lower ``-mcpu`` version, you can use
+``-Xclang -target-feature -Xclang +alu32``.
diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst
index 1735b91ec4c7..541483118f65 100644
--- a/Documentation/bpf/instruction-set.rst
+++ b/Documentation/bpf/instruction-set.rst
@@ -303,12 +303,6 @@ The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
 value that was at ``dst_reg + off`` before the operation is zero-extended
 and loaded back to ``R0``.
 
-Clang can generate atomic instructions by default when ``-mcpu=v3`` is
-enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
-Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
-the atomics features, while keeping a lower ``-mcpu`` version, you can use
-``-Xclang -target-feature -Xclang +alu32``.
-
 64-bit immediate instructions
 -----------------------------
 
-- 
cgit v1.2.3


From ee159bdbdbce293e66d7b9249208f367faff5d81 Mon Sep 17 00:00:00 2001
From: Dave Thaler <dthaler@microsoft.com>
Date: Tue, 27 Sep 2022 18:59:47 +0000
Subject: bpf, docs: Add Clang note about BPF_ALU

Add Clang note about BPF_ALU.

Signed-off-by: Dave Thaler <dthaler@microsoft.com>
Link: https://lore.kernel.org/r/20220927185958.14995-4-dthaler1968@googlemail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/clang-notes.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/bpf/clang-notes.rst b/Documentation/bpf/clang-notes.rst
index b15179cb5117..528feddf2db9 100644
--- a/Documentation/bpf/clang-notes.rst
+++ b/Documentation/bpf/clang-notes.rst
@@ -14,6 +14,12 @@ Clang defined "CPU" versions, where a CPU version of 3 corresponds to the curren
 
 Clang can select the eBPF ISA version using ``-mcpu=v3`` for example to select version 3.
 
+Arithmetic instructions
+=======================
+
+For CPU versions prior to 3, Clang v7.0 and later can enable ``BPF_ALU`` support with
+``-Xclang -target-feature -Xclang +alu32``.  In CPU version 3, support is automatically included.
+
 Atomic operations
 =================
 
-- 
cgit v1.2.3


From 5a8921ba96ceaec0c00c8855e48940d2739c5c3b Mon Sep 17 00:00:00 2001
From: Dave Thaler <dthaler@microsoft.com>
Date: Tue, 27 Sep 2022 18:59:48 +0000
Subject: bpf, docs: Add TOC and fix formatting.

Add TOC and fix formatting.

Signed-off-by: Dave Thaler <dthaler@microsoft.com>
Link: https://lore.kernel.org/r/20220927185958.14995-5-dthaler1968@googlemail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/instruction-set.rst | 268 +++++++++++++++++-----------------
 1 file changed, 136 insertions(+), 132 deletions(-)

diff --git a/Documentation/bpf/instruction-set.rst b/Documentation/bpf/instruction-set.rst
index 541483118f65..4997d2088fef 100644
--- a/Documentation/bpf/instruction-set.rst
+++ b/Documentation/bpf/instruction-set.rst
@@ -1,7 +1,12 @@
+.. contents::
+.. sectnum::
+
+========================================
+eBPF Instruction Set Specification, v1.0
+========================================
+
+This document specifies version 1.0 of the eBPF instruction set.
 
-====================
-eBPF Instruction Set
-====================
 
 Registers and calling convention
 ================================
@@ -11,10 +16,10 @@ all of which are 64-bits wide.
 
 The eBPF calling convention is defined as:
 
- * R0: return value from function calls, and exit value for eBPF programs
- * R1 - R5: arguments for function calls
- * R6 - R9: callee saved registers that function calls will preserve
- * R10: read-only frame pointer to access stack
+* R0: return value from function calls, and exit value for eBPF programs
+* R1 - R5: arguments for function calls
+* R6 - R9: callee saved registers that function calls will preserve
+* R10: read-only frame pointer to access stack
 
 R0 - R5 are scratch registers and eBPF programs needs to spill/fill them if
 necessary across calls.
@@ -24,17 +29,17 @@ Instruction encoding
 
 eBPF has two instruction encodings:
 
- * the basic instruction encoding, which uses 64 bits to encode an instruction
- * the wide instruction encoding, which appends a second 64-bit immediate value
-   (imm64) after the basic instruction for a total of 128 bits.
+* the basic instruction encoding, which uses 64 bits to encode an instruction
+* the wide instruction encoding, which appends a second 64-bit immediate value
+  (imm64) after the basic instruction for a total of 128 bits.
 
 The basic instruction encoding looks as follows:
 
- =============  =======  ===============  ====================  ============
- 32 bits (MSB)  16 bits  4 bits           4 bits                8 bits (LSB)
- =============  =======  ===============  ====================  ============
- immediate      offset   source register  destination register  opcode
- =============  =======  ===============  ====================  ============
+=============  =======  ===============  ====================  ============
+32 bits (MSB)  16 bits  4 bits           4 bits                8 bits (LSB)
+=============  =======  ===============  ====================  ============
+immediate      offset   source register  destination register  opcode
+=============  =======  ===============  ====================  ============
 
 Note that most instructions do not use all of the fields.
 Unused fields shall be cleared to zero.
@@ -44,30 +49,30 @@ Instruction classes
 
 The three LSB bits of the 'opcode' field store the instruction class:
 
-  =========  =====  ===============================
-  class      value  description
-  =========  =====  ===============================
-  BPF_LD     0x00   non-standard load operations
-  BPF_LDX    0x01   load into register operations
-  BPF_ST     0x02   store from immediate operations
-  BPF_STX    0x03   store from register operations
-  BPF_ALU    0x04   32-bit arithmetic operations
-  BPF_JMP    0x05   64-bit jump operations
-  BPF_JMP32  0x06   32-bit jump operations
-  BPF_ALU64  0x07   64-bit arithmetic operations
-  =========  =====  ===============================
+=========  =====  ===============================  ===================================
+class      value  description                      reference
+=========  =====  ===============================  ===================================
+BPF_LD     0x00   non-standard load operations     `Load and store instructions`_
+BPF_LDX    0x01   load into register operations    `Load and store instructions`_
+BPF_ST     0x02   store from immediate operations  `Load and store instructions`_
+BPF_STX    0x03   store from register operations   `Load and store instructions`_
+BPF_ALU    0x04   32-bit arithmetic operations     `Arithmetic and jump instructions`_
+BPF_JMP    0x05   64-bit jump operations           `Arithmetic and jump instructions`_
+BPF_JMP32  0x06   32-bit jump operations           `Arithmetic and jump instructions`_
+BPF_ALU64  0x07   64-bit arithmetic operations     `Arithmetic and jump instructions`_
+=========  =====  ===============================  ===================================
 
 Arithmetic and jump instructions
 ================================
 
-For arithmetic and jump instructions (BPF_ALU, BPF_ALU64, BPF_JMP and
-BPF_JMP32), the 8-bit 'opcode' field is divided into three parts:
+For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and
+``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts:
 
-  ==============  ======  =================
-  4 bits (MSB)    1 bit   3 bits (LSB)
-  ==============  ======  =================
-  operation code  source  instruction class
-  ==============  ======  =================
+==============  ======  =================
+4 bits (MSB)    1 bit   3 bits (LSB)
+==============  ======  =================
+operation code  source  instruction class
+==============  ======  =================
 
 The 4th bit encodes the source operand:
 
@@ -84,51 +89,51 @@ The four MSB bits store the operation code.
 Arithmetic instructions
 -----------------------
 
-BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for
+``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for
 otherwise identical operations.
-The code field encodes the operation as below:
-
-  ========  =====  =================================================
-  code      value  description
-  ========  =====  =================================================
-  BPF_ADD   0x00   dst += src
-  BPF_SUB   0x10   dst -= src
-  BPF_MUL   0x20   dst \*= src
-  BPF_DIV   0x30   dst /= src
-  BPF_OR    0x40   dst \|= src
-  BPF_AND   0x50   dst &= src
-  BPF_LSH   0x60   dst <<= src
-  BPF_RSH   0x70   dst >>= src
-  BPF_NEG   0x80   dst = ~src
-  BPF_MOD   0x90   dst %= src
-  BPF_XOR   0xa0   dst ^= src
-  BPF_MOV   0xb0   dst = src
-  BPF_ARSH  0xc0   sign extending shift right
-  BPF_END   0xd0   byte swap operations (see separate section below)
-  ========  =====  =================================================
-
-BPF_ADD | BPF_X | BPF_ALU means::
+The 'code' field encodes the operation as below:
+
+========  =====  ==========================================================
+code      value  description
+========  =====  ==========================================================
+BPF_ADD   0x00   dst += src
+BPF_SUB   0x10   dst -= src
+BPF_MUL   0x20   dst \*= src
+BPF_DIV   0x30   dst /= src
+BPF_OR    0x40   dst \|= src
+BPF_AND   0x50   dst &= src
+BPF_LSH   0x60   dst <<= src
+BPF_RSH   0x70   dst >>= src
+BPF_NEG   0x80   dst = ~src
+BPF_MOD   0x90   dst %= src
+BPF_XOR   0xa0   dst ^= src
+BPF_MOV   0xb0   dst = src
+BPF_ARSH  0xc0   sign extending shift right
+BPF_END   0xd0   byte swap operations (see `Byte swap instructions`_ below)
+========  =====  ==========================================================
+
+``BPF_ADD | BPF_X | BPF_ALU`` means::
 
   dst_reg = (u32) dst_reg + (u32) src_reg;
 
-BPF_ADD | BPF_X | BPF_ALU64 means::
+``BPF_ADD | BPF_X | BPF_ALU64`` means::
 
   dst_reg = dst_reg + src_reg
 
-BPF_XOR | BPF_K | BPF_ALU means::
+``BPF_XOR | BPF_K | BPF_ALU`` means::
 
   src_reg = (u32) src_reg ^ (u32) imm32
 
-BPF_XOR | BPF_K | BPF_ALU64 means::
+``BPF_XOR | BPF_K | BPF_ALU64`` means::
 
   src_reg = src_reg ^ imm32
 
 
 Byte swap instructions
-----------------------
+~~~~~~~~~~~~~~~~~~~~~~
 
 The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit
-code field of ``BPF_END``.
+'code' field of ``BPF_END``.
 
 The byte swap instructions operate on the destination register
 only and do not use a separate source register or immediate value.
@@ -136,14 +141,14 @@ only and do not use a separate source register or immediate value.
 The 1-bit source operand field in the opcode is used to to select what byte
 order the operation convert from or to:
 
-  =========  =====  =================================================
-  source     value  description
-  =========  =====  =================================================
-  BPF_TO_LE  0x00   convert between host byte order and little endian
-  BPF_TO_BE  0x08   convert between host byte order and big endian
-  =========  =====  =================================================
+=========  =====  =================================================
+source     value  description
+=========  =====  =================================================
+BPF_TO_LE  0x00   convert between host byte order and little endian
+BPF_TO_BE  0x08   convert between host byte order and big endian
+=========  =====  =================================================
 
-The imm field encodes the width of the swap operations.  The following widths
+The 'imm' field encodes the width of the swap operations.  The following widths
 are supported: 16, 32 and 64.
 
 Examples:
@@ -159,28 +164,28 @@ Examples:
 Jump instructions
 -----------------
 
-BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for
+``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for
 otherwise identical operations.
-The code field encodes the operation as below:
-
-  ========  =====  =========================  ============
-  code      value  description                notes
-  ========  =====  =========================  ============
-  BPF_JA    0x00   PC += off                  BPF_JMP only
-  BPF_JEQ   0x10   PC += off if dst == src
-  BPF_JGT   0x20   PC += off if dst > src     unsigned
-  BPF_JGE   0x30   PC += off if dst >= src    unsigned
-  BPF_JSET  0x40   PC += off if dst & src
-  BPF_JNE   0x50   PC += off if dst != src
-  BPF_JSGT  0x60   PC += off if dst > src     signed
-  BPF_JSGE  0x70   PC += off if dst >= src    signed
-  BPF_CALL  0x80   function call
-  BPF_EXIT  0x90   function / program return  BPF_JMP only
-  BPF_JLT   0xa0   PC += off if dst < src     unsigned
-  BPF_JLE   0xb0   PC += off if dst <= src    unsigned
-  BPF_JSLT  0xc0   PC += off if dst < src     signed
-  BPF_JSLE  0xd0   PC += off if dst <= src    signed
-  ========  =====  =========================  ============
+The 'code' field encodes the operation as below:
+
+========  =====  =========================  ============
+code      value  description                notes
+========  =====  =========================  ============
+BPF_JA    0x00   PC += off                  BPF_JMP only
+BPF_JEQ   0x10   PC += off if dst == src
+BPF_JGT   0x20   PC += off if dst > src     unsigned
+BPF_JGE   0x30   PC += off if dst >= src    unsigned
+BPF_JSET  0x40   PC += off if dst & src
+BPF_JNE   0x50   PC += off if dst != src
+BPF_JSGT  0x60   PC += off if dst > src     signed
+BPF_JSGE  0x70   PC += off if dst >= src    signed
+BPF_CALL  0x80   function call
+BPF_EXIT  0x90   function / program return  BPF_JMP only
+BPF_JLT   0xa0   PC += off if dst < src     unsigned
+BPF_JLE   0xb0   PC += off if dst <= src    unsigned
+BPF_JSLT  0xc0   PC += off if dst < src     signed
+BPF_JSLE  0xd0   PC += off if dst <= src    signed
+========  =====  =========================  ============
 
 The eBPF program needs to store the return value into register R0 before doing a
 BPF_EXIT.
@@ -189,14 +194,26 @@ BPF_EXIT.
 Load and store instructions
 ===========================
 
-For load and store instructions (BPF_LD, BPF_LDX, BPF_ST and BPF_STX), the
+For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the
 8-bit 'opcode' field is divided as:
 
-  ============  ======  =================
-  3 bits (MSB)  2 bits  3 bits (LSB)
-  ============  ======  =================
-  mode          size    instruction class
-  ============  ======  =================
+============  ======  =================
+3 bits (MSB)  2 bits  3 bits (LSB)
+============  ======  =================
+mode          size    instruction class
+============  ======  =================
+
+The mode modifier is one of:
+
+  =============  =====  ====================================  =============
+  mode modifier  value  description                           reference
+  =============  =====  ====================================  =============
+  BPF_IMM        0x00   64-bit immediate instructions         `64-bit immediate instructions`_
+  BPF_ABS        0x20   legacy BPF packet access (absolute)   `Legacy BPF Packet access instructions`_
+  BPF_IND        0x40   legacy BPF packet access (indirect)   `Legacy BPF Packet access instructions`_
+  BPF_MEM        0x60   regular load and store operations     `Regular load and store operations`_
+  BPF_ATOMIC     0xc0   atomic operations                     `Atomic operations`_
+  =============  =====  ====================================  =============
 
 The size modifier is one of:
 
@@ -209,19 +226,6 @@ The size modifier is one of:
   BPF_DW         0x18   double word (8 bytes)
   =============  =====  =====================
 
-The mode modifier is one of:
-
-  =============  =====  ====================================
-  mode modifier  value  description
-  =============  =====  ====================================
-  BPF_IMM        0x00   64-bit immediate instructions
-  BPF_ABS        0x20   legacy BPF packet access (absolute)
-  BPF_IND        0x40   legacy BPF packet access (indirect)
-  BPF_MEM        0x60   regular load and store operations
-  BPF_ATOMIC     0xc0   atomic operations
-  =============  =====  ====================================
-
-
 Regular load and store operations
 ---------------------------------
 
@@ -252,42 +256,42 @@ by other eBPF programs or means outside of this specification.
 All atomic operations supported by eBPF are encoded as store operations
 that use the ``BPF_ATOMIC`` mode modifier as follows:
 
-  * ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
-  * ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
-  * 8-bit and 16-bit wide atomic operations are not supported.
+* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
+* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
+* 8-bit and 16-bit wide atomic operations are not supported.
 
-The imm field is used to encode the actual atomic operation.
+The 'imm' field is used to encode the actual atomic operation.
 Simple atomic operation use a subset of the values defined to encode
-arithmetic operations in the imm field to encode the atomic operation:
+arithmetic operations in the 'imm' field to encode the atomic operation:
 
-  ========  =====  ===========
-  imm       value  description
-  ========  =====  ===========
-  BPF_ADD   0x00   atomic add
-  BPF_OR    0x40   atomic or
-  BPF_AND   0x50   atomic and
-  BPF_XOR   0xa0   atomic xor
-  ========  =====  ===========
+========  =====  ===========
+imm       value  description
+========  =====  ===========
+BPF_ADD   0x00   atomic add
+BPF_OR    0x40   atomic or
+BPF_AND   0x50   atomic and
+BPF_XOR   0xa0   atomic xor
+========  =====  ===========
 
 
-``BPF_ATOMIC | BPF_W  | BPF_STX`` with imm = BPF_ADD means::
+``BPF_ATOMIC | BPF_W  | BPF_STX`` with 'imm' = BPF_ADD means::
 
   *(u32 *)(dst_reg + off16) += src_reg
 
-``BPF_ATOMIC | BPF_DW | BPF_STX`` with imm = BPF ADD means::
+``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF ADD means::
 
   *(u64 *)(dst_reg + off16) += src_reg
 
 In addition to the simple atomic operations, there also is a modifier and
 two complex atomic operations:
 
-  ===========  ================  ===========================
-  imm          value             description
-  ===========  ================  ===========================
-  BPF_FETCH    0x01              modifier: return old value
-  BPF_XCHG     0xe0 | BPF_FETCH  atomic exchange
-  BPF_CMPXCHG  0xf0 | BPF_FETCH  atomic compare and exchange
-  ===========  ================  ===========================
+===========  ================  ===========================
+imm          value             description
+===========  ================  ===========================
+BPF_FETCH    0x01              modifier: return old value
+BPF_XCHG     0xe0 | BPF_FETCH  atomic exchange
+BPF_CMPXCHG  0xf0 | BPF_FETCH  atomic compare and exchange
+===========  ================  ===========================
 
 The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
 always set for the complex atomic operations.  If the ``BPF_FETCH`` flag
@@ -306,7 +310,7 @@ and loaded back to ``R0``.
 64-bit immediate instructions
 -----------------------------
 
-Instructions with the ``BPF_IMM`` mode modifier use the wide instruction
+Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
 encoding for an extra imm64 value.
 
 There is currently only one such instruction.
-- 
cgit v1.2.3


From af81a9201e0112e9c4ca7258e32cd28baf19f31a Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Tue, 13 Sep 2022 19:18:25 +0800
Subject: drm/amdgpu: save rlcv/rlcp ucode version in amdgpu_gfx

cache rlcv/rlcvp ucode version info in amdgpu_gfx
structure

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Likun Gao <Likun.Gao@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h   | 4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 4 ++++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c    | 5 +++++
 3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 23a696d38390..de5b936b016d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -304,6 +304,10 @@ struct amdgpu_gfx {
 	uint32_t			rlc_srlg_feature_version;
 	uint32_t			rlc_srls_fw_version;
 	uint32_t			rlc_srls_feature_version;
+	uint32_t			rlcp_ucode_version;
+	uint32_t			rlcp_ucode_feature_version;
+	uint32_t			rlcv_ucode_version;
+	uint32_t			rlcv_ucode_feature_version;
 	uint32_t			mec_feature_version;
 	uint32_t			mec2_feature_version;
 	bool				mec_fw_write_wait;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
index 96b6cf4c4d54..59edf32f775e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
@@ -260,8 +260,12 @@ struct rlc_firmware_header_v2_2 {
 /* version_major=2, version_minor=3 */
 struct rlc_firmware_header_v2_3 {
     struct rlc_firmware_header_v2_2 v2_2;
+    uint32_t rlcp_ucode_version;
+    uint32_t rlcp_ucode_feature_version;
     uint32_t rlcp_ucode_size_bytes;
     uint32_t rlcp_ucode_offset_bytes;
+    uint32_t rlcv_ucode_version;
+    uint32_t rlcv_ucode_feature_version;
     uint32_t rlcv_ucode_size_bytes;
     uint32_t rlcv_ucode_offset_bytes;
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index fbb13c9147f8..9174a5220bc0 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -511,8 +511,13 @@ static void gfx_v11_0_init_rlcp_rlcv_microcode(struct amdgpu_device *adev)
 	const struct rlc_firmware_header_v2_3 *rlc_hdr;
 
 	rlc_hdr = (const struct rlc_firmware_header_v2_3 *)adev->gfx.rlc_fw->data;
+	adev->gfx.rlcp_ucode_version = le32_to_cpu(rlc_hdr->rlcp_ucode_version);
+	adev->gfx.rlcp_ucode_feature_version = le32_to_cpu(rlc_hdr->rlcp_ucode_feature_version);
 	adev->gfx.rlc.rlcp_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlcp_ucode_size_bytes);
 	adev->gfx.rlc.rlcp_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlcp_ucode_offset_bytes);
+
+	adev->gfx.rlcv_ucode_version = le32_to_cpu(rlc_hdr->rlcv_ucode_version);
+	adev->gfx.rlcv_ucode_feature_version = le32_to_cpu(rlc_hdr->rlcv_ucode_feature_version);
 	adev->gfx.rlc.rlcv_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlcv_ucode_size_bytes);
 	adev->gfx.rlc.rlcv_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlcv_ucode_offset_bytes);
 }
-- 
cgit v1.2.3


From 2f3f958602721c7915ddb1f2a0e8dc5031bc0151 Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Wed, 21 Sep 2022 21:48:27 +0800
Subject: drm/amdgpu: add helper to init rlc fw in header v2_0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To initialize rlc firmware in header v2_0

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 64 +++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 6373bfb47d55..672977ca0f75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -272,3 +272,67 @@ void amdgpu_gfx_rlc_fini(struct amdgpu_device *adev)
 			      &adev->gfx.rlc.cp_table_gpu_addr,
 			      (void **)&adev->gfx.rlc.cp_table_ptr);
 }
+
+static int amdgpu_gfx_rlc_init_microcode_v2_0(struct amdgpu_device *adev)
+{
+	const struct common_firmware_header *common_hdr;
+	const struct rlc_firmware_header_v2_0 *rlc_hdr;
+	struct amdgpu_firmware_info *info;
+	unsigned int *tmp;
+	unsigned int i;
+
+	rlc_hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
+
+	adev->gfx.rlc_fw_version = le32_to_cpu(rlc_hdr->header.ucode_version);
+	adev->gfx.rlc_feature_version = le32_to_cpu(rlc_hdr->ucode_feature_version);
+	adev->gfx.rlc.save_and_restore_offset =
+		le32_to_cpu(rlc_hdr->save_and_restore_offset);
+	adev->gfx.rlc.clear_state_descriptor_offset =
+		le32_to_cpu(rlc_hdr->clear_state_descriptor_offset);
+	adev->gfx.rlc.avail_scratch_ram_locations =
+		le32_to_cpu(rlc_hdr->avail_scratch_ram_locations);
+	adev->gfx.rlc.reg_restore_list_size =
+		le32_to_cpu(rlc_hdr->reg_restore_list_size);
+	adev->gfx.rlc.reg_list_format_start =
+		le32_to_cpu(rlc_hdr->reg_list_format_start);
+	adev->gfx.rlc.reg_list_format_separate_start =
+		le32_to_cpu(rlc_hdr->reg_list_format_separate_start);
+	adev->gfx.rlc.starting_offsets_start =
+		le32_to_cpu(rlc_hdr->starting_offsets_start);
+	adev->gfx.rlc.reg_list_format_size_bytes =
+		le32_to_cpu(rlc_hdr->reg_list_format_size_bytes);
+	adev->gfx.rlc.reg_list_size_bytes =
+		le32_to_cpu(rlc_hdr->reg_list_size_bytes);
+	adev->gfx.rlc.register_list_format =
+		kmalloc(adev->gfx.rlc.reg_list_format_size_bytes +
+			adev->gfx.rlc.reg_list_size_bytes, GFP_KERNEL);
+	if (!adev->gfx.rlc.register_list_format) {
+		dev_err(adev->dev, "failed to allocate memory for rlc register_list_format\n");
+		return -ENOMEM;
+	}
+
+	tmp = (unsigned int *)((uintptr_t)rlc_hdr +
+			le32_to_cpu(rlc_hdr->reg_list_format_array_offset_bytes));
+	for (i = 0 ; i < (rlc_hdr->reg_list_format_size_bytes >> 2); i++)
+		adev->gfx.rlc.register_list_format[i] = le32_to_cpu(tmp[i]);
+
+	adev->gfx.rlc.register_restore = adev->gfx.rlc.register_list_format + i;
+
+	tmp = (unsigned int *)((uintptr_t)rlc_hdr +
+			le32_to_cpu(rlc_hdr->reg_list_array_offset_bytes));
+	for (i = 0 ; i < (rlc_hdr->reg_list_size_bytes >> 2); i++)
+		adev->gfx.rlc.register_restore[i] = le32_to_cpu(tmp[i]);
+
+	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
+		info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_G];
+		info->ucode_id = AMDGPU_UCODE_ID_RLC_G;
+		info->fw = adev->gfx.rlc_fw;
+		if (info->fw) {
+			common_hdr = (const struct common_firmware_header *)info->fw->data;
+			adev->firmware.fw_size +=
+				ALIGN(le32_to_cpu(common_hdr->ucode_size_bytes), PAGE_SIZE);
+		}
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From 90df15124526176ff03c246362c196a047c8a39a Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Wed, 21 Sep 2022 21:50:33 +0800
Subject: drm/amdgpu: add helper to init rlc fw in header v2_1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To initialize rlc firmware in header v2_1

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 40 +++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 672977ca0f75..65bb9a6a6beb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -336,3 +336,43 @@ static int amdgpu_gfx_rlc_init_microcode_v2_0(struct amdgpu_device *adev)
 
 	return 0;
 }
+
+static void amdgpu_gfx_rlc_init_microcode_v2_1(struct amdgpu_device *adev)
+{
+	const struct rlc_firmware_header_v2_1 *rlc_hdr;
+	struct amdgpu_firmware_info *info;
+
+	rlc_hdr = (const struct rlc_firmware_header_v2_1 *)adev->gfx.rlc_fw->data;
+	adev->gfx.rlc_srlc_fw_version = le32_to_cpu(rlc_hdr->save_restore_list_cntl_ucode_ver);
+	adev->gfx.rlc_srlc_feature_version = le32_to_cpu(rlc_hdr->save_restore_list_cntl_feature_ver);
+	adev->gfx.rlc.save_restore_list_cntl_size_bytes = le32_to_cpu(rlc_hdr->save_restore_list_cntl_size_bytes);
+	adev->gfx.rlc.save_restore_list_cntl = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->save_restore_list_cntl_offset_bytes);
+	adev->gfx.rlc_srlg_fw_version = le32_to_cpu(rlc_hdr->save_restore_list_gpm_ucode_ver);
+	adev->gfx.rlc_srlg_feature_version = le32_to_cpu(rlc_hdr->save_restore_list_gpm_feature_ver);
+	adev->gfx.rlc.save_restore_list_gpm_size_bytes = le32_to_cpu(rlc_hdr->save_restore_list_gpm_size_bytes);
+	adev->gfx.rlc.save_restore_list_gpm = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->save_restore_list_gpm_offset_bytes);
+	adev->gfx.rlc_srls_fw_version = le32_to_cpu(rlc_hdr->save_restore_list_srm_ucode_ver);
+	adev->gfx.rlc_srls_feature_version = le32_to_cpu(rlc_hdr->save_restore_list_srm_feature_ver);
+	adev->gfx.rlc.save_restore_list_srm_size_bytes = le32_to_cpu(rlc_hdr->save_restore_list_srm_size_bytes);
+	adev->gfx.rlc.save_restore_list_srm = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->save_restore_list_srm_offset_bytes);
+	adev->gfx.rlc.reg_list_format_direct_reg_list_length =
+		le32_to_cpu(rlc_hdr->reg_list_format_direct_reg_list_length);
+
+	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
+		if (adev->gfx.rlc.save_restore_list_gpm_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM];
+			info->ucode_id = AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.save_restore_list_gpm_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.save_restore_list_srm_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM];
+			info->ucode_id = AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.save_restore_list_srm_size_bytes, PAGE_SIZE);
+		}
+	}
+}
-- 
cgit v1.2.3


From bcecb6524881c54ded92f14a6cf2406d5c439637 Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Wed, 21 Sep 2022 21:52:31 +0800
Subject: drm/amdgpu: add helper to init rlc fw in header v2_2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To initialize rlc firmware in header v2_2

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 65bb9a6a6beb..5088577d0403 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -376,3 +376,33 @@ static void amdgpu_gfx_rlc_init_microcode_v2_1(struct amdgpu_device *adev)
 		}
 	}
 }
+
+static void amdgpu_gfx_rlc_init_microcode_v2_2(struct amdgpu_device *adev)
+{
+	const struct rlc_firmware_header_v2_2 *rlc_hdr;
+	struct amdgpu_firmware_info *info;
+
+	rlc_hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data;
+	adev->gfx.rlc.rlc_iram_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlc_iram_ucode_size_bytes);
+	adev->gfx.rlc.rlc_iram_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlc_iram_ucode_offset_bytes);
+	adev->gfx.rlc.rlc_dram_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlc_dram_ucode_size_bytes);
+	adev->gfx.rlc.rlc_dram_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlc_dram_ucode_offset_bytes);
+
+	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
+		if (adev->gfx.rlc.rlc_iram_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_IRAM];
+			info->ucode_id = AMDGPU_UCODE_ID_RLC_IRAM;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.rlc_iram_ucode_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.rlc_dram_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_DRAM];
+			info->ucode_id = AMDGPU_UCODE_ID_RLC_DRAM;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.rlc_dram_ucode_size_bytes, PAGE_SIZE);
+		}
+	}
+}
-- 
cgit v1.2.3


From c1c3f41ffb1bd262e40d68938af68c6ab53a41f8 Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Wed, 21 Sep 2022 21:53:52 +0800
Subject: drm/amdgpu: add helper to init rlc fw in header v2_3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To initialize rlc firmware in header v2_3

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 35 +++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 5088577d0403..5168f6966951 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -406,3 +406,38 @@ static void amdgpu_gfx_rlc_init_microcode_v2_2(struct amdgpu_device *adev)
 		}
 	}
 }
+
+static void amdgpu_gfx_rlc_init_microcode_v2_3(struct amdgpu_device *adev)
+{
+	const struct rlc_firmware_header_v2_3 *rlc_hdr;
+	struct amdgpu_firmware_info *info;
+
+	rlc_hdr = (const struct rlc_firmware_header_v2_3 *)adev->gfx.rlc_fw->data;
+	adev->gfx.rlcp_ucode_version = le32_to_cpu(rlc_hdr->rlcp_ucode_version);
+	adev->gfx.rlcp_ucode_feature_version = le32_to_cpu(rlc_hdr->rlcp_ucode_feature_version);
+	adev->gfx.rlc.rlcp_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlcp_ucode_size_bytes);
+	adev->gfx.rlc.rlcp_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlcp_ucode_offset_bytes);
+
+	adev->gfx.rlcv_ucode_version = le32_to_cpu(rlc_hdr->rlcv_ucode_version);
+	adev->gfx.rlcv_ucode_feature_version = le32_to_cpu(rlc_hdr->rlcv_ucode_feature_version);
+	adev->gfx.rlc.rlcv_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlcv_ucode_size_bytes);
+	adev->gfx.rlc.rlcv_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlcv_ucode_offset_bytes);
+
+	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
+		if (adev->gfx.rlc.rlcp_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_P];
+			info->ucode_id = AMDGPU_UCODE_ID_RLC_P;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.rlcp_ucode_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.rlcv_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_V];
+			info->ucode_id = AMDGPU_UCODE_ID_RLC_V;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.rlcv_ucode_size_bytes, PAGE_SIZE);
+		}
+	}
+}
-- 
cgit v1.2.3


From b33139ee15352043123b81b32ca1a6a84340eb5d Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Wed, 21 Sep 2022 21:55:52 +0800
Subject: drm/amdgpu: add helper to init rlc fw in header v2_4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To initialize rlc firmware in header v2_4

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 60 +++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 5168f6966951..0c262afbc7ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -441,3 +441,63 @@ static void amdgpu_gfx_rlc_init_microcode_v2_3(struct amdgpu_device *adev)
 		}
 	}
 }
+
+static void amdgpu_gfx_rlc_init_microcode_v2_4(struct amdgpu_device *adev)
+{
+	const struct rlc_firmware_header_v2_4 *rlc_hdr;
+	struct amdgpu_firmware_info *info;
+
+	rlc_hdr = (const struct rlc_firmware_header_v2_4 *)adev->gfx.rlc_fw->data;
+	adev->gfx.rlc.global_tap_delays_ucode_size_bytes = le32_to_cpu(rlc_hdr->global_tap_delays_ucode_size_bytes);
+	adev->gfx.rlc.global_tap_delays_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->global_tap_delays_ucode_offset_bytes);
+	adev->gfx.rlc.se0_tap_delays_ucode_size_bytes = le32_to_cpu(rlc_hdr->se0_tap_delays_ucode_size_bytes);
+	adev->gfx.rlc.se0_tap_delays_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->se0_tap_delays_ucode_offset_bytes);
+	adev->gfx.rlc.se1_tap_delays_ucode_size_bytes = le32_to_cpu(rlc_hdr->se1_tap_delays_ucode_size_bytes);
+	adev->gfx.rlc.se1_tap_delays_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->se1_tap_delays_ucode_offset_bytes);
+	adev->gfx.rlc.se2_tap_delays_ucode_size_bytes = le32_to_cpu(rlc_hdr->se2_tap_delays_ucode_size_bytes);
+	adev->gfx.rlc.se2_tap_delays_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->se2_tap_delays_ucode_offset_bytes);
+	adev->gfx.rlc.se3_tap_delays_ucode_size_bytes = le32_to_cpu(rlc_hdr->se3_tap_delays_ucode_size_bytes);
+	adev->gfx.rlc.se3_tap_delays_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->se3_tap_delays_ucode_offset_bytes);
+
+	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) {
+		if (adev->gfx.rlc.global_tap_delays_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_GLOBAL_TAP_DELAYS];
+			info->ucode_id = AMDGPU_UCODE_ID_GLOBAL_TAP_DELAYS;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.global_tap_delays_ucode_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.se0_tap_delays_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_SE0_TAP_DELAYS];
+			info->ucode_id = AMDGPU_UCODE_ID_SE0_TAP_DELAYS;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.se0_tap_delays_ucode_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.se1_tap_delays_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_SE1_TAP_DELAYS];
+			info->ucode_id = AMDGPU_UCODE_ID_SE1_TAP_DELAYS;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.se1_tap_delays_ucode_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.se2_tap_delays_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_SE2_TAP_DELAYS];
+			info->ucode_id = AMDGPU_UCODE_ID_SE2_TAP_DELAYS;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.se2_tap_delays_ucode_size_bytes, PAGE_SIZE);
+		}
+
+		if (adev->gfx.rlc.se3_tap_delays_ucode_size_bytes) {
+			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_SE3_TAP_DELAYS];
+			info->ucode_id = AMDGPU_UCODE_ID_SE3_TAP_DELAYS;
+			info->fw = adev->gfx.rlc_fw;
+			adev->firmware.fw_size +=
+				ALIGN(adev->gfx.rlc.se3_tap_delays_ucode_size_bytes, PAGE_SIZE);
+		}
+	}
+}
-- 
cgit v1.2.3


From 04fa38cce6e02be4362cd889780d58ec275c4d26 Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Fri, 16 Sep 2022 00:21:50 +0800
Subject: drm/amdgpu: add helper to init rlc firmware
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To initialzie rlc firmware according to rlc
firmware header version

v2: squash in backwards compat fix

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c | 35 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h |  4 +++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
index 0c262afbc7ca..e23f6192c50e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c
@@ -501,3 +501,38 @@ static void amdgpu_gfx_rlc_init_microcode_v2_4(struct amdgpu_device *adev)
 		}
 	}
 }
+
+int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev,
+				  uint16_t version_major,
+				  uint16_t version_minor)
+{
+	int err;
+
+	if (version_major < 2) {
+		/* only support rlc_hdr v2.x and onwards */
+		dev_err(adev->dev, "unsupported rlc fw hdr\n");
+		return -EINVAL;
+	}
+
+	/* is_rlc_v2_1 is still used in APU code path */
+	if (version_major == 2 && version_minor == 1)
+		adev->gfx.rlc.is_rlc_v2_1 = true;
+
+	if (version_minor >= 0) {
+		err = amdgpu_gfx_rlc_init_microcode_v2_0(adev);
+		if (err) {
+			dev_err(adev->dev, "fail to init rlc v2_0 microcode\n");
+			return err;
+		}
+	}
+	if (version_minor >= 1)
+		amdgpu_gfx_rlc_init_microcode_v2_1(adev);
+	if (version_minor >= 2)
+		amdgpu_gfx_rlc_init_microcode_v2_2(adev);
+	if (version_minor == 3)
+		amdgpu_gfx_rlc_init_microcode_v2_3(adev);
+	if (version_minor == 4)
+		amdgpu_gfx_rlc_init_microcode_v2_4(adev);
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
index 03ac36b2c2cf..23f060db9255 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.h
@@ -267,5 +267,7 @@ int amdgpu_gfx_rlc_init_csb(struct amdgpu_device *adev);
 int amdgpu_gfx_rlc_init_cpt(struct amdgpu_device *adev);
 void amdgpu_gfx_rlc_setup_cp_table(struct amdgpu_device *adev);
 void amdgpu_gfx_rlc_fini(struct amdgpu_device *adev);
-
+int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev,
+				  uint16_t version_major,
+				  uint16_t version_minor);
 #endif
-- 
cgit v1.2.3


From 0fd85e89b5bf18447e56099a010ee5be5dc9f2b0 Mon Sep 17 00:00:00 2001
From: Hawking Zhang <Hawking.Zhang@amd.com>
Date: Fri, 16 Sep 2022 00:48:27 +0800
Subject: drm/amdgpu/gfx11: switch to amdgpu_gfx_rlc_init_microcode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

switch to common helper to initialize rlc firmware
for gfx11

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 156 +--------------------------------
 1 file changed, 4 insertions(+), 152 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 9174a5220bc0..daf8ba8235cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -474,54 +474,6 @@ static void gfx_v11_0_free_microcode(struct amdgpu_device *adev)
 	kfree(adev->gfx.rlc.register_list_format);
 }
 
-static void gfx_v11_0_init_rlc_ext_microcode(struct amdgpu_device *adev)
-{
-	const struct rlc_firmware_header_v2_1 *rlc_hdr;
-
-	rlc_hdr = (const struct rlc_firmware_header_v2_1 *)adev->gfx.rlc_fw->data;
-	adev->gfx.rlc_srlc_fw_version = le32_to_cpu(rlc_hdr->save_restore_list_cntl_ucode_ver);
-	adev->gfx.rlc_srlc_feature_version = le32_to_cpu(rlc_hdr->save_restore_list_cntl_feature_ver);
-	adev->gfx.rlc.save_restore_list_cntl_size_bytes = le32_to_cpu(rlc_hdr->save_restore_list_cntl_size_bytes);
-	adev->gfx.rlc.save_restore_list_cntl = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->save_restore_list_cntl_offset_bytes);
-	adev->gfx.rlc_srlg_fw_version = le32_to_cpu(rlc_hdr->save_restore_list_gpm_ucode_ver);
-	adev->gfx.rlc_srlg_feature_version = le32_to_cpu(rlc_hdr->save_restore_list_gpm_feature_ver);
-	adev->gfx.rlc.save_restore_list_gpm_size_bytes = le32_to_cpu(rlc_hdr->save_restore_list_gpm_size_bytes);
-	adev->gfx.rlc.save_restore_list_gpm = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->save_restore_list_gpm_offset_bytes);
-	adev->gfx.rlc_srls_fw_version = le32_to_cpu(rlc_hdr->save_restore_list_srm_ucode_ver);
-	adev->gfx.rlc_srls_feature_version = le32_to_cpu(rlc_hdr->save_restore_list_srm_feature_ver);
-	adev->gfx.rlc.save_restore_list_srm_size_bytes = le32_to_cpu(rlc_hdr->save_restore_list_srm_size_bytes);
-	adev->gfx.rlc.save_restore_list_srm = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->save_restore_list_srm_offset_bytes);
-	adev->gfx.rlc.reg_list_format_direct_reg_list_length =
-			le32_to_cpu(rlc_hdr->reg_list_format_direct_reg_list_length);
-}
-
-static void gfx_v11_0_init_rlc_iram_dram_microcode(struct amdgpu_device *adev)
-{
-	const struct rlc_firmware_header_v2_2 *rlc_hdr;
-
-	rlc_hdr = (const struct rlc_firmware_header_v2_2 *)adev->gfx.rlc_fw->data;
-	adev->gfx.rlc.rlc_iram_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlc_iram_ucode_size_bytes);
-	adev->gfx.rlc.rlc_iram_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlc_iram_ucode_offset_bytes);
-	adev->gfx.rlc.rlc_dram_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlc_dram_ucode_size_bytes);
-	adev->gfx.rlc.rlc_dram_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlc_dram_ucode_offset_bytes);
-}
-
-static void gfx_v11_0_init_rlcp_rlcv_microcode(struct amdgpu_device *adev)
-{
-	const struct rlc_firmware_header_v2_3 *rlc_hdr;
-
-	rlc_hdr = (const struct rlc_firmware_header_v2_3 *)adev->gfx.rlc_fw->data;
-	adev->gfx.rlcp_ucode_version = le32_to_cpu(rlc_hdr->rlcp_ucode_version);
-	adev->gfx.rlcp_ucode_feature_version = le32_to_cpu(rlc_hdr->rlcp_ucode_feature_version);
-	adev->gfx.rlc.rlcp_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlcp_ucode_size_bytes);
-	adev->gfx.rlc.rlcp_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlcp_ucode_offset_bytes);
-
-	adev->gfx.rlcv_ucode_version = le32_to_cpu(rlc_hdr->rlcv_ucode_version);
-	adev->gfx.rlcv_ucode_feature_version = le32_to_cpu(rlc_hdr->rlcv_ucode_feature_version);
-	adev->gfx.rlc.rlcv_ucode_size_bytes = le32_to_cpu(rlc_hdr->rlcv_ucode_size_bytes);
-	adev->gfx.rlc.rlcv_ucode = (u8 *)rlc_hdr + le32_to_cpu(rlc_hdr->rlcv_ucode_offset_bytes);
-}
-
 static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
 {
 	char fw_name[40];
@@ -532,8 +484,6 @@ static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
 	const struct gfx_firmware_header_v1_0 *cp_hdr;
 	const struct gfx_firmware_header_v2_0 *cp_hdr_v2_0;
 	const struct rlc_firmware_header_v2_0 *rlc_hdr;
-	unsigned int *tmp = NULL;
-	unsigned int i = 0;
 	uint16_t version_major;
 	uint16_t version_minor;
 
@@ -588,58 +538,14 @@ static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
 		if (err)
 			goto out;
 		err = amdgpu_ucode_validate(adev->gfx.rlc_fw);
+		if (err)
+			goto out;
 		rlc_hdr = (const struct rlc_firmware_header_v2_0 *)adev->gfx.rlc_fw->data;
 		version_major = le16_to_cpu(rlc_hdr->header.header_version_major);
 		version_minor = le16_to_cpu(rlc_hdr->header.header_version_minor);
-
-		adev->gfx.rlc_fw_version = le32_to_cpu(rlc_hdr->header.ucode_version);
-		adev->gfx.rlc_feature_version = le32_to_cpu(rlc_hdr->ucode_feature_version);
-		adev->gfx.rlc.save_and_restore_offset =
-			le32_to_cpu(rlc_hdr->save_and_restore_offset);
-		adev->gfx.rlc.clear_state_descriptor_offset =
-			le32_to_cpu(rlc_hdr->clear_state_descriptor_offset);
-		adev->gfx.rlc.avail_scratch_ram_locations =
-			le32_to_cpu(rlc_hdr->avail_scratch_ram_locations);
-		adev->gfx.rlc.reg_restore_list_size =
-			le32_to_cpu(rlc_hdr->reg_restore_list_size);
-		adev->gfx.rlc.reg_list_format_start =
-			le32_to_cpu(rlc_hdr->reg_list_format_start);
-		adev->gfx.rlc.reg_list_format_separate_start =
-			le32_to_cpu(rlc_hdr->reg_list_format_separate_start);
-		adev->gfx.rlc.starting_offsets_start =
-			le32_to_cpu(rlc_hdr->starting_offsets_start);
-		adev->gfx.rlc.reg_list_format_size_bytes =
-			le32_to_cpu(rlc_hdr->reg_list_format_size_bytes);
-		adev->gfx.rlc.reg_list_size_bytes =
-			le32_to_cpu(rlc_hdr->reg_list_size_bytes);
-		adev->gfx.rlc.register_list_format =
-			kmalloc(adev->gfx.rlc.reg_list_format_size_bytes +
-					adev->gfx.rlc.reg_list_size_bytes, GFP_KERNEL);
-		if (!adev->gfx.rlc.register_list_format) {
-			err = -ENOMEM;
+		err = amdgpu_gfx_rlc_init_microcode(adev, version_major, version_minor);
+		if (err)
 			goto out;
-		}
-
-		tmp = (unsigned int *)((uintptr_t)rlc_hdr +
-							   le32_to_cpu(rlc_hdr->reg_list_format_array_offset_bytes));
-		for (i = 0 ; i < (rlc_hdr->reg_list_format_size_bytes >> 2); i++)
-			adev->gfx.rlc.register_list_format[i] =	le32_to_cpu(tmp[i]);
-
-		adev->gfx.rlc.register_restore = adev->gfx.rlc.register_list_format + i;
-
-		tmp = (unsigned int *)((uintptr_t)rlc_hdr +
-							   le32_to_cpu(rlc_hdr->reg_list_array_offset_bytes));
-		for (i = 0 ; i < (rlc_hdr->reg_list_size_bytes >> 2); i++)
-			adev->gfx.rlc.register_restore[i] = le32_to_cpu(tmp[i]);
-
-		if (version_major == 2) {
-			if (version_minor >= 1)
-				gfx_v11_0_init_rlc_ext_microcode(adev);
-			if (version_minor >= 2)
-				gfx_v11_0_init_rlc_iram_dram_microcode(adev);
-			if (version_minor == 3)
-				gfx_v11_0_init_rlcp_rlcv_microcode(adev);
-		}
 	}
 
 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec.bin", ucode_prefix);
@@ -774,60 +680,6 @@ static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)
 			adev->firmware.fw_size +=
 				ALIGN(le32_to_cpu(cp_hdr->jt_size) * 4, PAGE_SIZE);
 		}
-
-		info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_G];
-		info->ucode_id = AMDGPU_UCODE_ID_RLC_G;
-		info->fw = adev->gfx.rlc_fw;
-		if (info->fw) {
-			header = (const struct common_firmware_header *)info->fw->data;
-			adev->firmware.fw_size +=
-				ALIGN(le32_to_cpu(header->ucode_size_bytes), PAGE_SIZE);
-		}
-		if (adev->gfx.rlc.save_restore_list_gpm_size_bytes &&
-		    adev->gfx.rlc.save_restore_list_srm_size_bytes) {
-			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM];
-			info->ucode_id = AMDGPU_UCODE_ID_RLC_RESTORE_LIST_GPM_MEM;
-			info->fw = adev->gfx.rlc_fw;
-			adev->firmware.fw_size +=
-				ALIGN(adev->gfx.rlc.save_restore_list_gpm_size_bytes, PAGE_SIZE);
-
-			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM];
-			info->ucode_id = AMDGPU_UCODE_ID_RLC_RESTORE_LIST_SRM_MEM;
-			info->fw = adev->gfx.rlc_fw;
-			adev->firmware.fw_size +=
-				ALIGN(adev->gfx.rlc.save_restore_list_srm_size_bytes, PAGE_SIZE);
-		}
-
-		if (adev->gfx.rlc.rlc_iram_ucode_size_bytes &&
-		    adev->gfx.rlc.rlc_dram_ucode_size_bytes) {
-			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_IRAM];
-			info->ucode_id = AMDGPU_UCODE_ID_RLC_IRAM;
-			info->fw = adev->gfx.rlc_fw;
-			adev->firmware.fw_size +=
-				ALIGN(adev->gfx.rlc.rlc_iram_ucode_size_bytes, PAGE_SIZE);
-
-			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_DRAM];
-			info->ucode_id = AMDGPU_UCODE_ID_RLC_DRAM;
-			info->fw = adev->gfx.rlc_fw;
-			adev->firmware.fw_size +=
-				ALIGN(adev->gfx.rlc.rlc_dram_ucode_size_bytes, PAGE_SIZE);
-		}
-
-		if (adev->gfx.rlc.rlcp_ucode_size_bytes) {
-			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_P];
-			info->ucode_id = AMDGPU_UCODE_ID_RLC_P;
-			info->fw = adev->gfx.rlc_fw;
-			adev->firmware.fw_size +=
-				ALIGN(adev->gfx.rlc.rlcp_ucode_size_bytes, PAGE_SIZE);
-		}
-
-		if (adev->gfx.rlc.rlcv_ucode_size_bytes) {
-			info = &adev->firmware.ucode[AMDGPU_UCODE_ID_RLC_V];
-			info->ucode_id = AMDGPU_UCODE_ID_RLC_V;
-			info->fw = adev->gfx.rlc_fw;
-			adev->firmware.fw_size +=
-				ALIGN(adev->gfx.rlc.rlcv_ucode_size_bytes, PAGE_SIZE);
-		}
 	}
 
 out:
-- 
cgit v1.2.3


From d863f42930db35e82f47e4b4c78531a2b8d396ae Mon Sep 17 00:00:00 2001
From: Yuan Can <yuancan@huawei.com>
Date: Wed, 28 Sep 2022 09:04:39 +0000
Subject: bpftool: Remove unused struct btf_attach_point

After commit 2828d0d75b73 ("bpftool: Switch to libbpf's hashmap for
programs/maps in BTF listing"), struct btf_attach_point is not used
anymore and can be removed as well.

Signed-off-by: Yuan Can <yuancan@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220928090440.79637-2-yuancan@huawei.com
---
 tools/bpf/bpftool/btf.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 0744bd1150be..64411fe49a66 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -43,11 +43,6 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
 	[BTF_KIND_ENUM64]	= "ENUM64",
 };
 
-struct btf_attach_point {
-	__u32 obj_id;
-	__u32 btf_id;
-};
-
 static const char *btf_int_enc_str(__u8 encoding)
 {
 	switch (encoding) {
-- 
cgit v1.2.3


From f95a479797dc2c65fdf2809a7c388e7a9e2bc853 Mon Sep 17 00:00:00 2001
From: Yuan Can <yuancan@huawei.com>
Date: Wed, 28 Sep 2022 09:04:40 +0000
Subject: bpftool: Remove unused struct event_ring_info

After commit 9b190f185d2f ("tools/bpftool: switch map event_pipe to
libbpf's perf_buffer"), struct event_ring_info is not used any more and
can be removed as well.

Signed-off-by: Yuan Can <yuancan@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220928090440.79637-3-yuancan@huawei.com
---
 tools/bpf/bpftool/map_perf_ring.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index 6b0c410152de..1583281d1327 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -29,13 +29,6 @@
 
 static volatile bool stop;
 
-struct event_ring_info {
-	int fd;
-	int key;
-	unsigned int cpu;
-	void *mem;
-};
-
 struct perf_event_sample {
 	struct perf_event_header header;
 	__u64 time;
-- 
cgit v1.2.3


From b59cc7fcbaebde52ed97f63c6c50e49b8dd5be37 Mon Sep 17 00:00:00 2001
From: Deming Wang <wangdeming@inspur.com>
Date: Tue, 27 Sep 2022 15:25:27 -0400
Subject: samples/bpf: Fix typo in xdp_router_ipv4 sample

Fix typo in xdp_router_ipv4 sample.

Signed-off-by: Deming Wang <wangdeming@inspur.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220927192527.8722-1-wangdeming@inspur.com
---
 samples/bpf/xdp_router_ipv4_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
index 294fc15ad1cb..683913bbf279 100644
--- a/samples/bpf/xdp_router_ipv4_user.c
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -209,7 +209,7 @@ static void read_route(struct nlmsghdr *nh, int nll)
 					/* Rereading the route table to check if
 					 * there is an entry with the same
 					 * prefix but a different metric as the
-					 * deleted enty.
+					 * deleted entry.
 					 */
 					get_route_table(AF_INET);
 				} else if (prefix_key->data[0] ==
-- 
cgit v1.2.3


From 2efcf695bfc0f078dd7d5d23d96a97db34c930d5 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.i.king@gmail.com>
Date: Wed, 28 Sep 2022 23:15:55 +0100
Subject: selftests/bpf: Fix spelling mistake "unpriviledged" -> "unprivileged"

There are a couple of spelling mistakes, one in a literal string and one
in a comment. Fix them.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220928221555.67873-1-colin.i.king@gmail.com
---
 tools/testing/selftests/bpf/verifier/calls.c   | 2 +-
 tools/testing/selftests/bpf/verifier/var_off.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
index 3fb4f69b1962..e1a937277b54 100644
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -284,7 +284,7 @@
 	.result = ACCEPT,
 },
 {
-	"calls: not on unpriviledged",
+	"calls: not on unprivileged",
 	.insns = {
 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
 	BPF_MOV64_IMM(BPF_REG_0, 1),
diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
index 187c6f6e32bc..d37f512fad16 100644
--- a/tools/testing/selftests/bpf/verifier/var_off.c
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -121,7 +121,7 @@
 	BPF_EXIT_INSN(),
 	},
 	.fixup_map_hash_8b = { 1 },
-	/* The unpriviledged case is not too interesting; variable
+	/* The unprivileged case is not too interesting; variable
 	 * stack access is rejected.
 	 */
 	.errstr_unpriv = "R2 variable stack access prohibited for !root",
-- 
cgit v1.2.3


From 51e05a8cf8eb34da7473823b7f236a77adfef0b4 Mon Sep 17 00:00:00 2001
From: Xin Liu <liuxin350@huawei.com>
Date: Fri, 30 Sep 2022 17:07:08 +0800
Subject: libbpf: Fix overrun in netlink attribute iteration

I accidentally found that a change in commit 1045b03e07d8 ("netlink: fix
overrun in attribute iteration") was not synchronized to the function
`nla_ok` in tools/lib/bpf/nlattr.c, I think it is necessary to modify,
this patch will do it.

Signed-off-by: Xin Liu <liuxin350@huawei.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20220930090708.62394-1-liuxin350@huawei.com
---
 tools/lib/bpf/nlattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/lib/bpf/nlattr.c b/tools/lib/bpf/nlattr.c
index f57e77a6e40f..3900d052ed19 100644
--- a/tools/lib/bpf/nlattr.c
+++ b/tools/lib/bpf/nlattr.c
@@ -32,7 +32,7 @@ static struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
 
 static int nla_ok(const struct nlattr *nla, int remaining)
 {
-	return remaining >= sizeof(*nla) &&
+	return remaining >= (int)sizeof(*nla) &&
 	       nla->nla_len >= sizeof(*nla) &&
 	       nla->nla_len <= remaining;
 }
-- 
cgit v1.2.3


From 3ca2fb497440a3c8294f9df0ce7b2c3c9a1c5875 Mon Sep 17 00:00:00 2001
From: Tianyi Liu <i.pear@outlook.com>
Date: Wed, 28 Sep 2022 16:09:32 +0800
Subject: bpftool: Fix error message of strerror

strerror() expects a positive errno, however variable err will never be
positive when an error occurs. This causes bpftool to output too many
"unknown error", even a simple "file not exist" error can not get an
accurate message.

This patch fixed all "strerror(err)" patterns in bpftool.
Specially in btf.c#L823, hashmap__append() is an internal function of
libbpf and will not change errno, so there's a little difference.
Some libbpf_get_error() calls are kept for return values.

Changes since v1: https://lore.kernel.org/bpf/SY4P282MB1084B61CD8671DFA395AA8579D539@SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM/
Check directly for NULL values instead of calling libbpf_get_error().

Signed-off-by: Tianyi Liu <i.pear@outlook.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Quentin Monnet <quentin@isovalent.com>
Link: https://lore.kernel.org/bpf/SY4P282MB1084AD9CD84A920F08DF83E29D549@SY4P282MB1084.AUSP282.PROD.OUTLOOK.COM
---
 tools/bpf/bpftool/btf.c           | 11 +++++------
 tools/bpf/bpftool/gen.c           |  4 ++--
 tools/bpf/bpftool/map_perf_ring.c |  7 +++----
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 64411fe49a66..68a70ac03c80 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -635,10 +635,9 @@ static int do_dump(int argc, char **argv)
 
 		btf = btf__parse_split(*argv, base ?: base_btf);
 		err = libbpf_get_error(btf);
-		if (err) {
-			btf = NULL;
+		if (!btf) {
 			p_err("failed to load BTF from %s: %s",
-			      *argv, strerror(err));
+			      *argv, strerror(errno));
 			goto done;
 		}
 		NEXT_ARG();
@@ -683,8 +682,8 @@ static int do_dump(int argc, char **argv)
 
 		btf = btf__load_from_kernel_by_id_split(btf_id, base_btf);
 		err = libbpf_get_error(btf);
-		if (err) {
-			p_err("get btf by id (%u): %s", btf_id, strerror(err));
+		if (!btf) {
+			p_err("get btf by id (%u): %s", btf_id, strerror(errno));
 			goto done;
 		}
 	}
@@ -820,7 +819,7 @@ build_btf_type_table(struct hashmap *tab, enum bpf_obj_type type,
 				      u32_as_hash_field(id));
 		if (err) {
 			p_err("failed to append entry to hashmap for BTF ID %u, object ID %u: %s",
-			      btf_id, id, strerror(errno));
+			      btf_id, id, strerror(-err));
 			goto err_free;
 		}
 	}
diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c
index 7070dcffa822..cf8b4e525c88 100644
--- a/tools/bpf/bpftool/gen.c
+++ b/tools/bpf/bpftool/gen.c
@@ -1594,14 +1594,14 @@ static int do_object(int argc, char **argv)
 
 		err = bpf_linker__add_file(linker, file, NULL);
 		if (err) {
-			p_err("failed to link '%s': %s (%d)", file, strerror(err), err);
+			p_err("failed to link '%s': %s (%d)", file, strerror(errno), errno);
 			goto out;
 		}
 	}
 
 	err = bpf_linker__finalize(linker);
 	if (err) {
-		p_err("failed to finalize ELF file: %s (%d)", strerror(err), err);
+		p_err("failed to finalize ELF file: %s (%d)", strerror(errno), errno);
 		goto out;
 	}
 
diff --git a/tools/bpf/bpftool/map_perf_ring.c b/tools/bpf/bpftool/map_perf_ring.c
index 1583281d1327..21d7d447e1f3 100644
--- a/tools/bpf/bpftool/map_perf_ring.c
+++ b/tools/bpf/bpftool/map_perf_ring.c
@@ -188,10 +188,9 @@ int do_event_pipe(int argc, char **argv)
 	opts.map_keys = &ctx.idx;
 	pb = perf_buffer__new_raw(map_fd, MMAP_PAGE_CNT, &perf_attr,
 				  print_bpf_output, &ctx, &opts);
-	err = libbpf_get_error(pb);
-	if (err) {
+	if (!pb) {
 		p_err("failed to create perf buffer: %s (%d)",
-		      strerror(err), err);
+		      strerror(errno), errno);
 		goto err_close_map;
 	}
 
@@ -206,7 +205,7 @@ int do_event_pipe(int argc, char **argv)
 		err = perf_buffer__poll(pb, 200);
 		if (err < 0 && err != -EINTR) {
 			p_err("perf buffer polling failed: %s (%d)",
-			      strerror(err), err);
+			      strerror(errno), errno);
 			goto err_close_pb;
 		}
 	}
-- 
cgit v1.2.3


From 5f388bba7acbdb097a9e7ed932a39b40f7eb2acf Mon Sep 17 00:00:00 2001
From: Magnus Karlsson <magnus.karlsson@intel.com>
Date: Thu, 29 Sep 2022 11:01:33 +0200
Subject: selftests/xsk: Fix double free

Fix a double free at exit of the test suite.

Fixes: a693ff3ed561 ("selftests/xsk: Add support for executing tests on physical device")
Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://lore.kernel.org/bpf/20220929090133.7869-1-magnus.karlsson@gmail.com
---
 tools/testing/selftests/bpf/xskxceiver.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tools/testing/selftests/bpf/xskxceiver.c b/tools/testing/selftests/bpf/xskxceiver.c
index ef33309bbe49..d1a5f3218c34 100644
--- a/tools/testing/selftests/bpf/xskxceiver.c
+++ b/tools/testing/selftests/bpf/xskxceiver.c
@@ -1953,9 +1953,6 @@ int main(int argc, char **argv)
 
 	pkt_stream_delete(tx_pkt_stream_default);
 	pkt_stream_delete(rx_pkt_stream_default);
-	free(ifobj_rx->umem);
-	if (!ifobj_tx->shared_umem)
-		free(ifobj_tx->umem);
 	ifobject_delete(ifobj_tx);
 	ifobject_delete(ifobj_rx);
 
-- 
cgit v1.2.3


From 6abf0dae8c3c927f54e62c46faf8aba580ba0d04 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Fri, 30 Sep 2022 13:12:30 -0700
Subject: Bluetooth: hci_sync: Fix not indicating power state

When setting power state using legacy/non-mgmt API
(e.g hcitool hci0 up) the likes of mgmt_set_powered_complete won't be
called causing clients of the MGMT API to not be notified of the change
of the state.

Fixes: cf75ad8b41d2 ("Bluetooth: hci_sync: Convert MGMT_SET_POWERED")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Tested-by: Tedd Ho-Jeong An <tedd.an@intel.com>
---
 net/bluetooth/hci_sync.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 15c75ef4c271..76c3107c9f91 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4676,6 +4676,7 @@ int hci_dev_open_sync(struct hci_dev *hdev)
 		    hci_dev_test_flag(hdev, HCI_MGMT) &&
 		    hdev->dev_type == HCI_PRIMARY) {
 			ret = hci_powered_update_sync(hdev);
+			mgmt_power_on(hdev, ret);
 		}
 	} else {
 		/* Init failed, cleanup */
-- 
cgit v1.2.3


From cff2d762cde669023f345157f875b7ea6658992a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 29 Sep 2022 07:28:09 -0700
Subject: genetlink: reject use of nlmsg_flags for new commands

Commit 9c5d03d36251 ("genetlink: start to validate reserved header bytes")
introduced extra validation for genetlink headers. We had to gate it
to only apply to new commands, to maintain bug-wards compatibility.
Use this opportunity (before the new checks make it to Linus's tree)
to add more conditions.

Validate that Generic Netlink families do not use nlmsg_flags outside
of the well-understood set.

Link: https://lore.kernel.org/all/20220928073709.1b93b74a@kernel.org/
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://lore.kernel.org/r/20220929142809.1167546-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/netlink/genetlink.c | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 7c136de117eb..39b7c00e4cef 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -739,6 +739,36 @@ out:
 	return err;
 }
 
+static int genl_header_check(const struct genl_family *family,
+			     struct nlmsghdr *nlh, struct genlmsghdr *hdr,
+			     struct netlink_ext_ack *extack)
+{
+	u16 flags;
+
+	/* Only for commands added after we started validating */
+	if (hdr->cmd < family->resv_start_op)
+		return 0;
+
+	if (hdr->reserved) {
+		NL_SET_ERR_MSG(extack, "genlmsghdr.reserved field is not 0");
+		return -EINVAL;
+	}
+
+	/* Old netlink flags have pretty loose semantics, allow only the flags
+	 * consumed by the core where we can enforce the meaning.
+	 */
+	flags = nlh->nlmsg_flags;
+	if ((flags & NLM_F_DUMP) == NLM_F_DUMP) /* DUMP is 2 bits */
+		flags &= ~NLM_F_DUMP;
+	if (flags & ~(NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO)) {
+		NL_SET_ERR_MSG(extack,
+			       "ambiguous or reserved bits set in nlmsg_flags");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int genl_family_rcv_msg(const struct genl_family *family,
 			       struct sk_buff *skb,
 			       struct nlmsghdr *nlh,
@@ -757,7 +787,7 @@ static int genl_family_rcv_msg(const struct genl_family *family,
 	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
 		return -EINVAL;
 
-	if (hdr->cmd >= family->resv_start_op && hdr->reserved)
+	if (genl_header_check(family, nlh, hdr, extack))
 		return -EINVAL;
 
 	if (genl_get_cmd(hdr->cmd, family, &op))
-- 
cgit v1.2.3


From 3406079bbb279e33ab4b8d9a30e4dd68ce7bcafe Mon Sep 17 00:00:00 2001
From: Chunhao Lin <hau@realtek.com>
Date: Thu, 29 Sep 2022 01:13:56 +0800
Subject: r8169: add rtl_disable_rxdvgate()

rtl_disable_rxdvgate() is used for disable RXDV_GATE. It is opposite function
of rtl_enable_rxdvgate().

Disable RXDV_GATE does not have to delay. So in this patch, also remove the
delay after disale RXDV_GATE.

Signed-off-by: Chunhao Lin <hau@realtek.com>
Link: https://lore.kernel.org/r/20220928171356.3951-1-hau@realtek.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/realtek/r8169_main.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 3ec6d1319a8a..a73d061d9fcb 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -2443,6 +2443,11 @@ static void rtl_wait_txrx_fifo_empty(struct rtl8169_private *tp)
 	}
 }
 
+static void rtl_disable_rxdvgate(struct rtl8169_private *tp)
+{
+	RTL_W32(tp, MISC, RTL_R32(tp, MISC) & ~RXDV_GATED_EN);
+}
+
 static void rtl_enable_rxdvgate(struct rtl8169_private *tp)
 {
 	RTL_W32(tp, MISC, RTL_R32(tp, MISC) | RXDV_GATED_EN);
@@ -2960,7 +2965,7 @@ static void rtl_hw_start_8168g(struct rtl8169_private *tp)
 	rtl_reset_packet_filter(tp);
 	rtl_eri_write(tp, 0x2f8, ERIAR_MASK_0011, 0x1d8f);
 
-	RTL_W32(tp, MISC, RTL_R32(tp, MISC) & ~RXDV_GATED_EN);
+	rtl_disable_rxdvgate(tp);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000);
@@ -3198,7 +3203,7 @@ static void rtl_hw_start_8168h_1(struct rtl8169_private *tp)
 
 	rtl_eri_write(tp, 0x5f0, ERIAR_MASK_0011, 0x4f87);
 
-	RTL_W32(tp, MISC, RTL_R32(tp, MISC) & ~RXDV_GATED_EN);
+	rtl_disable_rxdvgate(tp);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000);
@@ -3249,7 +3254,7 @@ static void rtl_hw_start_8168ep(struct rtl8169_private *tp)
 
 	rtl_eri_write(tp, 0x5f0, ERIAR_MASK_0011, 0x4f87);
 
-	RTL_W32(tp, MISC, RTL_R32(tp, MISC) & ~RXDV_GATED_EN);
+	rtl_disable_rxdvgate(tp);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000);
@@ -3313,7 +3318,7 @@ static void rtl_hw_start_8117(struct rtl8169_private *tp)
 
 	rtl_eri_write(tp, 0x5f0, ERIAR_MASK_0011, 0x4f87);
 
-	RTL_W32(tp, MISC, RTL_R32(tp, MISC) & ~RXDV_GATED_EN);
+	rtl_disable_rxdvgate(tp);
 
 	rtl_eri_write(tp, 0xc0, ERIAR_MASK_0011, 0x0000);
 	rtl_eri_write(tp, 0xb8, ERIAR_MASK_0011, 0x0000);
@@ -3557,8 +3562,7 @@ static void rtl_hw_start_8125_common(struct rtl8169_private *tp)
 	else
 		rtl8125a_config_eee_mac(tp);
 
-	RTL_W32(tp, MISC, RTL_R32(tp, MISC) & ~RXDV_GATED_EN);
-	udelay(10);
+	rtl_disable_rxdvgate(tp);
 }
 
 static void rtl_hw_start_8125a_2(struct rtl8169_private *tp)
-- 
cgit v1.2.3


From 3fcb698d9c773d1e0ce5eb4374273457eb4c8338 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 29 Sep 2022 09:28:56 +0200
Subject: net: devlink: introduce port registered assert helper and use it

Instead of checking devlink_port->devlink pointer for not being NULL
which indicates that devlink port is registered, put this check to new
pair of helpers similar to what we have for devlink and use them in
other functions.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/devlink.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 7776dc82f88d..f5bfbdb0301e 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -371,6 +371,11 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
 	return ERR_PTR(-ENODEV);
 }
 
+#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port)				\
+	WARN_ON_ONCE(!(devlink_port)->devlink)
+#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port)			\
+	WARN_ON_ONCE((devlink_port)->devlink)
+
 static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
 						      unsigned int port_index)
 {
@@ -9869,7 +9874,8 @@ int devl_port_register(struct devlink *devlink,
 	if (devlink_port_index_exists(devlink, port_index))
 		return -EEXIST;
 
-	WARN_ON(devlink_port->devlink);
+	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	spin_lock_init(&devlink_port->type_lock);
@@ -9952,8 +9958,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
 				    enum devlink_port_type type,
 				    void *type_dev)
 {
-	if (WARN_ON(!devlink_port->devlink))
-		return;
+	ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
 	devlink_port_type_warn_cancel(devlink_port);
 	spin_lock_bh(&devlink_port->type_lock);
 	devlink_port->type = type;
@@ -10072,8 +10078,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
 {
 	int ret;
 
-	if (WARN_ON(devlink_port->devlink))
-		return;
+	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
 	devlink_port->attrs = *attrs;
 	ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
 	if (ret)
@@ -10096,8 +10102,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 	int ret;
 
-	if (WARN_ON(devlink_port->devlink))
-		return;
+	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
 	ret = __devlink_port_attrs_set(devlink_port,
 				       DEVLINK_PORT_FLAVOUR_PCI_PF);
 	if (ret)
@@ -10123,8 +10129,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 	int ret;
 
-	if (WARN_ON(devlink_port->devlink))
-		return;
+	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
 	ret = __devlink_port_attrs_set(devlink_port,
 				       DEVLINK_PORT_FLAVOUR_PCI_VF);
 	if (ret)
@@ -10151,8 +10157,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
 	struct devlink_port_attrs *attrs = &devlink_port->attrs;
 	int ret;
 
-	if (WARN_ON(devlink_port->devlink))
-		return;
+	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
 	ret = __devlink_port_attrs_set(devlink_port,
 				       DEVLINK_PORT_FLAVOUR_PCI_SF);
 	if (ret)
@@ -10267,8 +10273,8 @@ EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
 void devlink_port_linecard_set(struct devlink_port *devlink_port,
 			       struct devlink_linecard *linecard)
 {
-	if (WARN_ON(devlink_port->devlink))
-		return;
+	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
 	devlink_port->linecard = linecard;
 }
 EXPORT_SYMBOL_GPL(devlink_port_linecard_set);
-- 
cgit v1.2.3


From 081adcfe930e4b01a55eaa329b2e453a442f35a9 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 29 Sep 2022 09:28:57 +0200
Subject: net: devlink: introduce a flag to indicate devlink port being
 registered

Instead of relying on devlink pointer not being initialized, introduce
an extra flag to indicate if devlink port is registered. This is needed
as later on devlink pointer is going to be initialized even in case
devlink port is not registered yet.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h | 3 ++-
 net/core/devlink.c    | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index 264aa98e6da6..bcacd8dab297 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -129,7 +129,8 @@ struct devlink_port {
 	void *type_dev;
 	struct devlink_port_attrs attrs;
 	u8 attrs_set:1,
-	   switch_port:1;
+	   switch_port:1,
+	   registered:1;
 	struct delayed_work type_warn_dw;
 	struct list_head reporter_list;
 	struct mutex reporters_lock; /* Protects reporter_list */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index f5bfbdb0301e..17529e6b2bbf 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -372,9 +372,9 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
 }
 
 #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port)				\
-	WARN_ON_ONCE(!(devlink_port)->devlink)
+	WARN_ON_ONCE(!(devlink_port)->registered)
 #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port)			\
-	WARN_ON_ONCE((devlink_port)->devlink)
+	WARN_ON_ONCE((devlink_port)->registered)
 
 static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
 						      unsigned int port_index)
@@ -9876,6 +9876,7 @@ int devl_port_register(struct devlink *devlink,
 
 	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
 
+	devlink_port->registered = true;
 	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	spin_lock_init(&devlink_port->type_lock);
@@ -9934,6 +9935,7 @@ void devl_port_unregister(struct devlink_port *devlink_port)
 	WARN_ON(!list_empty(&devlink_port->reporter_list));
 	WARN_ON(!list_empty(&devlink_port->region_list));
 	mutex_destroy(&devlink_port->reporters_lock);
+	devlink_port->registered = false;
 }
 EXPORT_SYMBOL_GPL(devl_port_unregister);
 
-- 
cgit v1.2.3


From ae3bbc04d4bfef5d0332cd4edda3ac8f714cea23 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 29 Sep 2022 09:28:58 +0200
Subject: net: devlink: add port_init/fini() helpers to allow
 pre-register/post-unregister functions

Lifetime of some of the devlink objects, like regions, is currently
forced to be different for devlink instance and devlink port instance
(per-port regions). The reason is that for devlink ports, the internal
structures initialization happens only after devlink_port_register() is
called.

To resolve this inconsistency, introduce new set of helpers to allow
driver to initialize devlink pointer and region list before
devlink_register() is called. That allows port regions to be created
before devlink port registration and destroyed after devlink
port unregistration.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/devlink.h |  6 +++++-
 net/core/devlink.c    | 46 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index bcacd8dab297..ba6b8b094943 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -130,7 +130,8 @@ struct devlink_port {
 	struct devlink_port_attrs attrs;
 	u8 attrs_set:1,
 	   switch_port:1,
-	   registered:1;
+	   registered:1,
+	   initialized:1;
 	struct delayed_work type_warn_dw;
 	struct list_head reporter_list;
 	struct mutex reporters_lock; /* Protects reporter_list */
@@ -1563,6 +1564,9 @@ void devlink_set_features(struct devlink *devlink, u64 features);
 void devlink_register(struct devlink *devlink);
 void devlink_unregister(struct devlink *devlink);
 void devlink_free(struct devlink *devlink);
+void devlink_port_init(struct devlink *devlink,
+		       struct devlink_port *devlink_port);
+void devlink_port_fini(struct devlink_port *devlink_port);
 int devl_port_register(struct devlink *devlink,
 		       struct devlink_port *devlink_port,
 		       unsigned int port_index);
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 17529e6b2bbf..89baa7c0938b 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -375,6 +375,8 @@ static struct devlink *devlink_get_from_attrs(struct net *net,
 	WARN_ON_ONCE(!(devlink_port)->registered)
 #define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port)			\
 	WARN_ON_ONCE((devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port)				\
+	WARN_ON_ONCE(!(devlink_port)->initialized)
 
 static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
 						      unsigned int port_index)
@@ -9852,6 +9854,44 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
 	cancel_delayed_work_sync(&devlink_port->type_warn_dw);
 }
 
+/**
+ * devlink_port_init() - Init devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ *
+ * Initialize essencial stuff that is needed for functions
+ * that may be called before devlink port registration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_init(struct devlink *devlink,
+		       struct devlink_port *devlink_port)
+{
+	if (devlink_port->initialized)
+		return;
+	devlink_port->devlink = devlink;
+	INIT_LIST_HEAD(&devlink_port->region_list);
+	devlink_port->initialized = true;
+}
+EXPORT_SYMBOL_GPL(devlink_port_init);
+
+/**
+ * devlink_port_fini() - Deinitialize devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Deinitialize essencial stuff that is in use for functions
+ * that may be called after devlink port unregistration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_fini(struct devlink_port *devlink_port)
+{
+	WARN_ON(!list_empty(&devlink_port->region_list));
+}
+EXPORT_SYMBOL_GPL(devlink_port_fini);
+
 /**
  * devl_port_register() - Register devlink port
  *
@@ -9876,14 +9916,13 @@ int devl_port_register(struct devlink *devlink,
 
 	ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
 
+	devlink_port_init(devlink, devlink_port);
 	devlink_port->registered = true;
-	devlink_port->devlink = devlink;
 	devlink_port->index = port_index;
 	spin_lock_init(&devlink_port->type_lock);
 	INIT_LIST_HEAD(&devlink_port->reporter_list);
 	mutex_init(&devlink_port->reporters_lock);
 	list_add_tail(&devlink_port->list, &devlink->port_list);
-	INIT_LIST_HEAD(&devlink_port->region_list);
 
 	INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
 	devlink_port_type_warn_schedule(devlink_port);
@@ -9933,7 +9972,6 @@ void devl_port_unregister(struct devlink_port *devlink_port)
 	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
 	list_del(&devlink_port->list);
 	WARN_ON(!list_empty(&devlink_port->reporter_list));
-	WARN_ON(!list_empty(&devlink_port->region_list));
 	mutex_destroy(&devlink_port->reporters_lock);
 	devlink_port->registered = false;
 }
@@ -11347,6 +11385,8 @@ devlink_port_region_create(struct devlink_port *port,
 	struct devlink_region *region;
 	int err = 0;
 
+	ASSERT_DEVLINK_PORT_INITIALIZED(port);
+
 	if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
 		return ERR_PTR(-EINVAL);
 
-- 
cgit v1.2.3


From d82acd85cc41a8e5d5e0e4c2a3f4b645def29723 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 29 Sep 2022 09:28:59 +0200
Subject: net: dsa: move port_setup/teardown to be called outside devlink port
 registered area

Move port_setup() op to be called before devlink_port_register() and
port_teardown() after devlink_port_unregister().

Note it makes sense to move this alongside the rest of the devlink port
code, the reinit() function also gets much nicer, as clearly the fact that
port_setup()->devlink_port_region_create() was called in dsa_port_setup
did not fit the flow.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/dsa/dsa2.c | 68 ++++++++++++++++++++++------------------------------------
 1 file changed, 26 insertions(+), 42 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 7024e2120de1..6f555b1bb483 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -472,12 +472,6 @@ static int dsa_port_setup(struct dsa_port *dp)
 	if (dp->setup)
 		return 0;
 
-	if (ds->ops->port_setup) {
-		err = ds->ops->port_setup(ds, dp->index);
-		if (err)
-			return err;
-	}
-
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:
 		dsa_port_disable(dp);
@@ -532,11 +526,8 @@ static int dsa_port_setup(struct dsa_port *dp)
 		dsa_port_disable(dp);
 	if (err && dsa_port_link_registered)
 		dsa_shared_port_link_unregister_of(dp);
-	if (err) {
-		if (ds->ops->port_teardown)
-			ds->ops->port_teardown(ds, dp->index);
+	if (err)
 		return err;
-	}
 
 	dp->setup = true;
 
@@ -549,17 +540,26 @@ static int dsa_port_devlink_setup(struct dsa_port *dp)
 	struct dsa_switch_tree *dst = dp->ds->dst;
 	struct devlink_port_attrs attrs = {};
 	struct devlink *dl = dp->ds->devlink;
+	struct dsa_switch *ds = dp->ds;
 	const unsigned char *id;
 	unsigned char len;
 	int err;
 
+	memset(dlp, 0, sizeof(*dlp));
+	devlink_port_init(dl, dlp);
+
+	if (ds->ops->port_setup) {
+		err = ds->ops->port_setup(ds, dp->index);
+		if (err)
+			return err;
+	}
+
 	id = (const unsigned char *)&dst->index;
 	len = sizeof(dst->index);
 
 	attrs.phys.port_number = dp->index;
 	memcpy(attrs.switch_id.id, id, len);
 	attrs.switch_id.id_len = len;
-	memset(dlp, 0, sizeof(*dlp));
 
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:
@@ -578,24 +578,23 @@ static int dsa_port_devlink_setup(struct dsa_port *dp)
 
 	devlink_port_attrs_set(dlp, &attrs);
 	err = devlink_port_register(dl, dlp, dp->index);
+	if (err) {
+		if (ds->ops->port_teardown)
+			ds->ops->port_teardown(ds, dp->index);
+		return err;
+	}
+	dp->devlink_port_setup = true;
 
-	if (!err)
-		dp->devlink_port_setup = true;
-
-	return err;
+	return 0;
 }
 
 static void dsa_port_teardown(struct dsa_port *dp)
 {
 	struct devlink_port *dlp = &dp->devlink_port;
-	struct dsa_switch *ds = dp->ds;
 
 	if (!dp->setup)
 		return;
 
-	if (ds->ops->port_teardown)
-		ds->ops->port_teardown(ds, dp->index);
-
 	devlink_port_type_clear(dlp);
 
 	switch (dp->type) {
@@ -625,40 +624,25 @@ static void dsa_port_teardown(struct dsa_port *dp)
 static void dsa_port_devlink_teardown(struct dsa_port *dp)
 {
 	struct devlink_port *dlp = &dp->devlink_port;
+	struct dsa_switch *ds = dp->ds;
 
-	if (dp->devlink_port_setup)
+	if (dp->devlink_port_setup) {
 		devlink_port_unregister(dlp);
+		if (ds->ops->port_teardown)
+			ds->ops->port_teardown(ds, dp->index);
+		devlink_port_fini(dlp);
+	}
 	dp->devlink_port_setup = false;
 }
 
 /* Destroy the current devlink port, and create a new one which has the UNUSED
- * flavour. At this point, any call to ds->ops->port_setup has been already
- * balanced out by a call to ds->ops->port_teardown, so we know that any
- * devlink port regions the driver had are now unregistered. We then call its
- * ds->ops->port_setup again, in order for the driver to re-create them on the
- * new devlink port.
+ * flavour.
  */
 static int dsa_port_reinit_as_unused(struct dsa_port *dp)
 {
-	struct dsa_switch *ds = dp->ds;
-	int err;
-
 	dsa_port_devlink_teardown(dp);
 	dp->type = DSA_PORT_TYPE_UNUSED;
-	err = dsa_port_devlink_setup(dp);
-	if (err)
-		return err;
-
-	if (ds->ops->port_setup) {
-		/* On error, leave the devlink port registered,
-		 * dsa_switch_teardown will clean it up later.
-		 */
-		err = ds->ops->port_setup(ds, dp->index);
-		if (err)
-			return err;
-	}
-
-	return 0;
+	return dsa_port_devlink_setup(dp);
 }
 
 static int dsa_devlink_info_get(struct devlink *dl,
-- 
cgit v1.2.3


From cf5ca4ddc37a693b17fdb653cb84b920b1185d71 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 29 Sep 2022 09:29:00 +0200
Subject: net: dsa: don't leave dangling pointers in dp->pl when failing

There is a desire to simplify the dsa_port registration path with
devlink, and this involves reworking a bit how user ports which fail to
connect to their PHY (because it's missing) get reinitialized as UNUSED
devlink ports.

The desire is for the change to look something like this; basically
dsa_port_setup() has failed, we just change dp->type and call
dsa_port_setup() again.

-/* Destroy the current devlink port, and create a new one which has the UNUSED
- * flavour.
- */
-static int dsa_port_reinit_as_unused(struct dsa_port *dp)
+static int dsa_port_setup_as_unused(struct dsa_port *dp)
 {
-	dsa_port_devlink_teardown(dp);
 	dp->type = DSA_PORT_TYPE_UNUSED;
-	return dsa_port_devlink_setup(dp);
+	return dsa_port_setup(dp);
 }

For an UNUSED port, dsa_port_setup() mostly only calls dsa_port_devlink_setup()
anyway, so we could get away with calling just that. But if we call the
full blown dsa_port_setup(dp) (which will be needed to properly set
dp->setup = true), the callee will have the tendency to go through this
code block too, and call dsa_port_disable(dp):

	switch (dp->type) {
	case DSA_PORT_TYPE_UNUSED:
		dsa_port_disable(dp);
		break;

That is not very good, because dsa_port_disable() has this hidden inside
of it:

	if (dp->pl)
		phylink_stop(dp->pl);

Fact is, we are not prepared to handle a call to dsa_port_disable() with
a struct dsa_port that came from a previous (and failed) call to
dsa_port_setup(). We do not clean up dp->pl, and this will make the
second call to dsa_port_setup() call phylink_stop() on a dangling dp->pl
pointer.

Solve this by creating an API for phylink destruction which is symmetric
to the phylink creation, and never leave dp->pl set to anything except
NULL or a valid phylink structure.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/dsa/dsa_priv.h |  1 +
 net/dsa/port.c     | 22 +++++++++++++++-------
 net/dsa/slave.c    |  6 +++---
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 129e4a649c7e..6e65c7ffd6f3 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -294,6 +294,7 @@ int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
 int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
 			       const struct switchdev_obj_ring_role_mrp *mrp);
 int dsa_port_phylink_create(struct dsa_port *dp);
+void dsa_port_phylink_destroy(struct dsa_port *dp);
 int dsa_shared_port_link_register_of(struct dsa_port *dp);
 void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
 int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr);
diff --git a/net/dsa/port.c b/net/dsa/port.c
index e6289a1db0a0..e4a0513816bb 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1661,6 +1661,7 @@ int dsa_port_phylink_create(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
 	phy_interface_t mode;
+	struct phylink *pl;
 	int err;
 
 	err = of_get_phy_mode(dp->dn, &mode);
@@ -1677,16 +1678,24 @@ int dsa_port_phylink_create(struct dsa_port *dp)
 	if (ds->ops->phylink_get_caps)
 		ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config);
 
-	dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn),
-				mode, &dsa_port_phylink_mac_ops);
-	if (IS_ERR(dp->pl)) {
+	pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn),
+			    mode, &dsa_port_phylink_mac_ops);
+	if (IS_ERR(pl)) {
 		pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
-		return PTR_ERR(dp->pl);
+		return PTR_ERR(pl);
 	}
 
+	dp->pl = pl;
+
 	return 0;
 }
 
+void dsa_port_phylink_destroy(struct dsa_port *dp)
+{
+	phylink_destroy(dp->pl);
+	dp->pl = NULL;
+}
+
 static int dsa_shared_port_setup_phy_of(struct dsa_port *dp, bool enable)
 {
 	struct dsa_switch *ds = dp->ds;
@@ -1781,7 +1790,7 @@ static int dsa_shared_port_phylink_register(struct dsa_port *dp)
 	return 0;
 
 err_phy_connect:
-	phylink_destroy(dp->pl);
+	dsa_port_phylink_destroy(dp);
 	return err;
 }
 
@@ -1983,8 +1992,7 @@ void dsa_shared_port_link_unregister_of(struct dsa_port *dp)
 		rtnl_lock();
 		phylink_disconnect_phy(dp->pl);
 		rtnl_unlock();
-		phylink_destroy(dp->pl);
-		dp->pl = NULL;
+		dsa_port_phylink_destroy(dp);
 		return;
 	}
 
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index aa47ddc19fdf..1a59918d3b30 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2304,7 +2304,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev)
 	if (ret) {
 		netdev_err(slave_dev, "failed to connect to PHY: %pe\n",
 			   ERR_PTR(ret));
-		phylink_destroy(dp->pl);
+		dsa_port_phylink_destroy(dp);
 	}
 
 	return ret;
@@ -2476,7 +2476,7 @@ out_phy:
 	rtnl_lock();
 	phylink_disconnect_phy(p->dp->pl);
 	rtnl_unlock();
-	phylink_destroy(p->dp->pl);
+	dsa_port_phylink_destroy(p->dp);
 out_gcells:
 	gro_cells_destroy(&p->gcells);
 out_free:
@@ -2499,7 +2499,7 @@ void dsa_slave_destroy(struct net_device *slave_dev)
 	phylink_disconnect_phy(dp->pl);
 	rtnl_unlock();
 
-	phylink_destroy(dp->pl);
+	dsa_port_phylink_destroy(dp);
 	gro_cells_destroy(&p->gcells);
 	free_percpu(slave_dev->tstats);
 	free_netdev(slave_dev);
-- 
cgit v1.2.3


From c698a5fbf7fd9f5bb909d09626319b59d55db36b Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Thu, 29 Sep 2022 09:29:01 +0200
Subject: net: dsa: don't do devlink port setup early

Commit 3122433eb533 ("net: dsa: Register devlink ports before calling DSA driver setup()")
moved devlink port setup to be done early before driver setup()
was called. That is no longer needed, so move the devlink port
initialization back to dsa_port_setup(), as the first thing done there.

Note there is no longer needed to reinit port as unused if
dsa_port_setup() fails, as it unregisters the devlink port instance on
the error path.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/dsa/dsa2.c | 176 +++++++++++++++++++++++++++------------------------------
 1 file changed, 82 insertions(+), 94 deletions(-)

diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 6f555b1bb483..747c0364fb0f 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -461,6 +461,74 @@ static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst)
 			dp->cpu_dp = NULL;
 }
 
+static int dsa_port_devlink_setup(struct dsa_port *dp)
+{
+	struct devlink_port *dlp = &dp->devlink_port;
+	struct dsa_switch_tree *dst = dp->ds->dst;
+	struct devlink_port_attrs attrs = {};
+	struct devlink *dl = dp->ds->devlink;
+	struct dsa_switch *ds = dp->ds;
+	const unsigned char *id;
+	unsigned char len;
+	int err;
+
+	memset(dlp, 0, sizeof(*dlp));
+	devlink_port_init(dl, dlp);
+
+	if (ds->ops->port_setup) {
+		err = ds->ops->port_setup(ds, dp->index);
+		if (err)
+			return err;
+	}
+
+	id = (const unsigned char *)&dst->index;
+	len = sizeof(dst->index);
+
+	attrs.phys.port_number = dp->index;
+	memcpy(attrs.switch_id.id, id, len);
+	attrs.switch_id.id_len = len;
+
+	switch (dp->type) {
+	case DSA_PORT_TYPE_UNUSED:
+		attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
+		break;
+	case DSA_PORT_TYPE_CPU:
+		attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
+		break;
+	case DSA_PORT_TYPE_DSA:
+		attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
+		break;
+	case DSA_PORT_TYPE_USER:
+		attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+		break;
+	}
+
+	devlink_port_attrs_set(dlp, &attrs);
+	err = devlink_port_register(dl, dlp, dp->index);
+	if (err) {
+		if (ds->ops->port_teardown)
+			ds->ops->port_teardown(ds, dp->index);
+		return err;
+	}
+	dp->devlink_port_setup = true;
+
+	return 0;
+}
+
+static void dsa_port_devlink_teardown(struct dsa_port *dp)
+{
+	struct devlink_port *dlp = &dp->devlink_port;
+	struct dsa_switch *ds = dp->ds;
+
+	if (dp->devlink_port_setup) {
+		devlink_port_unregister(dlp);
+		if (ds->ops->port_teardown)
+			ds->ops->port_teardown(ds, dp->index);
+		devlink_port_fini(dlp);
+	}
+	dp->devlink_port_setup = false;
+}
+
 static int dsa_port_setup(struct dsa_port *dp)
 {
 	struct devlink_port *dlp = &dp->devlink_port;
@@ -472,6 +540,10 @@ static int dsa_port_setup(struct dsa_port *dp)
 	if (dp->setup)
 		return 0;
 
+	err = dsa_port_devlink_setup(dp);
+	if (err)
+		return err;
+
 	switch (dp->type) {
 	case DSA_PORT_TYPE_UNUSED:
 		dsa_port_disable(dp);
@@ -526,64 +598,12 @@ static int dsa_port_setup(struct dsa_port *dp)
 		dsa_port_disable(dp);
 	if (err && dsa_port_link_registered)
 		dsa_shared_port_link_unregister_of(dp);
-	if (err)
-		return err;
-
-	dp->setup = true;
-
-	return 0;
-}
-
-static int dsa_port_devlink_setup(struct dsa_port *dp)
-{
-	struct devlink_port *dlp = &dp->devlink_port;
-	struct dsa_switch_tree *dst = dp->ds->dst;
-	struct devlink_port_attrs attrs = {};
-	struct devlink *dl = dp->ds->devlink;
-	struct dsa_switch *ds = dp->ds;
-	const unsigned char *id;
-	unsigned char len;
-	int err;
-
-	memset(dlp, 0, sizeof(*dlp));
-	devlink_port_init(dl, dlp);
-
-	if (ds->ops->port_setup) {
-		err = ds->ops->port_setup(ds, dp->index);
-		if (err)
-			return err;
-	}
-
-	id = (const unsigned char *)&dst->index;
-	len = sizeof(dst->index);
-
-	attrs.phys.port_number = dp->index;
-	memcpy(attrs.switch_id.id, id, len);
-	attrs.switch_id.id_len = len;
-
-	switch (dp->type) {
-	case DSA_PORT_TYPE_UNUSED:
-		attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
-		break;
-	case DSA_PORT_TYPE_CPU:
-		attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
-		break;
-	case DSA_PORT_TYPE_DSA:
-		attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
-		break;
-	case DSA_PORT_TYPE_USER:
-		attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
-		break;
-	}
-
-	devlink_port_attrs_set(dlp, &attrs);
-	err = devlink_port_register(dl, dlp, dp->index);
 	if (err) {
-		if (ds->ops->port_teardown)
-			ds->ops->port_teardown(ds, dp->index);
+		dsa_port_devlink_teardown(dp);
 		return err;
 	}
-	dp->devlink_port_setup = true;
+
+	dp->setup = true;
 
 	return 0;
 }
@@ -618,31 +638,15 @@ static void dsa_port_teardown(struct dsa_port *dp)
 		break;
 	}
 
-	dp->setup = false;
-}
-
-static void dsa_port_devlink_teardown(struct dsa_port *dp)
-{
-	struct devlink_port *dlp = &dp->devlink_port;
-	struct dsa_switch *ds = dp->ds;
+	dsa_port_devlink_teardown(dp);
 
-	if (dp->devlink_port_setup) {
-		devlink_port_unregister(dlp);
-		if (ds->ops->port_teardown)
-			ds->ops->port_teardown(ds, dp->index);
-		devlink_port_fini(dlp);
-	}
-	dp->devlink_port_setup = false;
+	dp->setup = false;
 }
 
-/* Destroy the current devlink port, and create a new one which has the UNUSED
- * flavour.
- */
-static int dsa_port_reinit_as_unused(struct dsa_port *dp)
+static int dsa_port_setup_as_unused(struct dsa_port *dp)
 {
-	dsa_port_devlink_teardown(dp);
 	dp->type = DSA_PORT_TYPE_UNUSED;
-	return dsa_port_devlink_setup(dp);
+	return dsa_port_setup(dp);
 }
 
 static int dsa_devlink_info_get(struct devlink *dl,
@@ -866,7 +870,6 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 {
 	struct dsa_devlink_priv *dl_priv;
 	struct device_node *dn;
-	struct dsa_port *dp;
 	int err;
 
 	if (ds->setup)
@@ -889,18 +892,9 @@ static int dsa_switch_setup(struct dsa_switch *ds)
 	dl_priv = devlink_priv(ds->devlink);
 	dl_priv->ds = ds;
 
-	/* Setup devlink port instances now, so that the switch
-	 * setup() can register regions etc, against the ports
-	 */
-	dsa_switch_for_each_port(dp, ds) {
-		err = dsa_port_devlink_setup(dp);
-		if (err)
-			goto unregister_devlink_ports;
-	}
-
 	err = dsa_switch_register_notifier(ds);
 	if (err)
-		goto unregister_devlink_ports;
+		goto devlink_free;
 
 	ds->configure_vlan_while_not_filtering = true;
 
@@ -941,9 +935,7 @@ teardown:
 		ds->ops->teardown(ds);
 unregister_notifier:
 	dsa_switch_unregister_notifier(ds);
-unregister_devlink_ports:
-	dsa_switch_for_each_port(dp, ds)
-		dsa_port_devlink_teardown(dp);
+devlink_free:
 	devlink_free(ds->devlink);
 	ds->devlink = NULL;
 	return err;
@@ -951,8 +943,6 @@ unregister_devlink_ports:
 
 static void dsa_switch_teardown(struct dsa_switch *ds)
 {
-	struct dsa_port *dp;
-
 	if (!ds->setup)
 		return;
 
@@ -971,8 +961,6 @@ static void dsa_switch_teardown(struct dsa_switch *ds)
 	dsa_switch_unregister_notifier(ds);
 
 	if (ds->devlink) {
-		dsa_switch_for_each_port(dp, ds)
-			dsa_port_devlink_teardown(dp);
 		devlink_free(ds->devlink);
 		ds->devlink = NULL;
 	}
@@ -1025,7 +1013,7 @@ static int dsa_tree_setup_ports(struct dsa_switch_tree *dst)
 		if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) {
 			err = dsa_port_setup(dp);
 			if (err) {
-				err = dsa_port_reinit_as_unused(dp);
+				err = dsa_port_setup_as_unused(dp);
 				if (err)
 					goto teardown;
 			}
-- 
cgit v1.2.3


From 61e4a51621587c939672d6a9354f6d0aa3d4e131 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Thu, 29 Sep 2022 09:29:02 +0200
Subject: net: dsa: remove bool devlink_port_setup

Since dsa_port_devlink_setup() and dsa_port_devlink_teardown() are
already called from code paths which only execute once per port (due to
the existing bool dp->setup), keeping another dp->devlink_port_setup is
redundant, because we can already manage to balance the calls properly
(and not call teardown when setup was never called, or call setup twice,
or things like that).

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h |  2 --
 net/dsa/dsa2.c    | 14 ++++++--------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/include/net/dsa.h b/include/net/dsa.h
index d777eac5694f..ee369670e20e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -294,8 +294,6 @@ struct dsa_port {
 
 	u8			lag_tx_enabled:1;
 
-	u8			devlink_port_setup:1;
-
 	/* Master state bits, valid only on CPU ports */
 	u8			master_admin_up:1;
 	u8			master_oper_up:1;
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index 747c0364fb0f..af0e2c0394ac 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -510,7 +510,6 @@ static int dsa_port_devlink_setup(struct dsa_port *dp)
 			ds->ops->port_teardown(ds, dp->index);
 		return err;
 	}
-	dp->devlink_port_setup = true;
 
 	return 0;
 }
@@ -520,13 +519,12 @@ static void dsa_port_devlink_teardown(struct dsa_port *dp)
 	struct devlink_port *dlp = &dp->devlink_port;
 	struct dsa_switch *ds = dp->ds;
 
-	if (dp->devlink_port_setup) {
-		devlink_port_unregister(dlp);
-		if (ds->ops->port_teardown)
-			ds->ops->port_teardown(ds, dp->index);
-		devlink_port_fini(dlp);
-	}
-	dp->devlink_port_setup = false;
+	devlink_port_unregister(dlp);
+
+	if (ds->ops->port_teardown)
+		ds->ops->port_teardown(ds, dp->index);
+
+	devlink_port_fini(dlp);
 }
 
 static int dsa_port_setup(struct dsa_port *dp)
-- 
cgit v1.2.3


From 70d5ab532059f06c2833b0e4ffae8a785c3692df Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Thu, 29 Sep 2022 13:22:55 -0700
Subject: dt-bindings: nfc: marvell,nci: fix reset line polarity in examples

The reset line is supposed to be "active low" (it even says so in the
description), but examples incorrectly show it as "active high"
(likely because original examples use 0 which is technically "active
high" but in practice often "don't care" if the driver is using legacy
gpio API, as this one does).

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/YzX+nzJolxAKmt+z@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/devicetree/bindings/net/nfc/marvell,nci.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/nfc/marvell,nci.yaml b/Documentation/devicetree/bindings/net/nfc/marvell,nci.yaml
index a191a04e681c..308485a8ee6c 100644
--- a/Documentation/devicetree/bindings/net/nfc/marvell,nci.yaml
+++ b/Documentation/devicetree/bindings/net/nfc/marvell,nci.yaml
@@ -128,7 +128,7 @@ examples:
 
             i2c-int-rising;
 
-            reset-n-io = <&gpio3 19 GPIO_ACTIVE_HIGH>;
+            reset-n-io = <&gpio3 19 GPIO_ACTIVE_LOW>;
         };
     };
 
@@ -151,7 +151,7 @@ examples:
             interrupt-parent = <&gpio1>;
             interrupts = <17 IRQ_TYPE_EDGE_RISING>;
 
-            reset-n-io = <&gpio3 19 GPIO_ACTIVE_HIGH>;
+            reset-n-io = <&gpio3 19 GPIO_ACTIVE_LOW>;
         };
     };
 
@@ -162,7 +162,7 @@ examples:
         nfc {
             compatible = "marvell,nfc-uart";
 
-            reset-n-io = <&gpio3 16 GPIO_ACTIVE_HIGH>;
+            reset-n-io = <&gpio3 16 GPIO_ACTIVE_LOW>;
 
             hci-muxed;
             flow-control;
-- 
cgit v1.2.3


From f77a9f3cd1e669b83d575e1f4df69c0079704842 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Thu, 29 Sep 2022 12:19:09 +0800
Subject: selftests/tc-testing: update qdisc/cls/action features in config

Since three patchsets "add tc-testing test cases", "refactor duplicate
codes in the tc cls walk function", and "refactor duplicate codes in the
qdisc class walk function" are merged to net-next tree, the list of
supported features needs to be updated in config file.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/20220929041909.83913-1-shaozhengchao@huawei.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 tools/testing/selftests/tc-testing/config | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
index 2b2c2a835757..4638c63a339f 100644
--- a/tools/testing/selftests/tc-testing/config
+++ b/tools/testing/selftests/tc-testing/config
@@ -13,15 +13,28 @@ CONFIG_NET_SCHED=y
 # Queueing/Scheduling
 #
 CONFIG_NET_SCH_ATM=m
+CONFIG_NET_SCH_CAKE=m
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_CBS=m
 CONFIG_NET_SCH_CHOKE=m
 CONFIG_NET_SCH_CODEL=m
+CONFIG_NET_SCH_DRR=m
+CONFIG_NET_SCH_DSMARK=m
 CONFIG_NET_SCH_ETF=m
 CONFIG_NET_SCH_FQ=m
+CONFIG_NET_SCH_FQ_CODEL=m
 CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_HFSC=m
 CONFIG_NET_SCH_HHF=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_SCH_MQPRIO=m
+CONFIG_NET_SCH_MULTIQ=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_PIE=m
 CONFIG_NET_SCH_PLUG=m
 CONFIG_NET_SCH_PRIO=m
-CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_SCH_QFQ=m
 CONFIG_NET_SCH_SFB=m
 CONFIG_NET_SCH_SFQ=m
 CONFIG_NET_SCH_SKBPRIO=m
@@ -37,6 +50,15 @@ CONFIG_NET_CLS_FW=m
 CONFIG_NET_CLS_U32=m
 CONFIG_CLS_U32_PERF=y
 CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_BPF=m
+CONFIG_NET_CLS_CGROUP=m
+CONFIG_NET_CLS_FLOW=m
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_TCINDEX=m
 CONFIG_NET_EMATCH=y
 CONFIG_NET_EMATCH_STACK=32
 CONFIG_NET_EMATCH_CMP=m
@@ -68,6 +90,7 @@ CONFIG_NET_ACT_IFE=m
 CONFIG_NET_ACT_TUNNEL_KEY=m
 CONFIG_NET_ACT_CT=m
 CONFIG_NET_ACT_MPLS=m
+CONFIG_NET_ACT_GATE=m
 CONFIG_NET_IFE_SKBMARK=m
 CONFIG_NET_IFE_SKBPRIO=m
 CONFIG_NET_IFE_SKBTCINDEX=m
-- 
cgit v1.2.3


From b674deddeb49fd37b8f20a15b658abde6671e2a8 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Mon, 19 Sep 2022 09:52:55 +0200
Subject: MAINTAINERS: drop entry to removed file in ARM/RISCPC ARCHITECTURE

Commit c1fe8d054c0a ("ARM: riscpc: use GENERIC_IRQ_MULTI_HANDLER") removes
arch/arm/include/asm/hardware/entry-macro-iomd.S, but missed to adjust
MAINTAINERS.

Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a
broken reference.

Drop the file entry to the removed file in ARM/RISCPC ARCHITECTURE.

Link: https://lkml.kernel.org/r/20220919075255.386-1-lukas.bulwahn@gmail.com
Fixes: c1fe8d054c0a ("ARM: riscpc: use GENERIC_IRQ_MULTI_HANDLER")
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Russell King <linux@armlinux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 589517372408..b8dc03c66484 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2670,7 +2670,6 @@ M:	Russell King <linux@armlinux.org.uk>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 W:	http://www.armlinux.org.uk/
-F:	arch/arm/include/asm/hardware/entry-macro-iomd.S
 F:	arch/arm/include/asm/hardware/ioc.h
 F:	arch/arm/include/asm/hardware/iomd.h
 F:	arch/arm/include/asm/hardware/memc.h
-- 
cgit v1.2.3


From 30c19366636f72515679aa10dad61a4d988d4c9a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 26 Sep 2022 17:16:50 +0200
Subject: mm: fix BUG splat with kvmalloc + GFP_ATOMIC

Martin Zaharinov reports BUG with 5.19.10 kernel:
 kernel BUG at mm/vmalloc.c:2437!
 invalid opcode: 0000 [#1] SMP
 CPU: 28 PID: 0 Comm: swapper/28 Tainted: G        W  O      5.19.9 #1
 [..]
 RIP: 0010:__get_vm_area_node+0x120/0x130
  __vmalloc_node_range+0x96/0x1e0
  kvmalloc_node+0x92/0xb0
  bucket_table_alloc.isra.0+0x47/0x140
  rhashtable_try_insert+0x3a4/0x440
  rhashtable_insert_slow+0x1b/0x30
 [..]

bucket_table_alloc uses kvzalloc(GPF_ATOMIC).  If kmalloc fails, this now
falls through to vmalloc and hits code paths that assume GFP_KERNEL.

Link: https://lkml.kernel.org/r/20220926151650.15293-1-fw@strlen.de
Fixes: a421ef303008 ("mm: allow !GFP_KERNEL allocations for kvmalloc")
Signed-off-by: Florian Westphal <fw@strlen.de>
Suggested-by: Michal Hocko <mhocko@suse.com>
Link: https://lore.kernel.org/linux-mm/Yy3MS2uhSgjF47dy@pc636/T/#t
Acked-by: Michal Hocko <mhocko@suse.com>
Reported-by: Martin Zaharinov <micron10@gmail.com>
Cc: Uladzislau Rezki (Sony) <urezki@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/util.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mm/util.c b/mm/util.c
index c9439c66d8cf..346e40177bc6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -619,6 +619,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 	if (ret || size <= PAGE_SIZE)
 		return ret;
 
+	/* non-sleeping allocations are not supported by vmalloc */
+	if (!gfpflags_allow_blocking(flags))
+		return NULL;
+
 	/* Don't even allow crazy sizes */
 	if (unlikely(size > INT_MAX)) {
 		WARN_ON_ONCE(!(flags & __GFP_NOWARN));
-- 
cgit v1.2.3


From 1c8e2349f2d033f634d046063b704b2ca6c46972 Mon Sep 17 00:00:00 2001
From: Levi Yun <ppbuk5246@gmail.com>
Date: Mon, 26 Sep 2022 16:06:11 +0000
Subject: damon/sysfs: fix possible memleak on damon_sysfs_add_target

When damon_sysfs_add_target couldn't find proper task, New allocated
damon_target structure isn't registered yet, So, it's impossible to free
new allocated one by damon_sysfs_destroy_targets.

By calling damon_add_target as soon as allocating new target, Fix this
possible memory leak.

Link: https://lkml.kernel.org/r/20220926160611.48536-1-sj@kernel.org
Fixes: a61ea561c871 ("mm/damon/sysfs: link DAMON for virtual address spaces monitoring")
Signed-off-by: Levi Yun <ppbuk5246@gmail.com>
Signed-off-by: SeongJae Park <sj@kernel.org>
Reviewed-by: SeongJae Park <sj@kernel.org>
Cc: <stable@vger.kernel.org>	[5.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/damon/sysfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 7488e27c87c3..bdef9682d0a0 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -2182,12 +2182,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
 
 	if (!t)
 		return -ENOMEM;
+	damon_add_target(ctx, t);
 	if (damon_target_has_pid(ctx)) {
 		t->pid = find_get_pid(sys_target->pid);
 		if (!t->pid)
 			goto destroy_targets_out;
 	}
-	damon_add_target(ctx, t);
 	err = damon_sysfs_set_regions(t, sys_target->regions);
 	if (err)
 		goto destroy_targets_out;
-- 
cgit v1.2.3


From fc26e70f8acaa2279cb00c1d15c91ecbe961bd2f Mon Sep 17 00:00:00 2001
From: Yinjun Zhang <yinjun.zhang@corigine.com>
Date: Thu, 29 Sep 2022 10:58:28 +0200
Subject: nfp: add support for reporting active FEC mode

The latest management firmware can now report the active FEC
mode. Adapt driver accordingly so that user can get the active
FEC mode by running command:

  # ethtool --show-fec <intf>

Also correct use of `fec` field.

Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c     | 2 +-
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h     | 2 ++
 drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c | 9 ++++++++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index db58532364b6..d50af23642a2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -996,7 +996,7 @@ nfp_port_get_fecparam(struct net_device *netdev,
 		return 0;
 
 	param->fec = nfp_port_fec_nsp_to_ethtool(eth_port->fec_modes_supported);
-	param->active_fec = nfp_port_fec_nsp_to_ethtool(eth_port->fec);
+	param->active_fec = nfp_port_fec_nsp_to_ethtool(BIT(eth_port->act_fec));
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
index 77d66855be42..52465670a01e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
@@ -132,6 +132,7 @@ enum nfp_eth_fec {
  * @ports.interface:	interface (module) plugged in
  * @ports.media:	media type of the @interface
  * @ports.fec:		forward error correction mode
+ * @ports.act_fec:	active forward error correction mode
  * @ports.aneg:		auto negotiation mode
  * @ports.mac_addr:	interface MAC address
  * @ports.label_port:	port id
@@ -162,6 +163,7 @@ struct nfp_eth_table {
 		enum nfp_eth_media media;
 
 		enum nfp_eth_fec fec;
+		enum nfp_eth_fec act_fec;
 		enum nfp_eth_aneg aneg;
 
 		u8 mac_addr[ETH_ALEN];
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
index 4cc38799eabc..18ba7629cdc2 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
@@ -40,6 +40,7 @@
 #define NSP_ETH_STATE_OVRD_CHNG		BIT_ULL(22)
 #define NSP_ETH_STATE_ANEG		GENMASK_ULL(25, 23)
 #define NSP_ETH_STATE_FEC		GENMASK_ULL(27, 26)
+#define NSP_ETH_STATE_ACT_FEC		GENMASK_ULL(29, 28)
 
 #define NSP_ETH_CTRL_CONFIGURED		BIT_ULL(0)
 #define NSP_ETH_CTRL_ENABLED		BIT_ULL(1)
@@ -170,7 +171,13 @@ nfp_eth_port_translate(struct nfp_nsp *nsp, const union eth_table_entry *src,
 	if (dst->fec_modes_supported)
 		dst->fec_modes_supported |= NFP_FEC_AUTO | NFP_FEC_DISABLED;
 
-	dst->fec = 1 << FIELD_GET(NSP_ETH_STATE_FEC, state);
+	dst->fec = FIELD_GET(NSP_ETH_STATE_FEC, state);
+	dst->act_fec = dst->fec;
+
+	if (nfp_nsp_get_abi_ver_minor(nsp) < 33)
+		return;
+
+	dst->act_fec = FIELD_GET(NSP_ETH_STATE_ACT_FEC, state);
 }
 
 static void
-- 
cgit v1.2.3


From 965dd27d9893f543c014c96e8beb52a8ae8a02a5 Mon Sep 17 00:00:00 2001
From: Yinjun Zhang <yinjun.zhang@corigine.com>
Date: Thu, 29 Sep 2022 10:58:29 +0200
Subject: nfp: avoid halt of driver init process when non-fatal error happens

It's not a fatal error when setting `hwinfo` into management firmware
fails, no need to halt the whole driver initialization process.

Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index e2d4c487e8de..f3852ba8099a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -315,18 +315,17 @@ static int nfp_net_pf_cfg_nsp(struct nfp_pf *pf, bool sp_indiff)
 	int err;
 
 	nsp = nfp_nsp_open(pf->cpp);
-	if (IS_ERR(nsp)) {
-		err = PTR_ERR(nsp);
-		return err;
-	}
+	if (IS_ERR(nsp))
+		return PTR_ERR(nsp);
 
 	snprintf(hwinfo, sizeof(hwinfo), "sp_indiff=%d", sp_indiff);
 	err = nfp_nsp_hwinfo_set(nsp, hwinfo, sizeof(hwinfo));
+	/* Not a fatal error, no need to return error to stop driver from loading */
 	if (err)
 		nfp_warn(pf->cpp, "HWinfo(sp_indiff=%d) set failed: %d\n", sp_indiff, err);
 
 	nfp_nsp_close(nsp);
-	return err;
+	return 0;
 }
 
 static int nfp_net_pf_init_nsp(struct nfp_pf *pf)
-- 
cgit v1.2.3


From b1e4f11e426dba8fd8baa549208e40dbe39c03de Mon Sep 17 00:00:00 2001
From: Yinjun Zhang <yinjun.zhang@corigine.com>
Date: Thu, 29 Sep 2022 10:58:30 +0200
Subject: nfp: refine the ABI of getting `sp_indiff` info

Considering that whether application firmware is indifferent to
port speed is a firmware property instead of port property, now use
a new rtsym to get the property instead of parsing per-port tlv caps.
With this change, relevant code is moved to `nfp_main` layer.

Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c     | 67 ++++++++++++++++++++++-
 drivers/net/ethernet/netronome/nfp/nfp_main.h     |  3 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c |  8 ---
 drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h | 10 +---
 drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 48 +---------------
 5 files changed, 71 insertions(+), 65 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 873429f7a6da..91063f19c97d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -691,6 +691,64 @@ static int nfp_pf_find_rtsyms(struct nfp_pf *pf)
 	return 0;
 }
 
+int nfp_net_pf_get_app_id(struct nfp_pf *pf)
+{
+	return nfp_pf_rtsym_read_optional(pf, "_pf%u_net_app_id",
+					  NFP_APP_CORE_NIC);
+}
+
+static u64 nfp_net_pf_get_app_cap(struct nfp_pf *pf)
+{
+	char name[32];
+	int err = 0;
+	u64 val;
+
+	snprintf(name, sizeof(name), "_pf%u_net_app_cap", nfp_cppcore_pcie_unit(pf->cpp));
+
+	val = nfp_rtsym_read_le(pf->rtbl, name, &err);
+	if (err) {
+		if (err != -ENOENT)
+			nfp_err(pf->cpp, "Unable to read symbol %s\n", name);
+
+		return 0;
+	}
+
+	return val;
+}
+
+static int nfp_pf_cfg_hwinfo(struct nfp_pf *pf, bool sp_indiff)
+{
+	struct nfp_nsp *nsp;
+	char hwinfo[32];
+	int err;
+
+	nsp = nfp_nsp_open(pf->cpp);
+	if (IS_ERR(nsp))
+		return PTR_ERR(nsp);
+
+	snprintf(hwinfo, sizeof(hwinfo), "sp_indiff=%d", sp_indiff);
+	err = nfp_nsp_hwinfo_set(nsp, hwinfo, sizeof(hwinfo));
+	/* Not a fatal error, no need to return error to stop driver from loading */
+	if (err)
+		nfp_warn(pf->cpp, "HWinfo(sp_indiff=%d) set failed: %d\n", sp_indiff, err);
+
+	nfp_nsp_close(nsp);
+	return 0;
+}
+
+static int nfp_pf_nsp_cfg(struct nfp_pf *pf)
+{
+	bool sp_indiff = (nfp_net_pf_get_app_id(pf) == NFP_APP_FLOWER_NIC) ||
+			 (nfp_net_pf_get_app_cap(pf) & NFP_NET_APP_CAP_SP_INDIFF);
+
+	return nfp_pf_cfg_hwinfo(pf, sp_indiff);
+}
+
+static void nfp_pf_nsp_clean(struct nfp_pf *pf)
+{
+	nfp_pf_cfg_hwinfo(pf, false);
+}
+
 static int nfp_pci_probe(struct pci_dev *pdev,
 			 const struct pci_device_id *pci_id)
 {
@@ -791,10 +849,14 @@ static int nfp_pci_probe(struct pci_dev *pdev,
 		goto err_fw_unload;
 	}
 
-	err = nfp_net_pci_probe(pf);
+	err = nfp_pf_nsp_cfg(pf);
 	if (err)
 		goto err_fw_unload;
 
+	err = nfp_net_pci_probe(pf);
+	if (err)
+		goto err_nsp_clean;
+
 	err = nfp_hwmon_register(pf);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to register hwmon info\n");
@@ -805,6 +867,8 @@ static int nfp_pci_probe(struct pci_dev *pdev,
 
 err_net_remove:
 	nfp_net_pci_remove(pf);
+err_nsp_clean:
+	nfp_pf_nsp_clean(pf);
 err_fw_unload:
 	kfree(pf->rtbl);
 	nfp_mip_close(pf->mip);
@@ -844,6 +908,7 @@ static void __nfp_pci_shutdown(struct pci_dev *pdev, bool unload_fw)
 
 	nfp_net_pci_remove(pf);
 
+	nfp_pf_nsp_clean(pf);
 	vfree(pf->dumpspec);
 	kfree(pf->rtbl);
 	nfp_mip_close(pf->mip);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.h b/drivers/net/ethernet/netronome/nfp/nfp_main.h
index 6805af186f1b..afd3edfa2428 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.h
@@ -65,7 +65,6 @@ struct nfp_dumpspec {
  * @num_vfs:		Number of SR-IOV VFs enabled
  * @fw_loaded:		Is the firmware loaded?
  * @unload_fw_on_remove:Do we need to unload firmware on driver removal?
- * @sp_indiff:		Is the firmware indifferent to physical port speed?
  * @ctrl_vnic:		Pointer to the control vNIC if available
  * @mip:		MIP handle
  * @rtbl:		RTsym table
@@ -115,7 +114,6 @@ struct nfp_pf {
 
 	bool fw_loaded;
 	bool unload_fw_on_remove;
-	bool sp_indiff;
 
 	struct nfp_net *ctrl_vnic;
 
@@ -163,6 +161,7 @@ bool nfp_ctrl_tx(struct nfp_net *nn, struct sk_buff *skb);
 
 int nfp_pf_rtsym_read_optional(struct nfp_pf *pf, const char *format,
 			       unsigned int default_val);
+int nfp_net_pf_get_app_id(struct nfp_pf *pf);
 u8 __iomem *
 nfp_pf_map_rtsym(struct nfp_pf *pf, const char *name, const char *sym_fmt,
 		 unsigned int min_size, struct nfp_cpp_area **area);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c
index d81bd8697047..c3a763134e79 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.c
@@ -148,14 +148,6 @@ int nfp_net_tlv_caps_parse(struct device *dev, u8 __iomem *ctrl_mem,
 							  true))
 				return -EINVAL;
 			break;
-		case NFP_NET_CFG_TLV_TYPE_SP_INDIFF:
-			if (length) {
-				dev_err(dev, "Unexpected len of SP_INDIFF TLV:%u\n", length);
-				return -EINVAL;
-			}
-
-			caps->sp_indiff = true;
-			break;
 		default:
 			if (!FIELD_GET(NFP_NET_CFG_TLV_HEADER_REQUIRED, hdr))
 				break;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
index 1d53f721a1c8..6714d5e8fdab 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h
@@ -14,6 +14,9 @@
 
 #include <linux/types.h>
 
+/* 64-bit per app capabilities */
+#define NFP_NET_APP_CAP_SP_INDIFF	BIT_ULL(0) /* indifferent to port speed */
+
 /* Configuration BAR size.
  *
  * The configuration BAR is 8K in size, but due to
@@ -492,10 +495,6 @@
  * %NFP_NET_CFG_TLV_TYPE_CRYPTO_OPS_RX_SCAN:
  * Same as %NFP_NET_CFG_TLV_TYPE_CRYPTO_OPS, but crypto TLS does stream scan
  * RX sync, rather than kernel-assisted sync.
- *
- * %NFP_NET_CFG_TLV_TYPE_SP_INDIFF:
- * Empty, indicate the firmware is indifferent to port speed. Then no need to
- * reload driver and firmware when port speed is changed.
  */
 #define NFP_NET_CFG_TLV_TYPE_UNKNOWN		0
 #define NFP_NET_CFG_TLV_TYPE_RESERVED		1
@@ -509,7 +508,6 @@
 #define NFP_NET_CFG_TLV_TYPE_CRYPTO_OPS		11 /* see crypto/fw.h */
 #define NFP_NET_CFG_TLV_TYPE_VNIC_STATS		12
 #define NFP_NET_CFG_TLV_TYPE_CRYPTO_OPS_RX_SCAN	13
-#define NFP_NET_CFG_TLV_TYPE_SP_INDIFF		14
 
 struct device;
 
@@ -524,7 +522,6 @@ struct device;
  * @vnic_stats_off:	offset of vNIC stats area
  * @vnic_stats_cnt:	number of vNIC stats
  * @tls_resync_ss:	TLS resync will be performed via stream scan
- * @sp_indiff:		Firmware is indifferent to port speed
  */
 struct nfp_net_tlv_caps {
 	u32 me_freq_mhz;
@@ -537,7 +534,6 @@ struct nfp_net_tlv_caps {
 	unsigned int vnic_stats_off;
 	unsigned int vnic_stats_cnt;
 	unsigned int tls_resync_ss:1;
-	unsigned int sp_indiff:1;
 };
 
 int nfp_net_tlv_caps_parse(struct device *dev, u8 __iomem *ctrl_mem,
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
index f3852ba8099a..3bae92dc899e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c
@@ -77,12 +77,6 @@ static int nfp_net_pf_get_num_ports(struct nfp_pf *pf)
 	return nfp_pf_rtsym_read_optional(pf, "nfd_cfg_pf%u_num_ports", 1);
 }
 
-static int nfp_net_pf_get_app_id(struct nfp_pf *pf)
-{
-	return nfp_pf_rtsym_read_optional(pf, "_pf%u_net_app_id",
-					  NFP_APP_CORE_NIC);
-}
-
 static void nfp_net_pf_free_vnic(struct nfp_pf *pf, struct nfp_net *nn)
 {
 	if (nfp_net_is_data_vnic(nn))
@@ -206,7 +200,6 @@ nfp_net_pf_alloc_vnics(struct nfp_pf *pf, void __iomem *ctrl_bar,
 			nn->port->link_cb = nfp_net_refresh_port_table;
 
 		ctrl_bar += NFP_PF_CSR_SLICE_SIZE;
-		pf->sp_indiff |= nn->tlv_caps.sp_indiff;
 
 		/* Kill the vNIC if app init marked it as invalid */
 		if (nn->port && nn->port->type == NFP_PORT_INVALID)
@@ -308,36 +301,6 @@ err_prev_deinit:
 	return err;
 }
 
-static int nfp_net_pf_cfg_nsp(struct nfp_pf *pf, bool sp_indiff)
-{
-	struct nfp_nsp *nsp;
-	char hwinfo[32];
-	int err;
-
-	nsp = nfp_nsp_open(pf->cpp);
-	if (IS_ERR(nsp))
-		return PTR_ERR(nsp);
-
-	snprintf(hwinfo, sizeof(hwinfo), "sp_indiff=%d", sp_indiff);
-	err = nfp_nsp_hwinfo_set(nsp, hwinfo, sizeof(hwinfo));
-	/* Not a fatal error, no need to return error to stop driver from loading */
-	if (err)
-		nfp_warn(pf->cpp, "HWinfo(sp_indiff=%d) set failed: %d\n", sp_indiff, err);
-
-	nfp_nsp_close(nsp);
-	return 0;
-}
-
-static int nfp_net_pf_init_nsp(struct nfp_pf *pf)
-{
-	return nfp_net_pf_cfg_nsp(pf, pf->sp_indiff);
-}
-
-static void nfp_net_pf_clean_nsp(struct nfp_pf *pf)
-{
-	(void)nfp_net_pf_cfg_nsp(pf, false);
-}
-
 static int
 nfp_net_pf_app_init(struct nfp_pf *pf, u8 __iomem *qc_bar, unsigned int stride)
 {
@@ -349,8 +312,6 @@ nfp_net_pf_app_init(struct nfp_pf *pf, u8 __iomem *qc_bar, unsigned int stride)
 	if (IS_ERR(pf->app))
 		return PTR_ERR(pf->app);
 
-	pf->sp_indiff |= pf->app->type->id == NFP_APP_FLOWER_NIC;
-
 	devl_lock(devlink);
 	err = nfp_app_init(pf->app);
 	devl_unlock(devlink);
@@ -813,13 +774,9 @@ int nfp_net_pci_probe(struct nfp_pf *pf)
 	if (err)
 		goto err_clean_ddir;
 
-	err = nfp_net_pf_init_nsp(pf);
-	if (err)
-		goto err_free_vnics;
-
 	err = nfp_net_pf_alloc_irqs(pf);
 	if (err)
-		goto err_clean_nsp;
+		goto err_free_vnics;
 
 	err = nfp_net_pf_app_start(pf);
 	if (err)
@@ -838,8 +795,6 @@ err_stop_app:
 	nfp_net_pf_app_stop(pf);
 err_free_irqs:
 	nfp_net_pf_free_irqs(pf);
-err_clean_nsp:
-	nfp_net_pf_clean_nsp(pf);
 err_free_vnics:
 	nfp_net_pf_free_vnics(pf);
 err_clean_ddir:
@@ -870,7 +825,6 @@ void nfp_net_pci_remove(struct nfp_pf *pf)
 		nfp_net_pf_free_vnic(pf, nn);
 	}
 
-	nfp_net_pf_clean_nsp(pf);
 	nfp_net_pf_app_stop(pf);
 	/* stop app first, to avoid double free of ctrl vNIC's ddir */
 	nfp_net_debugfs_dir_clean(&pf->ddir);
-- 
cgit v1.2.3


From 8d545385bf267e071fee1b6d00d5639cd46aae13 Mon Sep 17 00:00:00 2001
From: Yinjun Zhang <yinjun.zhang@corigine.com>
Date: Thu, 29 Sep 2022 10:58:31 +0200
Subject: nfp: add support for link auto negotiation

Report the auto negotiation capability if it's supported
in management firmware, and advertise it if it's enabled.
Changing port speed is not allowed when autoneg is enabled.

The ethtool <intf> command displays the auto-neg capability:

  # ethtool enp1s0np0
  Settings for enp1s0np0:
          Supported ports: [ FIBRE ]
          Supported link modes:   Not reported
          Supported pause frame use: Symmetric
          Supports auto-negotiation: Yes
          Supported FEC modes: None        RS      BASER
          Advertised link modes:  Not reported
          Advertised pause frame use: Symmetric
          Advertised auto-negotiation: Yes
          Advertised FEC modes: None       RS      BASER
          Speed: 25000Mb/s
          Duplex: Full
          Auto-negotiation: on
          Port: FIBRE
          PHYAD: 0
          Transceiver: internal
          Link detected: yes

Signed-off-by: Yinjun Zhang <yinjun.zhang@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/netronome/nfp/nfp_main.c      |  9 +++++++-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   | 26 ++++++++++++++++++----
 .../net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h   |  1 +
 .../ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c   |  2 ++
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c
index 91063f19c97d..e66e548919d4 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_main.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c
@@ -729,8 +729,15 @@ static int nfp_pf_cfg_hwinfo(struct nfp_pf *pf, bool sp_indiff)
 	snprintf(hwinfo, sizeof(hwinfo), "sp_indiff=%d", sp_indiff);
 	err = nfp_nsp_hwinfo_set(nsp, hwinfo, sizeof(hwinfo));
 	/* Not a fatal error, no need to return error to stop driver from loading */
-	if (err)
+	if (err) {
 		nfp_warn(pf->cpp, "HWinfo(sp_indiff=%d) set failed: %d\n", sp_indiff, err);
+	} else {
+		/* Need reinit eth_tbl since the eth table state may change
+		 * after sp_indiff is configured.
+		 */
+		kfree(pf->eth_tbl);
+		pf->eth_tbl = __nfp_eth_read_ports(pf->cpp, nsp);
+	}
 
 	nfp_nsp_close(nsp);
 	return 0;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index d50af23642a2..678cea0fd274 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -290,8 +290,13 @@ nfp_net_get_link_ksettings(struct net_device *netdev,
 	if (eth_port) {
 		ethtool_link_ksettings_add_link_mode(cmd, supported, Pause);
 		ethtool_link_ksettings_add_link_mode(cmd, advertising, Pause);
-		cmd->base.autoneg = eth_port->aneg != NFP_ANEG_DISABLED ?
-			AUTONEG_ENABLE : AUTONEG_DISABLE;
+		if (eth_port->supp_aneg) {
+			ethtool_link_ksettings_add_link_mode(cmd, supported, Autoneg);
+			if (eth_port->aneg == NFP_ANEG_AUTO) {
+				ethtool_link_ksettings_add_link_mode(cmd, advertising, Autoneg);
+				cmd->base.autoneg = AUTONEG_ENABLE;
+			}
+		}
 		nfp_net_set_fec_link_mode(eth_port, cmd);
 	}
 
@@ -327,6 +332,7 @@ static int
 nfp_net_set_link_ksettings(struct net_device *netdev,
 			   const struct ethtool_link_ksettings *cmd)
 {
+	bool req_aneg = (cmd->base.autoneg == AUTONEG_ENABLE);
 	struct nfp_eth_table_port *eth_port;
 	struct nfp_port *port;
 	struct nfp_nsp *nsp;
@@ -346,13 +352,25 @@ nfp_net_set_link_ksettings(struct net_device *netdev,
 	if (IS_ERR(nsp))
 		return PTR_ERR(nsp);
 
-	err = __nfp_eth_set_aneg(nsp, cmd->base.autoneg == AUTONEG_ENABLE ?
-				 NFP_ANEG_AUTO : NFP_ANEG_DISABLED);
+	if (req_aneg && !eth_port->supp_aneg) {
+		netdev_warn(netdev, "Autoneg is not supported.\n");
+		err = -EOPNOTSUPP;
+		goto err_bad_set;
+	}
+
+	err = __nfp_eth_set_aneg(nsp, req_aneg ? NFP_ANEG_AUTO : NFP_ANEG_DISABLED);
 	if (err)
 		goto err_bad_set;
+
 	if (cmd->base.speed != SPEED_UNKNOWN) {
 		u32 speed = cmd->base.speed / eth_port->lanes;
 
+		if (req_aneg) {
+			netdev_err(netdev, "Speed changing is not allowed when working on autoneg mode.\n");
+			err = -EINVAL;
+			goto err_bad_set;
+		}
+
 		err = __nfp_eth_set_speed(nsp, speed);
 		if (err)
 			goto err_bad_set;
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
index 52465670a01e..992d72ac98d3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp.h
@@ -174,6 +174,7 @@ struct nfp_eth_table {
 		bool enabled;
 		bool tx_enabled;
 		bool rx_enabled;
+		bool supp_aneg;
 
 		bool override_changed;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
index 18ba7629cdc2..bb64efec4c46 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nsp_eth.c
@@ -27,6 +27,7 @@
 #define NSP_ETH_PORT_PHYLABEL		GENMASK_ULL(59, 54)
 #define NSP_ETH_PORT_FEC_SUPP_BASER	BIT_ULL(60)
 #define NSP_ETH_PORT_FEC_SUPP_RS	BIT_ULL(61)
+#define NSP_ETH_PORT_SUPP_ANEG		BIT_ULL(63)
 
 #define NSP_ETH_PORT_LANES_MASK		cpu_to_le64(NSP_ETH_PORT_LANES)
 
@@ -178,6 +179,7 @@ nfp_eth_port_translate(struct nfp_nsp *nsp, const union eth_table_entry *src,
 		return;
 
 	dst->act_fec = FIELD_GET(NSP_ETH_STATE_ACT_FEC, state);
+	dst->supp_aneg = FIELD_GET(NSP_ETH_PORT_SUPP_ANEG, port);
 }
 
 static void
-- 
cgit v1.2.3


From 2820a400dfd3579af6db41b6bd5f5114b8749cae Mon Sep 17 00:00:00 2001
From: Fei Qin <fei.qin@corigine.com>
Date: Thu, 29 Sep 2022 10:58:32 +0200
Subject: nfp: add support restart of link auto-negotiation

Add support restart of link auto-negotiation.
This may be initiated using:

  # ethtool -r <intf>

Signed-off-by: Fei Qin <fei.qin@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   | 33 ++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 678cea0fd274..22a5d2419084 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -228,6 +228,37 @@ nfp_net_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo)
 	nfp_get_drvinfo(nn->app, nn->pdev, vnic_version, drvinfo);
 }
 
+static int
+nfp_net_nway_reset(struct net_device *netdev)
+{
+	struct nfp_eth_table_port *eth_port;
+	struct nfp_port *port;
+	int err;
+
+	port = nfp_port_from_netdev(netdev);
+	eth_port = nfp_port_get_eth_port(port);
+	if (!eth_port)
+		return -EOPNOTSUPP;
+
+	if (!netif_running(netdev))
+		return 0;
+
+	err = nfp_eth_set_configured(port->app->cpp, eth_port->index, false);
+	if (err) {
+		netdev_info(netdev, "Link down failed: %d\n", err);
+		return err;
+	}
+
+	err = nfp_eth_set_configured(port->app->cpp, eth_port->index, true);
+	if (err) {
+		netdev_info(netdev, "Link up failed: %d\n", err);
+		return err;
+	}
+
+	netdev_info(netdev, "Link reset succeeded\n");
+	return 0;
+}
+
 static void
 nfp_app_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo)
 {
@@ -1841,6 +1872,7 @@ static const struct ethtool_ops nfp_net_ethtool_ops = {
 				     ETHTOOL_COALESCE_MAX_FRAMES |
 				     ETHTOOL_COALESCE_USE_ADAPTIVE,
 	.get_drvinfo		= nfp_net_get_drvinfo,
+	.nway_reset             = nfp_net_nway_reset,
 	.get_link		= ethtool_op_get_link,
 	.get_ringparam		= nfp_net_get_ringparam,
 	.set_ringparam		= nfp_net_set_ringparam,
@@ -1878,6 +1910,7 @@ static const struct ethtool_ops nfp_net_ethtool_ops = {
 
 const struct ethtool_ops nfp_port_ethtool_ops = {
 	.get_drvinfo		= nfp_app_get_drvinfo,
+	.nway_reset             = nfp_net_nway_reset,
 	.get_link		= ethtool_op_get_link,
 	.get_strings		= nfp_port_get_strings,
 	.get_ethtool_stats	= nfp_port_get_stats,
-- 
cgit v1.2.3


From ae3ed15da5889263de372ff9df2e83e16acca4cb Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Fri, 30 Sep 2022 01:56:53 +0100
Subject: net: ethernet: mtk_eth_soc: fix state in __mtk_foe_entry_clear

Setting ib1 state to MTK_FOE_STATE_UNBIND in __mtk_foe_entry_clear
routine as done by commit 0e80707d94e4c8 ("net: ethernet: mtk_eth_soc:
fix typo in __mtk_foe_entry_clear") breaks flow offloading, at least
on older MTK_NETSYS_V1 SoCs, OpenWrt users have confirmed the bug on
MT7622 and MT7621 systems.
Felix Fietkau suggested to use MTK_FOE_STATE_INVALID instead which
works well on both, MTK_NETSYS_V1 and MTK_NETSYS_V2.

Tested on MT7622 (Linksys E8450) and MT7986 (BananaPi BPI-R3).

Suggested-by: Felix Fietkau <nbd@nbd.name>
Fixes: 0e80707d94e4c8 ("net: ethernet: mtk_eth_soc: fix typo in __mtk_foe_entry_clear")
Fixes: 33fc42de33278b ("net: ethernet: mtk_eth_soc: support creating mac address based offload entries")
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://lore.kernel.org/r/YzY+1Yg0FBXcnrtc@makrotopia.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mediatek/mtk_ppe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_ppe.c b/drivers/net/ethernet/mediatek/mtk_ppe.c
index cfe804bc8d20..148ea636ef97 100644
--- a/drivers/net/ethernet/mediatek/mtk_ppe.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.c
@@ -412,7 +412,7 @@ __mtk_foe_entry_clear(struct mtk_ppe *ppe, struct mtk_flow_entry *entry)
 	if (entry->hash != 0xffff) {
 		ppe->foe_table[entry->hash].ib1 &= ~MTK_FOE_IB1_STATE;
 		ppe->foe_table[entry->hash].ib1 |= FIELD_PREP(MTK_FOE_IB1_STATE,
-							      MTK_FOE_STATE_UNBIND);
+							      MTK_FOE_STATE_INVALID);
 		dma_wmb();
 	}
 	entry->hash = 0xffff;
-- 
cgit v1.2.3


From 2120635108b35ecad9c59c8b44f6cbdf4f98214e Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Fri, 30 Sep 2022 20:33:10 +0000
Subject: Makefile.extrawarn: Move -Wcast-function-type-strict to W=1

We enable -Wcast-function-type globally in the kernel to warn about
mismatching types in function pointer casts. Compilers currently
warn only about ABI incompability with this flag, but Clang 16 will
enable a stricter version of the check by default that checks for an
exact type match. This will be very noisy in the kernel, so disable
-Wcast-function-type-strict without W=1 until the new warnings have
been addressed.

Cc: stable@vger.kernel.org
Link: https://reviews.llvm.org/D134831
Link: https://github.com/ClangBuiltLinux/linux/issues/1724
Suggested-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220930203310.4010564-1-samitolvanen@google.com
---
 scripts/Makefile.extrawarn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn
index 0621c39a3955..20df48637373 100644
--- a/scripts/Makefile.extrawarn
+++ b/scripts/Makefile.extrawarn
@@ -52,6 +52,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
 KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast)
 KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare
 KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access)
+KBUILD_CFLAGS += $(call cc-disable-warning, cast-function-type-strict)
 endif
 
 endif
-- 
cgit v1.2.3


From b502a6fb46d275aa978c1e0655bada2cafc81fea Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Sat, 1 Oct 2022 08:49:45 -0700
Subject: bpf, docs: Delete misformatted table.

Delete misformatted table.

Fixes: 6166da0a02cd ("bpf, docs: Move legacy packet instructions to a separate file")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/linux-notes.rst | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/Documentation/bpf/linux-notes.rst b/Documentation/bpf/linux-notes.rst
index 1c31379b469f..956b0c86699d 100644
--- a/Documentation/bpf/linux-notes.rst
+++ b/Documentation/bpf/linux-notes.rst
@@ -51,20 +51,3 @@ where ``ntohl()`` converts a 32-bit value from network byte order to host byte o
 ``BPF_IND | BPF_W | BPF_LD`` (0x40) means::
 
   R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm))
-
-Appendix
-========
-
-For reference, the following table lists legacy Linux-specific opcodes in order by value.
-
-======  ====  ===================================================  =============
-opcode  imm   description                                          reference
-======  ====  ===================================================  =============
-0x20    any   dst = ntohl(\*(uint32_t \*)(R6->data + imm))         `Legacy BPF Packet access instructions`_
-0x28    any   dst = ntohs(\*(uint16_t \*)(R6->data + imm))         `Legacy BPF Packet access instructions`_
-0x30    any   dst = (\*(uint8_t \*)(R6->data + imm))               `Legacy BPF Packet access instructions`_
-0x38    any   dst = ntohll(\*(uint64_t \*)(R6->data + imm))        `Legacy BPF Packet access instructions`_
-0x40    any   dst = ntohl(\*(uint32_t \*)(R6->data + src + imm))   `Legacy BPF Packet access instructions`_
-0x48    any   dst = ntohs(\*(uint16_t \*)(R6->data + src + imm))   `Legacy BPF Packet access instructions`_
-0x50    any   dst = \*(uint8_t \*)(R6->data + src + imm))          `Legacy BPF Packet access instructions`_
-0x58    any   dst = ntohll(\*(uint64_t \*)(R6->data + src + imm))  `Legacy BPF Packet access instructions`_
-- 
cgit v1.2.3


From 056a68cea01edfa78b3474af1bfa39cc6bcc7bee Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Fri, 30 Sep 2022 16:01:38 +0200
Subject: mips: allow firmware to pass RNG seed to kernel

Nearly all other firmware environments have some way of passing a RNG
seed to initialize the RNG: DTB's rng-seed, EFI's RNG protocol, m68k's
bootinfo block, x86's setup_data, and so forth. This adds something
similar for MIPS, which will allow various firmware environments,
bootloaders, and hypervisors to pass an RNG seed to initialize the
kernel's RNG.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/setup.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index 2ca156a5b231..39c79f67c7a3 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -42,6 +42,7 @@
 #include <asm/setup.h>
 #include <asm/smp-ops.h>
 #include <asm/prom.h>
+#include <asm/fw/fw.h>
 
 #ifdef CONFIG_MIPS_ELF_APPENDED_DTB
 char __section(".appended_dtb") __appended_dtb[0x100000];
@@ -756,6 +757,24 @@ static void __init prefill_possible_map(void)
 static inline void prefill_possible_map(void) {}
 #endif
 
+static void __init setup_rng_seed(void)
+{
+	char *rng_seed_hex = fw_getenv("rngseed");
+	u8 rng_seed[512];
+	size_t len;
+
+	if (!rng_seed_hex)
+		return;
+
+	len = min(sizeof(rng_seed), strlen(rng_seed_hex) / 2);
+	if (hex2bin(rng_seed, rng_seed_hex, len))
+		return;
+
+	add_bootloader_randomness(rng_seed, len);
+	memzero_explicit(rng_seed, len);
+	memzero_explicit(rng_seed_hex, len * 2);
+}
+
 void __init setup_arch(char **cmdline_p)
 {
 	cpu_probe();
@@ -786,6 +805,8 @@ void __init setup_arch(char **cmdline_p)
 	paging_init();
 
 	memblock_dump_all();
+
+	setup_rng_seed();
 }
 
 unsigned long kernelsp[NR_CPUS];
-- 
cgit v1.2.3


From 90c2d2eb7ab5848c4f853751f12b96bdc460ad1b Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Fri, 30 Sep 2022 08:57:18 -0700
Subject: MIPS: pci: lantiq: switch to using gpiod API

This patch switches the driver from legacy gpio API to the newer
gpiod API.

Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/pci/pci-lantiq.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c
index 1ca42f482130..8d16cd021f60 100644
--- a/arch/mips/pci/pci-lantiq.c
+++ b/arch/mips/pci/pci-lantiq.c
@@ -9,11 +9,11 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/gpio/consumer.h>
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/clk.h>
 #include <linux/of_platform.h>
-#include <linux/of_gpio.h>
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
 
@@ -62,7 +62,7 @@
 __iomem void *ltq_pci_mapped_cfg;
 static __iomem void *ltq_pci_membase;
 
-static int reset_gpio;
+static struct gpio_desc *reset_gpio;
 static struct clk *clk_pci, *clk_external;
 static struct resource pci_io_resource;
 static struct resource pci_mem_resource;
@@ -95,6 +95,7 @@ static int ltq_pci_startup(struct platform_device *pdev)
 	struct device_node *node = pdev->dev.of_node;
 	const __be32 *req_mask, *bus_clk;
 	u32 temp_buffer;
+	int error;
 
 	/* get our clocks */
 	clk_pci = clk_get(&pdev->dev, NULL);
@@ -123,17 +124,14 @@ static int ltq_pci_startup(struct platform_device *pdev)
 		clk_disable(clk_external);
 
 	/* setup reset gpio used by pci */
-	reset_gpio = of_get_named_gpio(node, "gpio-reset", 0);
-	if (gpio_is_valid(reset_gpio)) {
-		int ret = devm_gpio_request(&pdev->dev,
-						reset_gpio, "pci-reset");
-		if (ret) {
-			dev_err(&pdev->dev,
-				"failed to request gpio %d\n", reset_gpio);
-			return ret;
-		}
-		gpio_direction_output(reset_gpio, 1);
+	reset_gpio = devm_gpiod_get_optional(&pdev->dev, "reset",
+					     GPIOD_OUT_LOW);
+	error = PTR_ERR_OR_ZERO(reset_gpio);
+	if (error) {
+		dev_err(&pdev->dev, "failed to request gpio: %d\n", error);
+		return error;
 	}
+	gpiod_set_consumer_name(reset_gpio, "pci_reset");
 
 	/* enable auto-switching between PCI and EBU */
 	ltq_pci_w32(0xa, PCI_CR_CLK_CTRL);
@@ -195,11 +193,11 @@ static int ltq_pci_startup(struct platform_device *pdev)
 	ltq_ebu_w32(ltq_ebu_r32(LTQ_EBU_PCC_IEN) | 0x10, LTQ_EBU_PCC_IEN);
 
 	/* toggle reset pin */
-	if (gpio_is_valid(reset_gpio)) {
-		__gpio_set_value(reset_gpio, 0);
+	if (reset_gpio) {
+		gpiod_set_value_cansleep(reset_gpio, 1);
 		wmb();
 		mdelay(1);
-		__gpio_set_value(reset_gpio, 1);
+		gpiod_set_value_cansleep(reset_gpio, 0);
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From d54d7194ba48e074d4d51580753d3a396b82aecf Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:48 -0700
Subject: net/mlx5e: xsk: Use mlx5e_trigger_napi_icosq for XSK wakeup

mlx5e_xsk_wakeup triggers an IRQ by posting a NOP to async_icosq, taking
a spinlock to protect from concurrent access. There is already a
function that does the same: mlx5e_trigger_napi_icosq. Use this function
in mlx5e_xsk_wakeup.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 4902ef74fedf..1203d7d5f9bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -36,9 +36,7 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 		if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state))
 			return 0;
 
-		spin_lock_bh(&c->async_icosq_lock);
-		mlx5e_trigger_irq(&c->async_icosq);
-		spin_unlock_bh(&c->async_icosq_lock);
+		mlx5e_trigger_napi_icosq(c);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 8cbcafcee1910ece54990f9aebae78fcbdb93913 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:49 -0700
Subject: net/mlx5e: xsk: Drop the check for XSK state in mlx5e_xsk_wakeup

The MLX5E_CHANNEL_STATE_XSK flag checked in mlx5e_xsk_wakeup indicates
that XSK queues are open, but not necessarily activated. This check is
not very useful, because:

0. Both XSK setup and netdev state transitions take the same state_lock
mutex, so they can't happen at the same time.

1. If the netdev is up, xsk_is_bound can return true only when
MLX5E_CHANNEL_STATE_XSK is set on the corresponding channel.
mlx5e_xsk_wakeup is only called when xsk_is_bound is true.

2. If the XSK socket is bound, and the netdev is going up or down,
mlx5e_xsk_wakeup can take one of two branches, depending on the return
value of napi_if_scheduled_mark_missed:

2.1. True means one of two things: either NAPI was enabled at this
point, which means MLX5E_CHANNEL_STATE_XSK was also set; or NAPI was
disabled, and nothing really happened.

2.2. False means that NAPI was enabled by this point, which also implies
MLX5E_CHANNEL_STATE_XSK was set. Additionally, mlx5e_xsk_wakeup contains
a following check for MLX5E_SQ_STATE_ENABLED on async_icosq, and this
flag implies MLX5E_CHANNEL_STATE_XSK too on XSK channels.

As checking this flag doesn't cut any flows, remove the check.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c    | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
index 5129b9bf534f..d7dfc7d2c058 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -154,7 +154,7 @@ err_free_cparam:
 void mlx5e_close_xsk(struct mlx5e_channel *c)
 {
 	clear_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
-	synchronize_net(); /* Sync with the XSK wakeup and with NAPI. */
+	synchronize_net(); /* Sync with NAPI. */
 
 	mlx5e_close_rq(&c->xskrq);
 	mlx5e_close_cq(&c->xskrq.cq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index 1203d7d5f9bd..c856fc3f197e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -22,9 +22,6 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 
 	c = priv->channels.c[ix];
 
-	if (unlikely(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)))
-		return -EINVAL;
-
 	if (!napi_if_scheduled_mark_missed(&c->napi)) {
 		/* To avoid WQE overrun, don't post a NOP if async_icosq is not
 		 * active and not polled by NAPI. Return 0, because the upcoming
-- 
cgit v1.2.3


From a064c609849bf71adc7484b030539568cd2a5155 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:50 -0700
Subject: net/mlx5e: Introduce wqe_index_mask for legacy RQ

When fragments of different WQEs share the same page, mlx5e_post_rx_wqes
must wait until the old WQE stops using the page, only then the new WQE
can allocate the new page. Essentially, it means that if WQE index i is
still in use, the allocation must stop before `i % bulk`, where bulk is
the number of WQEs that may share the same page.

As bulk is always a power of two, `i % bulk = i & (bulk - 1)`, and the
new wqe_index_mask field will be equal to `bulk - 1`.

At the same time, wqe_bulk remains for optimization purposes and stores
`max(bulk, 8)`, which allows to skip the allocation until we have at
least 8 WQEs free.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  1 +
 .../net/ethernet/mellanox/mlx5/core/en/params.c    | 25 ++++++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 95a232fb2127..8e174a7f7c25 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -660,6 +660,7 @@ struct mlx5e_rq_frags_info {
 	u8 num_frags;
 	u8 log_num_frags;
 	u8 wqe_bulk;
+	u8 wqe_index_mask;
 };
 
 struct mlx5e_dma_info {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 68bc66cbd8a5..49306a68b3b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -586,7 +586,14 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
 		info->arr[0].frag_size = byte_count;
 		info->arr[0].frag_stride = frag_stride;
 		info->num_frags = 1;
-		info->wqe_bulk = PAGE_SIZE / frag_stride;
+
+		/* N WQEs share the same page, N = PAGE_SIZE / frag_stride. The
+		 * first WQE in the page is responsible for allocation of this
+		 * page, this WQE's index is k*N. If WQEs [k*N+1; k*N+N-1] are
+		 * still not completed, the allocation must stop before k*N.
+		 */
+		info->wqe_index_mask = (PAGE_SIZE / frag_stride) - 1;
+
 		goto out;
 	}
 
@@ -635,11 +642,21 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
 		i++;
 	}
 	info->num_frags = i;
-	/* number of different wqes sharing a page */
-	info->wqe_bulk = 1 + (info->num_frags % 2);
+
+	/* The last fragment of WQE with index 2*N may share the page with the
+	 * first fragment of WQE with index 2*N+1 in certain cases. If WQE 2*N+1
+	 * is not completed yet, WQE 2*N must not be allocated, as it's
+	 * responsible for allocating a new page.
+	 */
+	info->wqe_index_mask = info->num_frags % 2;
 
 out:
-	info->wqe_bulk = max_t(u8, info->wqe_bulk, 8);
+	/* Bulking optimization to skip allocation until at least 8 WQEs can be
+	 * allocated in a row. At the same time, never start allocation when
+	 * the page is still used by older WQEs.
+	 */
+	info->wqe_bulk = max_t(u8, info->wqe_index_mask + 1, 8);
+
 	info->log_num_frags = order_base_2(info->num_frags);
 
 	return 0;
-- 
cgit v1.2.3


From 5758c3145b88aa9d0919681a9d3edf39353f1dbd Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:51 -0700
Subject: net/mlx5e: Make the wqe_index_mask calculation more exact

The old calculation of wqe_index_mask may give false positives, i.e.
request bulking of pairs of WQEs when not strictly needed, for example,
when the first fragment size is equal to the PAGE_SIZE, bulking is not
needed, even if the number of fragments is odd.

Make the calculation more exact to cut false positives.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 49306a68b3b5..ac4d70bb21e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -648,7 +648,26 @@ static int mlx5e_build_rq_frags_info(struct mlx5_core_dev *mdev,
 	 * is not completed yet, WQE 2*N must not be allocated, as it's
 	 * responsible for allocating a new page.
 	 */
-	info->wqe_index_mask = info->num_frags % 2;
+	if (frag_size_max == PAGE_SIZE) {
+		/* No WQE can start in the middle of a page. */
+		info->wqe_index_mask = 0;
+	} else {
+		/* PAGE_SIZEs starting from 8192 don't use 2K-sized fragments,
+		 * because there would be more than MLX5E_MAX_RX_FRAGS of them.
+		 */
+		WARN_ON(PAGE_SIZE != 2 * DEFAULT_FRAG_SIZE);
+
+		/* Odd number of fragments allows to pack the last fragment of
+		 * the previous WQE and the first fragment of the next WQE into
+		 * the same page.
+		 * As long as DEFAULT_FRAG_SIZE is 2048, and MLX5E_MAX_RX_FRAGS
+		 * is 4, the last fragment can be bigger than the rest only if
+		 * it's the fourth one, so WQEs consisting of 3 fragments will
+		 * always share a page.
+		 * When a page is shared, WQE bulk size is 2, otherwise just 1.
+		 */
+		info->wqe_index_mask = info->num_frags % 2;
+	}
 
 out:
 	/* Bulking optimization to skip allocation until at least 8 WQEs can be
-- 
cgit v1.2.3


From 42847fed55523bebb712bfd7e2c4616db00c3aef Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:52 -0700
Subject: net/mlx5e: Use partial batches in legacy RQ

Legacy RQ allocates WQEs in batches. If the batch allocation fails, the
pages of the allocated part are released. This commit changes this
behavior to allow to use the pages that have been already allocated.

After this change, we need to be careful about indexing rq->wqe.frags[].
The WQ size is a power of two that divides by wqe_bulk (8), and the old
code used whole bulks, which allowed to use indices [8*K; 8*K+7] without
overflowing. Now that the bulks may be partial, the range can start at
any location (not only at 8*K), so we need to wrap them around to avoid
out-of-bounds array access.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 39 +++++++++++++------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 72d74de3ee99..ffca217b7d7e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -427,7 +427,6 @@ static void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
-	int err;
 	int i;
 
 	if (rq->xsk_pool) {
@@ -442,20 +441,16 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 	}
 
 	for (i = 0; i < wqe_bulk; i++) {
-		struct mlx5e_rx_wqe_cyc *wqe = mlx5_wq_cyc_get_wqe(wq, ix + i);
+		int j = mlx5_wq_cyc_ctr2ix(wq, ix + i);
+		struct mlx5e_rx_wqe_cyc *wqe;
 
-		err = mlx5e_alloc_rx_wqe(rq, wqe, ix + i);
-		if (unlikely(err))
-			goto free_wqes;
-	}
+		wqe = mlx5_wq_cyc_get_wqe(wq, j);
 
-	return 0;
-
-free_wqes:
-	while (--i >= 0)
-		mlx5e_dealloc_rx_wqe(rq, ix + i);
+		if (unlikely(mlx5e_alloc_rx_wqe(rq, wqe, j)))
+			break;
+	}
 
-	return err;
+	return i;
 }
 
 static inline void
@@ -821,8 +816,8 @@ static void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	bool busy = false;
 	u8 wqe_bulk;
-	int err;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
@@ -837,14 +832,22 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 
 	do {
 		u16 head = mlx5_wq_cyc_get_head(wq);
+		int count;
+		u8 bulk;
 
-		err = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
-		if (unlikely(err)) {
+		/* Don't allow any newly allocated WQEs to share the same page
+		 * with old WQEs that aren't completed yet. Stop earlier.
+		 */
+		bulk = wqe_bulk - ((head + wqe_bulk) & rq->wqe.info.wqe_index_mask);
+
+		count = mlx5e_alloc_rx_wqes(rq, head, bulk);
+		if (likely(count > 0))
+			mlx5_wq_cyc_push_n(wq, count);
+		if (unlikely(count != bulk)) {
 			rq->stats->buff_alloc_err++;
+			busy = true;
 			break;
 		}
-
-		mlx5_wq_cyc_push_n(wq, wqe_bulk);
 	} while (mlx5_wq_cyc_missing(wq) >= wqe_bulk);
 
 	/* ensure wqes are visible to device before updating doorbell record */
@@ -852,7 +855,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 
 	mlx5_wq_cyc_update_db_record(wq);
 
-	return !!err;
+	return busy;
 }
 
 void mlx5e_free_icosq_descs(struct mlx5e_icosq *sq)
-- 
cgit v1.2.3


From 3f5fe0b2e606ab71d3425c138e311bce60b09543 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:53 -0700
Subject: net/mlx5e: xsk: Use partial batches in legacy RQ with XSK

The previous commit allowed allocating WQE batches in legacy RQ
partially, however, XSK still checks whether there are enough frames in
the fill ring. Remove this check to allow to allocate batches partially
also with XSK.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index ffca217b7d7e..80f2b5960782 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -429,17 +429,6 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	int i;
 
-	if (rq->xsk_pool) {
-		int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
-
-		/* Check in advance that we have enough frames, instead of
-		 * allocating one-by-one, failing and moving frames to the
-		 * Reuse Ring.
-		 */
-		if (unlikely(!xsk_buff_can_alloc(rq->xsk_pool, pages_desired)))
-			return -ENOMEM;
-	}
-
 	for (i = 0; i < wqe_bulk; i++) {
 		int j = mlx5_wq_cyc_ctr2ix(wq, ix + i);
 		struct mlx5e_rx_wqe_cyc *wqe;
@@ -841,8 +830,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 		bulk = wqe_bulk - ((head + wqe_bulk) & rq->wqe.info.wqe_index_mask);
 
 		count = mlx5e_alloc_rx_wqes(rq, head, bulk);
-		if (likely(count > 0))
-			mlx5_wq_cyc_push_n(wq, count);
+		mlx5_wq_cyc_push_n(wq, count);
 		if (unlikely(count != bulk)) {
 			rq->stats->buff_alloc_err++;
 			busy = true;
-- 
cgit v1.2.3


From 0b482232374528b62ef978241f8efc548dce7edb Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:54 -0700
Subject: net/mlx5e: Remove the outer loop when allocating legacy RQ WQEs

Legacy RQ WQEs are allocated in a loop in small batches (8 WQEs). As
partial batches are allowed, there is no point to have a loop in a loop,
so the outer loop is removed, and the batch size is increased up to the
total number of WQEs to allocate, still not smaller than 8.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 37 +++++++++++--------------
 drivers/net/ethernet/mellanox/mlx5/core/wq.h    |  2 +-
 2 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 80f2b5960782..d620c1ed9b80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -424,7 +424,7 @@ static void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
 	mlx5e_free_rx_wqe(rq, wi, false);
 }
 
-static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
+static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
 	int i;
@@ -805,38 +805,33 @@ static void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	int wqe_bulk, count;
 	bool busy = false;
-	u8 wqe_bulk;
+	u16 head;
 
 	if (unlikely(!test_bit(MLX5E_RQ_STATE_ENABLED, &rq->state)))
 		return false;
 
-	wqe_bulk = rq->wqe.info.wqe_bulk;
-
-	if (mlx5_wq_cyc_missing(wq) < wqe_bulk)
+	if (mlx5_wq_cyc_missing(wq) < rq->wqe.info.wqe_bulk)
 		return false;
 
 	if (rq->page_pool)
 		page_pool_nid_changed(rq->page_pool, numa_mem_id());
 
-	do {
-		u16 head = mlx5_wq_cyc_get_head(wq);
-		int count;
-		u8 bulk;
+	wqe_bulk = mlx5_wq_cyc_missing(wq);
+	head = mlx5_wq_cyc_get_head(wq);
 
-		/* Don't allow any newly allocated WQEs to share the same page
-		 * with old WQEs that aren't completed yet. Stop earlier.
-		 */
-		bulk = wqe_bulk - ((head + wqe_bulk) & rq->wqe.info.wqe_index_mask);
+	/* Don't allow any newly allocated WQEs to share the same page with old
+	 * WQEs that aren't completed yet. Stop earlier.
+	 */
+	wqe_bulk -= (head + wqe_bulk) & rq->wqe.info.wqe_index_mask;
 
-		count = mlx5e_alloc_rx_wqes(rq, head, bulk);
-		mlx5_wq_cyc_push_n(wq, count);
-		if (unlikely(count != bulk)) {
-			rq->stats->buff_alloc_err++;
-			busy = true;
-			break;
-		}
-	} while (mlx5_wq_cyc_missing(wq) >= wqe_bulk);
+	count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+	mlx5_wq_cyc_push_n(wq, count);
+	if (unlikely(count != wqe_bulk)) {
+		rq->stats->buff_alloc_err++;
+		busy = true;
+	}
 
 	/* ensure wqes are visible to device before updating doorbell record */
 	dma_wmb();
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wq.h b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
index e5c4dcd1425e..4d629e5ddbc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wq.h
@@ -123,7 +123,7 @@ static inline void mlx5_wq_cyc_push(struct mlx5_wq_cyc *wq)
 	wq->cur_sz++;
 }
 
-static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u8 n)
+static inline void mlx5_wq_cyc_push_n(struct mlx5_wq_cyc *wq, u16 n)
 {
 	wq->wqe_ctr += n;
 	wq->cur_sz += n;
-- 
cgit v1.2.3


From a2e5ba242c338208024cd814128c7fa9e22ae2b4 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:55 -0700
Subject: net/mlx5e: xsk: Split out WQE allocation for legacy XSK RQ

Allocation of XSK frames on legacy RQ may be made more efficient with a
specialized routine that relies on certain assumptions, such as there is
only one fragment, allocation units (XSK frames) are not shared among
multiple packets. It reduces the number of branches both in the XSK code
and in the regular RQ, because with this approach there is only a single
check whether it's an XSK or regular RQ.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    | 26 ++++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 11 +++++----
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 4441d35943d1..a850141789a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -8,6 +8,32 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	int i;
+
+	for (i = 0; i < wqe_bulk; i++) {
+		int j = mlx5_wq_cyc_ctr2ix(wq, ix + i);
+		struct mlx5e_wqe_frag_info *frag;
+		struct mlx5e_rx_wqe_cyc *wqe;
+		dma_addr_t addr;
+
+		wqe = mlx5_wq_cyc_get_wqe(wq, j);
+		/* Assumes log_num_frags == 0. */
+		frag = &rq->wqe.frags[j];
+
+		frag->au->xsk = xsk_buff_alloc(rq->xsk_pool);
+		if (unlikely(!frag->au->xsk))
+			return i;
+
+		addr = xsk_buff_xdp_get_frame_dma(frag->au->xsk);
+		wqe->data[0].addr = cpu_to_be64(addr + rq->buff.headroom);
+	}
+
+	return wqe_bulk;
+}
+
 static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
 					       u32 cqe_bcnt)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index e702cb790476..acabcee623f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -9,6 +9,7 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 						    struct mlx5e_mpw_info *wi,
 						    u16 cqe_bcnt,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d620c1ed9b80..6321eb3fff31 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -359,7 +359,7 @@ static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
 		 * offset) should just use the new one without replenishing again
 		 * by themselves.
 		 */
-		err = mlx5e_page_alloc(rq, frag->au);
+		err = mlx5e_page_alloc_pool(rq, frag->au);
 
 	return err;
 }
@@ -393,8 +393,7 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
 			goto free_frags;
 
 		headroom = i == 0 ? rq->buff.headroom : 0;
-		addr = rq->xsk_pool ? xsk_buff_xdp_get_frame_dma(frag->au->xsk) :
-				      page_pool_get_dma_addr(frag->au->page);
+		addr = page_pool_get_dma_addr(frag->au->page);
 		wqe->data[i].addr = cpu_to_be64(addr + frag->offset + headroom);
 	}
 
@@ -826,7 +825,11 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 	 */
 	wqe_bulk -= (head + wqe_bulk) & rq->wqe.info.wqe_index_mask;
 
-	count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+	if (!rq->xsk_pool)
+		count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+	else
+		count = mlx5e_xsk_alloc_rx_wqes(rq, head, wqe_bulk);
+
 	mlx5_wq_cyc_push_n(wq, count);
 	if (unlikely(count != wqe_bulk)) {
 		rq->stats->buff_alloc_err++;
-- 
cgit v1.2.3


From 259bbc64367a130b49c56e68744262ee909a2549 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:56 -0700
Subject: net/mlx5e: xsk: Use xsk_buff_alloc_batch on legacy RQ

XSK provides a function to allocate frames in batches for more efficient
processing. This commit starts using this function on legacy RQ, adding
a special case for XSK. The new branch introduced basically replaces the
branch that was removed from the same place a few commits before.

A check is made that DMA sync is not needed, because the batching
allocator falls back to returning one frame when DMA sync is needed, and
this is best handled by the loop in the standard case.

Performance improvement is up to 8% in the aligned mode and up to 9% in
the unaligned mode.

Aligned mode, 2048-byte frames: 12.8 Mpps -> 13.5 Mpps
Aligned mode, 4096-byte frames: 11.5 Mpps -> 12.4 Mpps
Unaligned mode, 2048-byte frames: 12.2 Mpps -> 13.4 Mpps
Unaligned mode, 3072-byte frames: 11.6 Mpps -> 12.5 Mpps
Unaligned mode, 4096-byte frames: 11.2 Mpps -> 12.2 Mpps

CPU: Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    | 40 ++++++++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  7 ++++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  7 ++++
 4 files changed, 55 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index a850141789a0..812a370f6aea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -8,6 +8,46 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
+{
+	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
+	struct xdp_buff **buffs;
+	u32 contig, alloc;
+	int i;
+
+	/* mlx5e_init_frags_partition creates a 1:1 mapping between
+	 * rq->wqe.frags and rq->wqe.alloc_units, which allows us to
+	 * allocate XDP buffers straight into alloc_units.
+	 */
+	BUILD_BUG_ON(sizeof(rq->wqe.alloc_units[0]) !=
+		     sizeof(rq->wqe.alloc_units[0].xsk));
+	buffs = (struct xdp_buff **)rq->wqe.alloc_units;
+	contig = mlx5_wq_cyc_get_size(wq) - ix;
+	if (wqe_bulk <= contig) {
+		alloc = xsk_buff_alloc_batch(rq->xsk_pool, buffs + ix, wqe_bulk);
+	} else {
+		alloc = xsk_buff_alloc_batch(rq->xsk_pool, buffs + ix, contig);
+		if (likely(alloc == contig))
+			alloc += xsk_buff_alloc_batch(rq->xsk_pool, buffs, wqe_bulk - contig);
+	}
+
+	for (i = 0; i < alloc; i++) {
+		int j = mlx5_wq_cyc_ctr2ix(wq, ix + i);
+		struct mlx5e_wqe_frag_info *frag;
+		struct mlx5e_rx_wqe_cyc *wqe;
+		dma_addr_t addr;
+
+		wqe = mlx5_wq_cyc_get_wqe(wq, j);
+		/* Assumes log_num_frags == 0. */
+		frag = &rq->wqe.frags[j];
+
+		addr = xsk_buff_xdp_get_frame_dma(frag->au->xsk);
+		wqe->data[0].addr = cpu_to_be64(addr + rq->buff.headroom);
+	}
+
+	return alloc;
+}
+
 int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index acabcee623f9..7898a78237b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -9,6 +9,7 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 						    struct mlx5e_mpw_info *wi,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2719247b18db..6a0adda03463 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -433,6 +433,13 @@ static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
 	struct mlx5e_wqe_frag_info *prev = NULL;
 	int i;
 
+	if (rq->xsk_pool) {
+		/* Assumptions used by XSK batched allocator. */
+		WARN_ON(rq->wqe.info.num_frags != 1);
+		WARN_ON(rq->wqe.info.log_num_frags != 0);
+		WARN_ON(rq->wqe.info.arr[0].frag_stride != PAGE_SIZE);
+	}
+
 	next_frag.au = &rq->wqe.alloc_units[0];
 
 	for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 6321eb3fff31..5f411c29157f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -827,7 +827,14 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 
 	if (!rq->xsk_pool)
 		count = mlx5e_alloc_rx_wqes(rq, head, wqe_bulk);
+	else if (likely(!rq->xsk_pool->dma_need_sync))
+		count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);
 	else
+		/* If dma_need_sync is true, it's more efficient to call
+		 * xsk_buff_alloc in a loop, rather than xsk_buff_alloc_batch,
+		 * because the latter does the same check and returns only one
+		 * frame.
+		 */
 		count = mlx5e_xsk_alloc_rx_wqes(rq, head, wqe_bulk);
 
 	mlx5_wq_cyc_push_n(wq, count);
-- 
cgit v1.2.3


From cf544517c4690fd886aa5e664188abe92a377cbf Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:57 -0700
Subject: net/mlx5e: xsk: Use xsk_buff_alloc_batch on striding RQ

XSK provides a function to allocate frames in batches for more efficient
processing. This commit starts using this function on striding RQ and
creates an optimized flow for XSK. A side effect is an opportunity to
optimize the regular RX flow by dropping branching for XSK cases.

Performance improvement is up to 6.4% in the aligned mode and up to 7.5%
in the unaligned mode.

Aligned mode, 2048-byte frames: 12.9 Mpps -> 13.8 Mpps
Aligned mode, 4096-byte frames: 11.8 Mpps -> 12.5 Mpps
Unaligned mode, 2048-byte frames: 11.9 Mpps -> 12.8 Mpps
Unaligned mode, 3072-byte frames: 11.4 Mpps -> 12.1 Mpps
Unaligned mode, 4096-byte frames: 11.0 Mpps -> 11.2 Mpps

CPU: Intel(R) Xeon(R) Gold 6240 CPU @ 2.60GHz

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h  |  7 ++
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    | 88 +++++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 58 +++-----------
 4 files changed, 106 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index f4f306bb8e6d..4456ad5cedf1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -452,4 +452,11 @@ static inline bool mlx5e_icosq_can_post_wqe(struct mlx5e_icosq *sq, u16 wqe_size
 
 	return mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, room);
 }
+
+static inline struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int i)
+{
+	size_t isz = struct_size(rq->mpwqe.info, alloc_units, rq->mpwqe.pages_per_wqe);
+
+	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
+}
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 812a370f6aea..7bd49f0b1271 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -8,6 +8,90 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
+{
+	struct mlx5e_mpw_info *wi = mlx5e_get_mpw_info(rq, ix);
+	struct mlx5e_icosq *icosq = rq->icosq;
+	struct mlx5_wq_cyc *wq = &icosq->wq;
+	struct mlx5e_umr_wqe *umr_wqe;
+	int batch, i;
+	u32 offset; /* 17-bit value with MTT. */
+	u16 pi;
+
+	if (unlikely(!xsk_buff_can_alloc(rq->xsk_pool, rq->mpwqe.pages_per_wqe)))
+		goto err;
+
+	BUILD_BUG_ON(sizeof(wi->alloc_units[0]) != sizeof(wi->alloc_units[0].xsk));
+	batch = xsk_buff_alloc_batch(rq->xsk_pool, (struct xdp_buff **)wi->alloc_units,
+				     rq->mpwqe.pages_per_wqe);
+
+	/* If batch < pages_per_wqe, either:
+	 * 1. Some (or all) descriptors were invalid.
+	 * 2. dma_need_sync is true, and it fell back to allocating one frame.
+	 * In either case, try to continue allocating frames one by one, until
+	 * the first error, which will mean there are no more valid descriptors.
+	 */
+	for (; batch < rq->mpwqe.pages_per_wqe; batch++) {
+		wi->alloc_units[batch].xsk = xsk_buff_alloc(rq->xsk_pool);
+		if (unlikely(!wi->alloc_units[batch].xsk))
+			goto err_reuse_batch;
+	}
+
+	pi = mlx5e_icosq_get_next_pi(icosq, rq->mpwqe.umr_wqebbs);
+	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
+
+	if (unlikely(rq->mpwqe.unaligned)) {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr),
+			};
+		}
+	} else {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			};
+		}
+	}
+
+	bitmap_zero(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe);
+	wi->consumed_strides = 0;
+
+	umr_wqe->ctrl.opmod_idx_opcode =
+		cpu_to_be32((icosq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR);
+
+	offset = ix * rq->mpwqe.mtts_per_wqe;
+	if (likely(!rq->mpwqe.unaligned))
+		offset = MLX5_ALIGNED_MTTS_OCTW(offset);
+	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
+
+	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
+		.wqe_type = MLX5E_ICOSQ_WQE_UMR_RX,
+		.num_wqebbs = rq->mpwqe.umr_wqebbs,
+		.umr.rq = rq,
+	};
+
+	icosq->pc += rq->mpwqe.umr_wqebbs;
+
+	icosq->doorbell_cseg = &umr_wqe->ctrl;
+
+	return 0;
+
+err_reuse_batch:
+	while (--batch >= 0)
+		xsk_buff_free(wi->alloc_units[batch].xsk);
+
+err:
+	rq->stats->buff_alloc_err++;
+	return -ENOMEM;
+}
+
 int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 {
 	struct mlx5_wq_cyc *wq = &rq->wqe.wq;
@@ -112,7 +196,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	 */
 	WARN_ON_ONCE(head_offset);
 
-	xdp->data_end = xdp->data + cqe_bcnt;
+	xsk_buff_set_size(xdp, cqe_bcnt);
 	xdp_set_data_meta_invalid(xdp);
 	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	net_prefetch(xdp->data);
@@ -159,7 +243,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	 */
 	WARN_ON_ONCE(wi->offset);
 
-	xdp->data_end = xdp->data + cqe_bcnt;
+	xsk_buff_set_size(xdp, cqe_bcnt);
 	xdp_set_data_meta_invalid(xdp);
 	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	net_prefetch(xdp->data);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index 7898a78237b8..84a496a8d72f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -9,6 +9,7 @@
 
 /* RX data path */
 
+int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
 int mlx5e_xsk_alloc_rx_wqes_batched(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk);
 struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5f411c29157f..329702e185a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -75,13 +75,6 @@ const struct mlx5e_rx_handlers mlx5e_rx_handlers_nic = {
 	.handle_rx_cqe_mpwqe_shampo = mlx5e_handle_rx_cqe_mpwrq_shampo,
 };
 
-static struct mlx5e_mpw_info *mlx5e_get_mpw_info(struct mlx5e_rq *rq, int i)
-{
-	size_t isz = struct_size(rq->mpwqe.info, alloc_units, rq->mpwqe.pages_per_wqe);
-
-	return (struct mlx5e_mpw_info *)((char *)rq->mpwqe.info + array_size(i, isz));
-}
-
 static inline bool mlx5e_rx_hw_stamp(struct hwtstamp_config *config)
 {
 	return config->rx_filter == HWTSTAMP_FILTER_ALL;
@@ -668,15 +661,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	int err;
 	int i;
 
-	/* Check in advance that we have enough frames, instead of allocating
-	 * one-by-one, failing and moving frames to the Reuse Ring.
-	 */
-	if (rq->xsk_pool &&
-	    unlikely(!xsk_buff_can_alloc(rq->xsk_pool, rq->mpwqe.pages_per_wqe))) {
-		err = -ENOMEM;
-		goto err;
-	}
-
 	if (test_bit(MLX5E_RQ_STATE_SHAMPO, &rq->state)) {
 		err = mlx5e_alloc_rx_hd_mpwqe(rq);
 		if (unlikely(err))
@@ -687,33 +671,16 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
 
-	if (unlikely(rq->mpwqe.unaligned)) {
-		for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, au++) {
-			dma_addr_t addr;
-
-			err = mlx5e_page_alloc(rq, au);
-			if (unlikely(err))
-				goto err_unmap;
-			/* Unaligned means XSK. */
-			addr = xsk_buff_xdp_get_frame_dma(au->xsk);
-			umr_wqe->inline_ksms[i] = (struct mlx5_ksm) {
-				.key = rq->mkey_be,
-				.va = cpu_to_be64(addr),
-			};
-		}
-	} else {
-		for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, au++) {
-			dma_addr_t addr;
+	for (i = 0; i < rq->mpwqe.pages_per_wqe; i++, au++) {
+		dma_addr_t addr;
 
-			err = mlx5e_page_alloc(rq, au);
-			if (unlikely(err))
-				goto err_unmap;
-			addr = rq->xsk_pool ? xsk_buff_xdp_get_frame_dma(au->xsk) :
-					      page_pool_get_dma_addr(au->page);
-			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
-				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
-			};
-		}
+		err = mlx5e_page_alloc_pool(rq, au);
+		if (unlikely(err))
+			goto err_unmap;
+		addr = page_pool_get_dma_addr(au->page);
+		umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+			.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+		};
 	}
 
 	bitmap_zero(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe);
@@ -723,9 +690,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
 			    MLX5_OPCODE_UMR);
 
-	offset = ix * rq->mpwqe.mtts_per_wqe;
-	if (!rq->mpwqe.unaligned)
-		offset = MLX5_ALIGNED_MTTS_OCTW(offset);
+	offset = MLX5_ALIGNED_MTTS_OCTW(ix * rq->mpwqe.mtts_per_wqe);
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
@@ -1016,7 +981,8 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 	head = rq->mpwqe.actual_wq_head;
 	i = missing;
 	do {
-		alloc_err = mlx5e_alloc_rx_mpwqe(rq, head);
+		alloc_err = rq->xsk_pool ? mlx5e_xsk_alloc_rx_mpwqe(rq, head) :
+					   mlx5e_alloc_rx_mpwqe(rq, head);
 
 		if (unlikely(alloc_err))
 			break;
-- 
cgit v1.2.3


From 132857d9124c853c105acf96b557ce866c044970 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:58 -0700
Subject: net/mlx5e: Use non-XSK page allocator in SHAMPO

The SHAMPO flow is not compatible with XSK, it can call the page pool
allocator directly to save a branch.

mlx5e_page_alloc is removed, as it's no longer used in any flow.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 329702e185a9..9d0a5c66c6a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -293,16 +293,6 @@ static inline int mlx5e_page_alloc_pool(struct mlx5e_rq *rq, union mlx5e_alloc_u
 	return 0;
 }
 
-static inline int mlx5e_page_alloc(struct mlx5e_rq *rq, union mlx5e_alloc_unit *au)
-{
-	if (rq->xsk_pool) {
-		au->xsk = xsk_buff_alloc(rq->xsk_pool);
-		return likely(au->xsk) ? 0 : -ENOMEM;
-	} else {
-		return mlx5e_page_alloc_pool(rq, au);
-	}
-}
-
 void mlx5e_page_dma_unmap(struct mlx5e_rq *rq, struct page *page)
 {
 	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
@@ -562,7 +552,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq,
 		if (!(header_offset & (PAGE_SIZE - 1))) {
 			union mlx5e_alloc_unit au;
 
-			err = mlx5e_page_alloc(rq, &au);
+			err = mlx5e_page_alloc_pool(rq, &au);
 			if (unlikely(err))
 				goto err_unmap;
 			page = dma_info->page = au.page;
-- 
cgit v1.2.3


From 96d37d861a09ba4b6ea08b87fa1c173c1af522b1 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:28:59 -0700
Subject: net/mlx5e: Call mlx5e_page_release_dynamic directly where possible

mlx5e_page_release calls the appropriate deallocator depending on
whether it's an XSK RQ or a regular one. Some flows that call this
function are not compatible with XSK, so they can call the non-XSK
deallocator directly to save a branch.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 9d0a5c66c6a9..d0db6a66cb46 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -588,12 +588,8 @@ err_unmap:
 	while (--i >= 0) {
 		dma_info = &shampo->info[--index];
 		if (!(i & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1))) {
-			union mlx5e_alloc_unit au = {
-				.page = dma_info->page,
-			};
-
 			dma_info->addr = ALIGN_DOWN(dma_info->addr, PAGE_SIZE);
-			mlx5e_page_release(rq, &au, true);
+			mlx5e_page_release_dynamic(rq, dma_info->page, true);
 		}
 	}
 	rq->stats->buff_alloc_err++;
@@ -698,7 +694,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 err_unmap:
 	while (--i >= 0) {
 		au--;
-		mlx5e_page_release(rq, au, true);
+		mlx5e_page_release_dynamic(rq, au->page, true);
 	}
 
 err:
@@ -731,12 +727,8 @@ void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq, u16 len, u16 start, bool close
 		hd_info = &shampo->info[index];
 		hd_info->addr = ALIGN_DOWN(hd_info->addr, PAGE_SIZE);
 		if (hd_info->page != deleted_page) {
-			union mlx5e_alloc_unit au = {
-				.page = hd_info->page,
-			};
-
 			deleted_page = hd_info->page;
-			mlx5e_page_release(rq, &au, false);
+			mlx5e_page_release_dynamic(rq, hd_info->page, false);
 		}
 	}
 
@@ -2061,12 +2053,8 @@ mlx5e_free_rx_shampo_hd_entry(struct mlx5e_rq *rq, u16 header_index)
 	u64 addr = shampo->info[header_index].addr;
 
 	if (((header_index + 1) & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1)) == 0) {
-		union mlx5e_alloc_unit au = {
-			.page = shampo->info[header_index].page,
-		};
-
 		shampo->info[header_index].addr = ALIGN_DOWN(addr, PAGE_SIZE);
-		mlx5e_page_release(rq, &au, true);
+		mlx5e_page_release_dynamic(rq, shampo->info[header_index].page, true);
 	}
 	bitmap_clear(shampo->bitmap, header_index, 1);
 }
-- 
cgit v1.2.3


From ddb7afeee28bc37b4815470d00f0c8db3fcabd2e Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:29:00 -0700
Subject: net/mlx5e: Optimize RQ page deallocation

mlx5e_free_rx_mpwqe loops over all pages of a MPWQE, calling
mlx5e_page_release for ones that are not scheduled for XDP_TX or
XDP_REDIRECT; and mlx5e_page_release checks whether it's an XSK RQ or a
regular one for each page/XSK frame. This check can be moved outside the
loop to reduce the number of branches.

mlx5e_free_rx_wqe loops over all fragments, calling mlx5e_page_release
for the ones that are last in a page; and mlx5e_page_release checks
whether it's an XSK RQ or a regular one for each fragment. Using the
fact that XSK doesn't support multiple fragments, it can be optimized
for both XSK and regular usages:

1. Make an early check for XSK and call its deallocator directly, saving
3 branches (loop condition, frag->last_in_page and selection of
deallocator).

2. Call the regular deallocator directly in the non-XSK case, saving a
branch per fragment, except the first one.

After the changes, mlx5e_page_release is removed, as there are no
callers left.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    | 41 ++++++++++++----------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 7bd49f0b1271..661d2d5748f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -253,7 +253,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 		return NULL; /* page/packet was consumed by XDP */
 
 	/* XDP_PASS: copy the data from the UMEM to a new SKB. The frame reuse
-	 * will be handled by mlx5e_put_rx_frag.
+	 * will be handled by mlx5e_free_rx_wqe.
 	 * On SKB allocation failure, NULL is returned.
 	 */
 	return mlx5e_xsk_construct_skb(rq, xdp->data, xdp->data_end - xdp->data);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d0db6a66cb46..36eda4c958a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -317,20 +317,6 @@ void mlx5e_page_release_dynamic(struct mlx5e_rq *rq, struct page *page, bool rec
 	}
 }
 
-static inline void mlx5e_page_release(struct mlx5e_rq *rq,
-				      union mlx5e_alloc_unit *au,
-				      bool recycle)
-{
-	if (rq->xsk_pool)
-		/* The `recycle` parameter is ignored, and the page is always
-		 * put into the Reuse Ring, because there is no way to return
-		 * the page to the userspace when the interface goes down.
-		 */
-		xsk_buff_free(au->xsk);
-	else
-		mlx5e_page_release_dynamic(rq, au->page, recycle);
-}
-
 static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
 				    struct mlx5e_wqe_frag_info *frag)
 {
@@ -352,7 +338,7 @@ static inline void mlx5e_put_rx_frag(struct mlx5e_rq *rq,
 				     bool recycle)
 {
 	if (frag->last_in_page)
-		mlx5e_page_release(rq, frag->au, recycle);
+		mlx5e_page_release_dynamic(rq, frag->au->page, recycle);
 }
 
 static inline struct mlx5e_wqe_frag_info *get_frag(struct mlx5e_rq *rq, u16 ix)
@@ -395,6 +381,15 @@ static inline void mlx5e_free_rx_wqe(struct mlx5e_rq *rq,
 {
 	int i;
 
+	if (rq->xsk_pool) {
+		/* The `recycle` parameter is ignored, and the page is always
+		 * put into the Reuse Ring, because there is no way to return
+		 * the page to the userspace when the interface goes down.
+		 */
+		xsk_buff_free(wi->au->xsk);
+		return;
+	}
+
 	for (i = 0; i < rq->wqe.info.num_frags; i++, wi++)
 		mlx5e_put_rx_frag(rq, wi, recycle);
 }
@@ -463,9 +458,19 @@ mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi, bool recycle
 
 	no_xdp_xmit = bitmap_empty(wi->xdp_xmit_bitmap, rq->mpwqe.pages_per_wqe);
 
-	for (i = 0; i < rq->mpwqe.pages_per_wqe; i++)
-		if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
-			mlx5e_page_release(rq, &alloc_units[i], recycle);
+	if (rq->xsk_pool) {
+		/* The `recycle` parameter is ignored, and the page is always
+		 * put into the Reuse Ring, because there is no way to return
+		 * the page to the userspace when the interface goes down.
+		 */
+		for (i = 0; i < rq->mpwqe.pages_per_wqe; i++)
+			if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
+				xsk_buff_free(alloc_units[i].xsk);
+	} else {
+		for (i = 0; i < rq->mpwqe.pages_per_wqe; i++)
+			if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
+				mlx5e_page_release_dynamic(rq, alloc_units[i].page, recycle);
+	}
 }
 
 static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq, u8 n)
-- 
cgit v1.2.3


From a752b2edb5c158f4ca3386387a3b3df1be856d03 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:29:01 -0700
Subject: net/mlx5e: xsk: Support XDP metadata on XSK RQs

Add support for XDP metadata on XSK RQs for cross-program
communication. The driver no longer calls xdp_set_data_meta_invalid and
copies the metadata to a newly allocated SKB on XDP_PASS.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 661d2d5748f4..aebc1d5a9004 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -158,18 +158,24 @@ int mlx5e_xsk_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, int wqe_bulk)
 	return wqe_bulk;
 }
 
-static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
-					       u32 cqe_bcnt)
+static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, struct xdp_buff *xdp)
 {
+	u32 totallen = xdp->data_end - xdp->data_meta;
+	u32 metalen = xdp->data - xdp->data_meta;
 	struct sk_buff *skb;
 
-	skb = napi_alloc_skb(rq->cq.napi, cqe_bcnt);
+	skb = napi_alloc_skb(rq->cq.napi, totallen);
 	if (unlikely(!skb)) {
 		rq->stats->buff_alloc_err++;
 		return NULL;
 	}
 
-	skb_put_data(skb, data, cqe_bcnt);
+	skb_put_data(skb, xdp->data_meta, totallen);
+
+	if (metalen) {
+		skb_metadata_set(skb, metalen);
+		__skb_pull(skb, metalen);
+	}
 
 	return skb;
 }
@@ -197,7 +203,6 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	WARN_ON_ONCE(head_offset);
 
 	xsk_buff_set_size(xdp, cqe_bcnt);
-	xdp_set_data_meta_invalid(xdp);
 	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	net_prefetch(xdp->data);
 
@@ -226,7 +231,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
 	 * frame. On SKB allocation failure, NULL is returned.
 	 */
-	return mlx5e_xsk_construct_skb(rq, xdp->data, xdp->data_end - xdp->data);
+	return mlx5e_xsk_construct_skb(rq, xdp);
 }
 
 struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
@@ -244,7 +249,6 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	WARN_ON_ONCE(wi->offset);
 
 	xsk_buff_set_size(xdp, cqe_bcnt);
-	xdp_set_data_meta_invalid(xdp);
 	xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
 	net_prefetch(xdp->data);
 
@@ -256,5 +260,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	 * will be handled by mlx5e_free_rx_wqe.
 	 * On SKB allocation failure, NULL is returned.
 	 */
-	return mlx5e_xsk_construct_skb(rq, xdp->data, xdp->data_end - xdp->data);
+	return mlx5e_xsk_construct_skb(rq, xdp);
 }
-- 
cgit v1.2.3


From d9ba64deb2f1ad58eb3067c7485518f3e96559ee Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:29:02 -0700
Subject: net/mlx5e: Introduce the mlx5e_flush_rq function

Add a function to flush an RQ: clean up descriptors, release pages and
reset the RQ. This procedure is used by the recovery flow, and it will
also be used in a following commit to free some memory when switching a
channel to the XSK mode.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  2 +-
 .../ethernet/mellanox/mlx5/core/en/reporter_rx.c   | 23 +-----------------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 28 +++++++++++++++++++++-
 3 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8e174a7f7c25..238307390400 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1097,7 +1097,7 @@ void mlx5e_activate_priv_channels(struct mlx5e_priv *priv);
 void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv);
 int mlx5e_ptp_rx_manage_fs_ctx(struct mlx5e_priv *priv, void *ctx);
 
-int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state);
+int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state);
 void mlx5e_activate_rq(struct mlx5e_rq *rq);
 void mlx5e_deactivate_rq(struct mlx5e_rq *rq);
 void mlx5e_activate_icosq(struct mlx5e_icosq *icosq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index 2b946ae1d97f..5f6f95ad6888 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -134,34 +134,13 @@ out:
 	return err;
 }
 
-static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
-{
-	struct net_device *dev = rq->netdev;
-	int err;
-
-	err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
-	if (err) {
-		netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
-		return err;
-	}
-	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
-	if (err) {
-		netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
-		return err;
-	}
-
-	return 0;
-}
-
 static int mlx5e_rx_reporter_err_rq_cqe_recover(void *ctx)
 {
 	struct mlx5e_rq *rq = ctx;
 	int err;
 
 	mlx5e_deactivate_rq(rq);
-	mlx5e_free_rx_descs(rq);
-
-	err = mlx5e_rq_to_ready(rq, MLX5_RQC_STATE_ERR);
+	err = mlx5e_flush_rq(rq, MLX5_RQC_STATE_ERR);
 	clear_bit(MLX5E_RQ_STATE_RECOVERING, &rq->state);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6a0adda03463..129a0d678cce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -899,7 +899,7 @@ int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param)
 	return err;
 }
 
-int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state)
+static int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state)
 {
 	struct mlx5_core_dev *mdev = rq->mdev;
 
@@ -928,6 +928,32 @@ int mlx5e_modify_rq_state(struct mlx5e_rq *rq, int curr_state, int next_state)
 	return err;
 }
 
+static int mlx5e_rq_to_ready(struct mlx5e_rq *rq, int curr_state)
+{
+	struct net_device *dev = rq->netdev;
+	int err;
+
+	err = mlx5e_modify_rq_state(rq, curr_state, MLX5_RQC_STATE_RST);
+	if (err) {
+		netdev_err(dev, "Failed to move rq 0x%x to reset\n", rq->rqn);
+		return err;
+	}
+	err = mlx5e_modify_rq_state(rq, MLX5_RQC_STATE_RST, MLX5_RQC_STATE_RDY);
+	if (err) {
+		netdev_err(dev, "Failed to move rq 0x%x to ready\n", rq->rqn);
+		return err;
+	}
+
+	return 0;
+}
+
+int mlx5e_flush_rq(struct mlx5e_rq *rq, int curr_state)
+{
+	mlx5e_free_rx_descs(rq);
+
+	return mlx5e_rq_to_ready(rq, curr_state);
+}
+
 static int mlx5e_modify_rq_scatter_fcs(struct mlx5e_rq *rq, bool enable)
 {
 	struct mlx5_core_dev *mdev = rq->mdev;
-- 
cgit v1.2.3


From 3db4c85cde7a514a5277070b32e776dbefcaa838 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Fri, 30 Sep 2022 09:29:03 -0700
Subject: net/mlx5e: xsk: Use queue indices starting from 0 for XSK queues

In the initial implementation of XSK in mlx5e, XSK RQs coexisted with
regular RQs in the same channel. The main idea was to allow RSS work the
same for regular traffic, without need to reconfigure RSS to exclude XSK
queues.

However, this scheme didn't prove to be beneficial, mainly because of
incompatibility with other vendors. Some tools don't properly support
using higher indices for XSK queues, some tools get confused with the
double amount of RQs exposed in sysfs. Some use cases are purely XSK,
and allocating the same amount of unused regular RQs is a waste of
resources.

This commit changes the queuing scheme to the standard one, where XSK
RQs replace regular RQs on the channels where XSK sockets are open. Two
RQs still exist in the channel to allow failsafe disable of XSK, but
only one is exposed at a time. The next commit will achieve the desired
memory save by flushing the buffers when the regular RQ is unused.

As the result of this transition:

1. It's possible to use RSS contexts over XSK RQs.

2. It's possible to dedicate all queues to XSK.

3. When XSK RQs coexist with regular RQs, the admin should make sure no
unwanted traffic goes into XSK RQs by either excluding them from RSS or
settings up the XDP program to return XDP_PASS for non-XSK traffic.

4. When using a mixed fleet of mlx5e devices and other netdevs, the same
configuration can be applied. If the application supports the fallback
to copy mode on unsupported drivers, it will work too.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   7 --
 .../net/ethernet/mellanox/mlx5/core/en/channels.c  |  29 ++---
 .../net/ethernet/mellanox/mlx5/core/en/channels.h  |   3 +-
 .../net/ethernet/mellanox/mlx5/core/en/params.h    |  32 ------
 .../net/ethernet/mellanox/mlx5/core/en/rx_res.c    | 118 +++------------------
 .../net/ethernet/mellanox/mlx5/core/en/rx_res.h    |   9 +-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/pool.c  |  17 +--
 .../net/ethernet/mellanox/mlx5/core/en/xsk/setup.c |   2 +-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.c    |   5 +-
 .../ethernet/mellanox/mlx5/core/en_fs_ethtool.c    |  13 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  17 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |   3 -
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |   1 -
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c |   1 -
 14 files changed, 52 insertions(+), 205 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 238307390400..6bc6472b98f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -181,12 +181,6 @@ do {                                                            \
 #define mlx5e_state_dereference(priv, p) \
 	rcu_dereference_protected((p), lockdep_is_held(&(priv)->state_lock))
 
-enum mlx5e_rq_group {
-	MLX5E_RQ_GROUP_REGULAR,
-	MLX5E_RQ_GROUP_XSK,
-#define MLX5E_NUM_RQ_GROUPS(g) (1 + MLX5E_RQ_GROUP_##g)
-};
-
 static inline u8 mlx5e_get_num_lag_ports(struct mlx5_core_dev *mdev)
 {
 	if (mlx5_lag_is_lacp_owner(mdev))
@@ -1005,7 +999,6 @@ struct mlx5e_profile {
 	mlx5e_stats_grp_t *stats_grps;
 	const struct mlx5e_rx_handlers *rx_handlers;
 	int	max_tc;
-	u8	rq_groups;
 	u32     features;
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
index e7c14c0de0a7..48581ea3adcb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
@@ -10,28 +10,33 @@ unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs)
 	return chs->num;
 }
 
-void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
+static struct mlx5e_channel *mlx5e_channels_get(struct mlx5e_channels *chs, unsigned int ix)
 {
-	struct mlx5e_channel *c;
+	WARN_ON_ONCE(ix >= mlx5e_channels_get_num(chs));
+	return chs->c[ix];
+}
 
-	WARN_ON(ix >= mlx5e_channels_get_num(chs));
-	c = chs->c[ix];
+bool mlx5e_channels_is_xsk(struct mlx5e_channels *chs, unsigned int ix)
+{
+	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
 
-	*rqn = c->rq.rqn;
+	return test_bit(MLX5E_CHANNEL_STATE_XSK, c->state);
 }
 
-bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
+void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
 {
-	struct mlx5e_channel *c;
+	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
 
-	WARN_ON(ix >= mlx5e_channels_get_num(chs));
-	c = chs->c[ix];
+	*rqn = c->rq.rqn;
+}
 
-	if (!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
-		return false;
+void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn)
+{
+	struct mlx5e_channel *c = mlx5e_channels_get(chs, ix);
+
+	WARN_ON_ONCE(!test_bit(MLX5E_CHANNEL_STATE_XSK, c->state));
 
 	*rqn = c->xskrq.rqn;
-	return true;
 }
 
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
index ca00cbc827cb..637ca90daaa8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
@@ -9,8 +9,9 @@
 struct mlx5e_channels;
 
 unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs);
+bool mlx5e_channels_is_xsk(struct mlx5e_channels *chs, unsigned int ix);
 void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
-bool mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
+void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn);
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn);
 
 #endif /* __MLX5_EN_CHANNELS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index cb862c478376..a3952afdcbe4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -53,38 +53,6 @@ struct mlx5e_create_sq_param {
 	u8                          min_inline_mode;
 };
 
-static inline bool mlx5e_qid_get_ch_if_in_group(struct mlx5e_params *params,
-						u16 qid,
-						enum mlx5e_rq_group group,
-						u16 *ix)
-{
-	int nch = params->num_channels;
-	int ch = qid - nch * group;
-
-	if (ch < 0 || ch >= nch)
-		return false;
-
-	*ix = ch;
-	return true;
-}
-
-static inline void mlx5e_qid_get_ch_and_group(struct mlx5e_params *params,
-					      u16 qid,
-					      u16 *ix,
-					      enum mlx5e_rq_group *group)
-{
-	u16 nch = params->num_channels;
-
-	*ix = qid % nch;
-	*group = qid / nch;
-}
-
-static inline bool mlx5e_qid_validate(const struct mlx5e_profile *profile,
-				      struct mlx5e_params *params, u64 qid)
-{
-	return qid < params->num_channels * profile->rq_groups;
-}
-
 /* Striding RQ dynamic parameters */
 
 u8 mlx5e_mpwrq_page_shift(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c
index 3436ecfcbc2f..e1095bc36543 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c
@@ -24,8 +24,6 @@ struct mlx5e_rx_res {
 	struct {
 		struct mlx5e_rqt direct_rqt;
 		struct mlx5e_tir direct_tir;
-		struct mlx5e_rqt xsk_rqt;
-		struct mlx5e_tir xsk_tir;
 	} *channels;
 
 	struct {
@@ -320,48 +318,8 @@ static int mlx5e_rx_res_channels_init(struct mlx5e_rx_res *res)
 		mlx5e_tir_builder_clear(builder);
 	}
 
-	if (!(res->features & MLX5E_RX_RES_FEATURE_XSK))
-		goto out;
-
-	for (ix = 0; ix < res->max_nch; ix++) {
-		err = mlx5e_rqt_init_direct(&res->channels[ix].xsk_rqt,
-					    res->mdev, false, res->drop_rqn);
-		if (err) {
-			mlx5_core_warn(res->mdev, "Failed to create an XSK RQT: err = %d, ix = %u\n",
-				       err, ix);
-			goto err_destroy_xsk_rqts;
-		}
-	}
-
-	for (ix = 0; ix < res->max_nch; ix++) {
-		mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn,
-					    mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt),
-					    inner_ft_support);
-		mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param);
-		mlx5e_tir_builder_build_direct(builder);
-
-		err = mlx5e_tir_init(&res->channels[ix].xsk_tir, builder, res->mdev, true);
-		if (err) {
-			mlx5_core_warn(res->mdev, "Failed to create an XSK TIR: err = %d, ix = %u\n",
-				       err, ix);
-			goto err_destroy_xsk_tirs;
-		}
-
-		mlx5e_tir_builder_clear(builder);
-	}
-
 	goto out;
 
-err_destroy_xsk_tirs:
-	while (--ix >= 0)
-		mlx5e_tir_destroy(&res->channels[ix].xsk_tir);
-
-	ix = res->max_nch;
-err_destroy_xsk_rqts:
-	while (--ix >= 0)
-		mlx5e_rqt_destroy(&res->channels[ix].xsk_rqt);
-
-	ix = res->max_nch;
 err_destroy_direct_tirs:
 	while (--ix >= 0)
 		mlx5e_tir_destroy(&res->channels[ix].direct_tir);
@@ -420,12 +378,6 @@ static void mlx5e_rx_res_channels_destroy(struct mlx5e_rx_res *res)
 	for (ix = 0; ix < res->max_nch; ix++) {
 		mlx5e_tir_destroy(&res->channels[ix].direct_tir);
 		mlx5e_rqt_destroy(&res->channels[ix].direct_rqt);
-
-		if (!(res->features & MLX5E_RX_RES_FEATURE_XSK))
-			continue;
-
-		mlx5e_tir_destroy(&res->channels[ix].xsk_tir);
-		mlx5e_rqt_destroy(&res->channels[ix].xsk_rqt);
 	}
 
 	kvfree(res->channels);
@@ -491,13 +443,6 @@ u32 mlx5e_rx_res_get_tirn_direct(struct mlx5e_rx_res *res, unsigned int ix)
 	return mlx5e_tir_get_tirn(&res->channels[ix].direct_tir);
 }
 
-u32 mlx5e_rx_res_get_tirn_xsk(struct mlx5e_rx_res *res, unsigned int ix)
-{
-	WARN_ON(!(res->features & MLX5E_RX_RES_FEATURE_XSK));
-
-	return mlx5e_tir_get_tirn(&res->channels[ix].xsk_tir);
-}
-
 u32 mlx5e_rx_res_get_tirn_rss(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt)
 {
 	struct mlx5e_rss *rss = res->rss[0];
@@ -527,26 +472,14 @@ static void mlx5e_rx_res_channel_activate_direct(struct mlx5e_rx_res *res,
 						 struct mlx5e_channels *chs,
 						 unsigned int ix)
 {
-	u32 rqn;
+	u32 rqn = res->rss_rqns[ix];
 	int err;
 
-	mlx5e_channels_get_regular_rqn(chs, ix, &rqn);
 	err = mlx5e_rqt_redirect_direct(&res->channels[ix].direct_rqt, rqn);
 	if (err)
 		mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to RQ %#x (channel %u): err = %d\n",
 			       mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt),
 			       rqn, ix, err);
-
-	if (!(res->features & MLX5E_RX_RES_FEATURE_XSK))
-		return;
-
-	if (!mlx5e_channels_get_xsk_rqn(chs, ix, &rqn))
-		rqn = res->drop_rqn;
-	err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, rqn);
-	if (err)
-		mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to RQ %#x (channel %u): err = %d\n",
-			       mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt),
-			       rqn, ix, err);
 }
 
 static void mlx5e_rx_res_channel_deactivate_direct(struct mlx5e_rx_res *res,
@@ -559,15 +492,6 @@ static void mlx5e_rx_res_channel_deactivate_direct(struct mlx5e_rx_res *res,
 		mlx5_core_warn(res->mdev, "Failed to redirect direct RQT %#x to drop RQ %#x (channel %u): err = %d\n",
 			       mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt),
 			       res->drop_rqn, ix, err);
-
-	if (!(res->features & MLX5E_RX_RES_FEATURE_XSK))
-		return;
-
-	err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, res->drop_rqn);
-	if (err)
-		mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to drop RQ %#x (channel %u): err = %d\n",
-			       mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt),
-			       res->drop_rqn, ix, err);
 }
 
 void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs)
@@ -577,8 +501,12 @@ void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_chann
 
 	nch = mlx5e_channels_get_num(chs);
 
-	for (ix = 0; ix < chs->num; ix++)
-		mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix]);
+	for (ix = 0; ix < chs->num; ix++) {
+		if (mlx5e_channels_is_xsk(chs, ix))
+			mlx5e_channels_get_xsk_rqn(chs, ix, &res->rss_rqns[ix]);
+		else
+			mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix]);
+	}
 	res->rss_nch = chs->num;
 
 	mlx5e_rx_res_rss_enable(res);
@@ -621,33 +549,17 @@ void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res)
 	}
 }
 
-int mlx5e_rx_res_xsk_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs,
-			      unsigned int ix)
+void mlx5e_rx_res_xsk_update(struct mlx5e_rx_res *res, struct mlx5e_channels *chs,
+			     unsigned int ix, bool xsk)
 {
-	u32 rqn;
-	int err;
-
-	if (!mlx5e_channels_get_xsk_rqn(chs, ix, &rqn))
-		return -EINVAL;
-
-	err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, rqn);
-	if (err)
-		mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to XSK RQ %#x (channel %u): err = %d\n",
-			       mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt),
-			       rqn, ix, err);
-	return err;
-}
+	if (xsk)
+		mlx5e_channels_get_xsk_rqn(chs, ix, &res->rss_rqns[ix]);
+	else
+		mlx5e_channels_get_regular_rqn(chs, ix, &res->rss_rqns[ix]);
 
-int mlx5e_rx_res_xsk_deactivate(struct mlx5e_rx_res *res, unsigned int ix)
-{
-	int err;
+	mlx5e_rx_res_rss_enable(res);
 
-	err = mlx5e_rqt_redirect_direct(&res->channels[ix].xsk_rqt, res->drop_rqn);
-	if (err)
-		mlx5_core_warn(res->mdev, "Failed to redirect XSK RQT %#x to drop RQ %#x (channel %u): err = %d\n",
-			       mlx5e_rqt_get_rqtn(&res->channels[ix].xsk_rqt),
-			       res->drop_rqn, ix, err);
-	return err;
+	mlx5e_rx_res_channel_activate_direct(res, chs, ix);
 }
 
 int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h
index b39b20a720e0..5d5f64fab60f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h
@@ -17,8 +17,7 @@ struct mlx5e_rss_params_hash;
 
 enum mlx5e_rx_res_features {
 	MLX5E_RX_RES_FEATURE_INNER_FT = BIT(0),
-	MLX5E_RX_RES_FEATURE_XSK = BIT(1),
-	MLX5E_RX_RES_FEATURE_PTP = BIT(2),
+	MLX5E_RX_RES_FEATURE_PTP = BIT(1),
 };
 
 /* Setup */
@@ -32,7 +31,6 @@ void mlx5e_rx_res_free(struct mlx5e_rx_res *res);
 
 /* TIRN getters for flow steering */
 u32 mlx5e_rx_res_get_tirn_direct(struct mlx5e_rx_res *res, unsigned int ix);
-u32 mlx5e_rx_res_get_tirn_xsk(struct mlx5e_rx_res *res, unsigned int ix);
 u32 mlx5e_rx_res_get_tirn_rss(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt);
 u32 mlx5e_rx_res_get_tirn_rss_inner(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt);
 u32 mlx5e_rx_res_get_tirn_ptp(struct mlx5e_rx_res *res);
@@ -40,9 +38,8 @@ u32 mlx5e_rx_res_get_tirn_ptp(struct mlx5e_rx_res *res);
 /* Activate/deactivate API */
 void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs);
 void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res);
-int mlx5e_rx_res_xsk_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs,
-			      unsigned int ix);
-int mlx5e_rx_res_xsk_deactivate(struct mlx5e_rx_res *res, unsigned int ix);
+void mlx5e_rx_res_xsk_update(struct mlx5e_rx_res *res, struct mlx5e_channels *chs,
+			     unsigned int ix, bool xsk);
 
 /* Configuration API */
 void mlx5e_rx_res_rss_set_indir_uniform(struct mlx5e_rx_res *res, unsigned int nch);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
index 6058b1e72c6c..9804ef15a4d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
@@ -124,16 +124,10 @@ static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
 	 * any Fill Ring entries at the setup stage.
 	 */
 
-	err = mlx5e_rx_res_xsk_activate(priv->rx_res, &priv->channels, ix);
-	if (unlikely(err))
-		goto err_deactivate;
+	mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, true);
 
 	return 0;
 
-err_deactivate:
-	mlx5e_deactivate_xsk(c);
-	mlx5e_close_xsk(c);
-
 err_remove_pool:
 	mlx5e_xsk_remove_pool(&priv->xsk, ix);
 
@@ -171,7 +165,7 @@ static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix)
 		goto remove_pool;
 
 	c = priv->channels.c[ix];
-	mlx5e_rx_res_xsk_deactivate(priv->rx_res, ix);
+	mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, false);
 	mlx5e_deactivate_xsk(c);
 	mlx5e_close_xsk(c);
 
@@ -209,11 +203,10 @@ int mlx5e_xsk_setup_pool(struct net_device *dev, struct xsk_buff_pool *pool, u16
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_params *params = &priv->channels.params;
-	u16 ix;
 
-	if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix)))
+	if (unlikely(qid >= params->num_channels))
 		return -EINVAL;
 
-	return pool ? mlx5e_xsk_enable_pool(priv, pool, ix) :
-		      mlx5e_xsk_disable_pool(priv, ix);
+	return pool ? mlx5e_xsk_enable_pool(priv, pool, qid) :
+		      mlx5e_xsk_disable_pool(priv, qid);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
index d7dfc7d2c058..ff03c43833bb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c
@@ -66,7 +66,7 @@ static int mlx5e_init_xsk_rq(struct mlx5e_channel *c,
 	rq->xsk_pool     = pool;
 	rq->stats        = &c->priv->channel_stats[c->ix]->xskrq;
 	rq->ptp_cyc2time = mlx5_rq_ts_translator(mdev);
-	rq_xdp_ix        = c->ix + params->num_channels * MLX5E_RQ_GROUP_XSK;
+	rq_xdp_ix        = c->ix;
 	err = mlx5e_rq_set_handlers(rq, params, xsk);
 	if (err)
 		return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
index c856fc3f197e..367a9505ca4f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c
@@ -12,15 +12,14 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
 	struct mlx5e_priv *priv = netdev_priv(dev);
 	struct mlx5e_params *params = &priv->channels.params;
 	struct mlx5e_channel *c;
-	u16 ix;
 
 	if (unlikely(!mlx5e_xdp_is_active(priv)))
 		return -ENETDOWN;
 
-	if (unlikely(!mlx5e_qid_get_ch_if_in_group(params, qid, MLX5E_RQ_GROUP_XSK, &ix)))
+	if (unlikely(qid >= params->num_channels))
 		return -EINVAL;
 
-	c = priv->channels.c[ix];
+	c = priv->channels.c[qid];
 
 	if (!napi_if_scheduled_mark_missed(&c->napi)) {
 		/* To avoid WQE overrun, don't post a NOP if async_icosq is not
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 2a67798cd446..aac32e505c14 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -451,15 +451,7 @@ static int flow_get_tirn(struct mlx5e_priv *priv,
 		eth_rule->rss = rss;
 		mlx5e_rss_refcnt_inc(eth_rule->rss);
 	} else {
-		struct mlx5e_params *params = &priv->channels.params;
-		enum mlx5e_rq_group group;
-		u16 ix;
-
-		mlx5e_qid_get_ch_and_group(params, fs->ring_cookie, &ix, &group);
-
-		*tirn = group == MLX5E_RQ_GROUP_XSK ?
-			mlx5e_rx_res_get_tirn_xsk(priv->rx_res, ix) :
-			mlx5e_rx_res_get_tirn_direct(priv->rx_res, ix);
+		*tirn = mlx5e_rx_res_get_tirn_direct(priv->rx_res, fs->ring_cookie);
 	}
 
 	return 0;
@@ -682,8 +674,7 @@ static int validate_flow(struct mlx5e_priv *priv,
 		return -ENOSPC;
 
 	if (fs->ring_cookie != RX_CLS_FLOW_DISC)
-		if (!mlx5e_qid_validate(priv->profile, &priv->channels.params,
-					fs->ring_cookie))
+		if (fs->ring_cookie >= priv->channels.params.num_channels)
 			return -EINVAL;
 
 	switch (flow_type_mask(fs->flow_type)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 129a0d678cce..21fe43406d88 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2690,7 +2690,7 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv)
 	struct netdev_tc_txq old_tc_to_txq[TC_MAX_QUEUE], *tc_to_txq;
 	struct net_device *netdev = priv->netdev;
 	int old_num_txqs, old_ntc;
-	int num_rxqs, nch, ntc;
+	int nch, ntc;
 	int err;
 	int i;
 
@@ -2701,7 +2701,6 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv)
 
 	nch = priv->channels.params.num_channels;
 	ntc = priv->channels.params.mqprio.num_tc;
-	num_rxqs = nch * priv->profile->rq_groups;
 	tc_to_txq = priv->channels.params.mqprio.tc_to_txq;
 
 	err = mlx5e_netdev_set_tcs(netdev, nch, ntc, tc_to_txq);
@@ -2710,7 +2709,7 @@ static int mlx5e_update_netdev_queues(struct mlx5e_priv *priv)
 	err = mlx5e_update_tx_netdev_queues(priv);
 	if (err)
 		goto err_tcs;
-	err = netif_set_real_num_rx_queues(netdev, num_rxqs);
+	err = netif_set_real_num_rx_queues(netdev, nch);
 	if (err) {
 		netdev_warn(netdev, "netif_set_real_num_rx_queues failed, %d\n", err);
 		goto err_txqs;
@@ -5199,7 +5198,7 @@ static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
 		goto err_destroy_q_counters;
 	}
 
-	features = MLX5E_RX_RES_FEATURE_XSK | MLX5E_RX_RES_FEATURE_PTP;
+	features = MLX5E_RX_RES_FEATURE_PTP;
 	if (priv->channels.params.tunneled_offload_en)
 		features |= MLX5E_RX_RES_FEATURE_INNER_FT;
 	err = mlx5e_rx_res_init(priv->rx_res, priv->mdev, features,
@@ -5390,7 +5389,6 @@ static const struct mlx5e_profile mlx5e_nic_profile = {
 	.update_carrier	   = mlx5e_update_carrier,
 	.rx_handlers       = &mlx5e_rx_handlers_nic,
 	.max_tc		   = MLX5E_MAX_NUM_TC,
-	.rq_groups	   = MLX5E_NUM_RQ_GROUPS(XSK),
 	.stats_grps	   = mlx5e_nic_stats_grps,
 	.stats_grps_num	   = mlx5e_nic_stats_grps_num,
 	.features          = BIT(MLX5E_PROFILE_FEATURE_PTP_RX) |
@@ -5423,8 +5421,7 @@ mlx5e_calc_max_nch(struct mlx5_core_dev *mdev, struct net_device *netdev,
 	max_nch = mlx5e_profile_max_num_channels(mdev, profile);
 
 	/* netdev rx queues */
-	tmp = netdev->num_rx_queues / max_t(u8, profile->rq_groups, 1);
-	max_nch = min_t(unsigned int, max_nch, tmp);
+	max_nch = min_t(unsigned int, max_nch, netdev->num_rx_queues);
 
 	/* netdev tx queues */
 	tmp = netdev->num_tx_queues;
@@ -5568,11 +5565,7 @@ static unsigned int mlx5e_get_max_num_txqs(struct mlx5_core_dev *mdev,
 static unsigned int mlx5e_get_max_num_rxqs(struct mlx5_core_dev *mdev,
 					   const struct mlx5e_profile *profile)
 {
-	unsigned int nch;
-
-	nch = mlx5e_profile_max_num_channels(mdev, profile);
-
-	return nch * profile->rq_groups;
+	return mlx5e_profile_max_num_channels(mdev, profile);
 }
 
 struct net_device *
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 83b2febe8a7b..794cd8dfe9c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1224,7 +1224,6 @@ static const struct mlx5e_profile mlx5e_rep_profile = {
 	.update_stats           = mlx5e_stats_update_ndo_stats,
 	.rx_handlers            = &mlx5e_rx_handlers_rep,
 	.max_tc			= 1,
-	.rq_groups		= MLX5E_NUM_RQ_GROUPS(REGULAR),
 	.stats_grps		= mlx5e_rep_stats_grps,
 	.stats_grps_num		= mlx5e_rep_stats_grps_num,
 	.max_nch_limit		= mlx5e_rep_max_nch_limit,
@@ -1244,8 +1243,6 @@ static const struct mlx5e_profile mlx5e_uplink_rep_profile = {
 	.update_carrier	        = mlx5e_update_carrier,
 	.rx_handlers            = &mlx5e_rx_handlers_rep,
 	.max_tc			= MLX5E_MAX_NUM_TC,
-	/* XSK is needed so we can replace profile with NIC netdev */
-	.rq_groups		= MLX5E_NUM_RQ_GROUPS(XSK),
 	.stats_grps		= mlx5e_ul_rep_stats_grps,
 	.stats_grps_num		= mlx5e_ul_rep_stats_grps_num,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index 35f797cfd21e..4e3a75496dd9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -463,7 +463,6 @@ static const struct mlx5e_profile mlx5i_nic_profile = {
 	.update_carrier    = NULL, /* no HW update in IB link */
 	.rx_handlers       = &mlx5i_rx_handlers,
 	.max_tc		   = MLX5I_MAX_NUM_TC,
-	.rq_groups	   = MLX5E_NUM_RQ_GROUPS(REGULAR),
 	.stats_grps        = mlx5i_stats_grps,
 	.stats_grps_num    = mlx5i_stats_grps_num,
 };
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
index 0b86e78dbc0e..0227a521d301 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c
@@ -349,7 +349,6 @@ static const struct mlx5e_profile mlx5i_pkey_nic_profile = {
 	.update_stats	   = NULL,
 	.rx_handlers       = &mlx5i_rx_handlers,
 	.max_tc		   = MLX5I_MAX_NUM_TC,
-	.rq_groups	   = MLX5E_NUM_RQ_GROUPS(REGULAR),
 };
 
 const struct mlx5e_profile *mlx5i_pkey_get_profile(void)
-- 
cgit v1.2.3


From f938a5295cedf7fdcc4f1caad0ac1a9be1f1a5f5 Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Mon, 26 Sep 2022 22:46:31 +0200
Subject: dt-bindings: i2c: st,stm32-i2c: Document interrupt-names property

Document interrupt-names property with "event" and "error" interrupt names.
This fixes dtbs_check warnings when building current Linux DTs:

"
arch/arm/boot/dts/stm32mp153c-dhcom-drc02.dtb: i2c@40015000: Unevaluated properties are not allowed ('interrupt-names' was unexpected)
"

Signed-off-by: Marek Vasut <marex@denx.de>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml b/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml
index a41588763786..63958cac339b 100644
--- a/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml
@@ -57,6 +57,11 @@ properties:
       - description: interrupt ID for I2C event
       - description: interrupt ID for I2C error
 
+  interrupt-names:
+    items:
+      - const: event
+      - const: error
+
   resets:
     maxItems: 1
 
-- 
cgit v1.2.3


From 367d4c887acd2be7524bd532ae9446ed2f508c3c Mon Sep 17 00:00:00 2001
From: Marek Vasut <marex@denx.de>
Date: Mon, 26 Sep 2022 22:46:53 +0200
Subject: dt-bindings: i2c: st,stm32-i2c: Document wakeup-source property

Document wakeup-source property. This fixes dtbs_check warnings
when building current Linux DTs:

"
arch/arm/boot/dts/stm32mp153c-dhcom-drc02.dtb: i2c@40015000: Unevaluated properties are not allowed ('wakeup-source' was unexpected)
"

Signed-off-by: Marek Vasut <marex@denx.de>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml b/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml
index 63958cac339b..bf396e9466aa 100644
--- a/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml
+++ b/Documentation/devicetree/bindings/i2c/st,stm32-i2c.yaml
@@ -97,6 +97,8 @@ properties:
           - description: register offset within syscfg
           - description: register bitmask for FMP bit
 
+  wakeup-source: true
+
 required:
   - compatible
   - reg
-- 
cgit v1.2.3


From e2062df704dea47efe16edcaa2316d7b5ecca64f Mon Sep 17 00:00:00 2001
From: Zhang Qilong <zhangqilong3@huawei.com>
Date: Thu, 29 Sep 2022 22:30:38 +0800
Subject: i2c: davinci: fix PM disable depth imbalance in davinci_i2c_probe

The pm_runtime_enable will increase power disable depth. Thus a
pairing decrement is needed on the error handling path to keep
it balanced according to context.

Fixes: 17f88151ff190 ("i2c: davinci: Add PM Runtime Support")
Signed-off-by: Zhang Qilong <zhangqilong3@huawei.com>
Reviewed-by: Bartosz Golaszewski <brgl@bgdev.pl>
Signed-off-by: Wolfram Sang <wsa@kernel.org>
---
 drivers/i2c/busses/i2c-davinci.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c
index 471c47db546b..c836cf884185 100644
--- a/drivers/i2c/busses/i2c-davinci.c
+++ b/drivers/i2c/busses/i2c-davinci.c
@@ -823,7 +823,7 @@ static int davinci_i2c_probe(struct platform_device *pdev)
 	r = pm_runtime_resume_and_get(dev->dev);
 	if (r < 0) {
 		dev_err(dev->dev, "failed to runtime_get device: %d\n", r);
-		return r;
+		goto err_pm;
 	}
 
 	i2c_davinci_init(dev);
@@ -882,6 +882,7 @@ static int davinci_i2c_probe(struct platform_device *pdev)
 err_unuse_clocks:
 	pm_runtime_dont_use_autosuspend(dev->dev);
 	pm_runtime_put_sync(dev->dev);
+err_pm:
 	pm_runtime_disable(dev->dev);
 
 	return r;
-- 
cgit v1.2.3


From 4e6263ec8bc965f0bd1c24bb40694a38e382e7a9 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Tue, 27 Sep 2022 20:48:53 +0800
Subject: net: sched: ensure n arg not empty before call bind_class

All bind_class callbacks are directly returned when n arg is empty.
Therefore, bind_class is invoked only when n arg is not empty.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 7c15f1f3da17..c98af0ada706 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1915,7 +1915,7 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
 {
 	struct tcf_bind_args *a = (void *)arg;
 
-	if (tp->ops->bind_class) {
+	if (n && tp->ops->bind_class) {
 		struct Qdisc *q = tcf_block_q(tp->chain->block);
 
 		sch_tree_lock(q);
-- 
cgit v1.2.3


From 402963e34a707e4a8f1854ed86437bc375d65766 Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Tue, 27 Sep 2022 20:48:54 +0800
Subject: net: sched: cls_api: introduce tc_cls_bind_class() helper

All the bind_class callback duplicate the same logic, this patch
introduces tc_cls_bind_class() helper for common usage.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index d376c995d906..4cabb32a2ad9 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -210,6 +210,18 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r)
 	__tcf_unbind_filter(q, r);
 }
 
+static inline void tc_cls_bind_class(u32 classid, unsigned long cl,
+				     void *q, struct tcf_result *res,
+				     unsigned long base)
+{
+	if (res->classid == classid) {
+		if (cl)
+			__tcf_bind_filter(q, res, base);
+		else
+			__tcf_unbind_filter(q, res);
+	}
+}
+
 struct tcf_exts {
 #ifdef CONFIG_NET_CLS_ACT
 	__u32	type; /* for backward compat(TCA_OLD_COMPAT) */
-- 
cgit v1.2.3


From cc9039a1349425516eca369183c5a8d2f139cb1b Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Tue, 27 Sep 2022 20:48:55 +0800
Subject: net: sched: use tc_cls_bind_class() in filter

Use tc_cls_bind_class() in filter.

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_basic.c    | 7 +------
 net/sched/cls_bpf.c      | 7 +------
 net/sched/cls_flower.c   | 7 +------
 net/sched/cls_fw.c       | 7 +------
 net/sched/cls_matchall.c | 7 +------
 net/sched/cls_route.c    | 7 +------
 net/sched/cls_rsvp.h     | 7 +------
 net/sched/cls_tcindex.c  | 7 +------
 net/sched/cls_u32.c      | 7 +------
 9 files changed, 9 insertions(+), 54 deletions(-)

diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index d9fbaa0fbe8b..d229ce99e554 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -261,12 +261,7 @@ static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct basic_filter *f = fh;
 
-	if (f && f->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &f->res, base);
-		else
-			__tcf_unbind_filter(q, &f->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &f->res, base);
 }
 
 static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 938be14cfa3f..bc317b3eac12 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -635,12 +635,7 @@ static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl,
 {
 	struct cls_bpf_prog *prog = fh;
 
-	if (prog && prog->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &prog->res, base);
-		else
-			__tcf_unbind_filter(q, &prog->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &prog->res, base);
 }
 
 static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg,
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 22d32b82bc09..25bc57ee6ea1 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -3405,12 +3405,7 @@ static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct cls_fl_filter *f = fh;
 
-	if (f && f->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &f->res, base);
-		else
-			__tcf_unbind_filter(q, &f->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &f->res, base);
 }
 
 static bool fl_delete_empty(struct tcf_proto *tp)
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index fa66191574a4..a32351da968c 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -416,12 +416,7 @@ static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct fw_filter *f = fh;
 
-	if (f && f->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &f->res, base);
-		else
-			__tcf_unbind_filter(q, &f->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &f->res, base);
 }
 
 static struct tcf_proto_ops cls_fw_ops __read_mostly = {
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 63b99ffb7dbc..39a5d9c170de 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -394,12 +394,7 @@ static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct cls_mall_head *head = fh;
 
-	if (head && head->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &head->res, base);
-		else
-			__tcf_unbind_filter(q, &head->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &head->res, base);
 }
 
 static struct tcf_proto_ops cls_mall_ops __read_mostly = {
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 17bb04af2fa8..9e43b929d4ca 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -649,12 +649,7 @@ static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct route4_filter *f = fh;
 
-	if (f && f->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &f->res, base);
-		else
-			__tcf_unbind_filter(q, &f->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &f->res, base);
 }
 
 static struct tcf_proto_ops cls_route4_ops __read_mostly = {
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index fb60f2c2c325..b00a7dbd0587 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -733,12 +733,7 @@ static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct rsvp_filter *f = fh;
 
-	if (f && f->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &f->res, base);
-		else
-			__tcf_unbind_filter(q, &f->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &f->res, base);
 }
 
 static struct tcf_proto_ops RSVP_OPS __read_mostly = {
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index a33076033462..1c9eeb98d826 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -691,12 +691,7 @@ static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl,
 {
 	struct tcindex_filter_result *r = fh;
 
-	if (r && r->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &r->res, base);
-		else
-			__tcf_unbind_filter(q, &r->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &r->res, base);
 }
 
 static struct tcf_proto_ops cls_tcindex_ops __read_mostly = {
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 0b3d909214b8..34d25f7a0687 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -1250,12 +1250,7 @@ static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q,
 {
 	struct tc_u_knode *n = fh;
 
-	if (n && n->res.classid == classid) {
-		if (cl)
-			__tcf_bind_filter(q, &n->res, base);
-		else
-			__tcf_unbind_filter(q, &n->res);
-	}
+	tc_cls_bind_class(classid, cl, q, &n->res, base);
 }
 
 static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh,
-- 
cgit v1.2.3


From 4fe89d07dcc2804c8b562f6c7896a45643d34b2f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 2 Oct 2022 14:09:07 -0700
Subject: Linux 6.0

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 647a42a1f800..8478e13e9424 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 6
 PATCHLEVEL = 0
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION =
 NAME = Hurr durr I'ma ninja sloth
 
 # *DOCUMENTATION*
-- 
cgit v1.2.3


From a91b750fd6629354460282bbf5146c01b05c4859 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 29 Sep 2022 00:25:37 +0900
Subject: net: rds: don't hold sock lock when cancelling work from
 rds_tcp_reset_callbacks()

syzbot is reporting lockdep warning at rds_tcp_reset_callbacks() [1], for
commit ac3615e7f3cffe2a ("RDS: TCP: Reduce code duplication in
rds_tcp_reset_callbacks()") added cancel_delayed_work_sync() into a section
protected by lock_sock() without realizing that rds_send_xmit() might call
lock_sock().

We don't need to protect cancel_delayed_work_sync() using lock_sock(), for
even if rds_{send,recv}_worker() re-queued this work while __flush_work()
 from cancel_delayed_work_sync() was waiting for this work to complete,
retried rds_{send,recv}_worker() is no-op due to the absence of RDS_CONN_UP
bit.

Link: https://syzkaller.appspot.com/bug?extid=78c55c7bc6f66e53dce2 [1]
Reported-by: syzbot <syzbot+78c55c7bc6f66e53dce2@syzkaller.appspotmail.com>
Co-developed-by: Hillf Danton <hdanton@sina.com>
Signed-off-by: Hillf Danton <hdanton@sina.com>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: syzbot <syzbot+78c55c7bc6f66e53dce2@syzkaller.appspotmail.com>
Fixes: ac3615e7f3cffe2a ("RDS: TCP: Reduce code duplication in rds_tcp_reset_callbacks()")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 73ee2771093d..d0ff413f697c 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -166,10 +166,10 @@ void rds_tcp_reset_callbacks(struct socket *sock,
 	 */
 	atomic_set(&cp->cp_state, RDS_CONN_RESETTING);
 	wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags));
-	lock_sock(osock->sk);
 	/* reset receive side state for rds_tcp_data_recv() for osock  */
 	cancel_delayed_work_sync(&cp->cp_send_w);
 	cancel_delayed_work_sync(&cp->cp_recv_w);
+	lock_sock(osock->sk);
 	if (tc->t_tinc) {
 		rds_inc_put(&tc->t_tinc->ti_inc);
 		tc->t_tinc = NULL;
-- 
cgit v1.2.3


From 537dd2d9fb9f4aa7939fb4fcf552ebe4f497bd7e Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Thu, 29 Sep 2022 21:52:02 +0800
Subject: net: Add helper function to parse netlink msg of ip_tunnel_encap

Add ip_tunnel_netlink_encap_parms to parse netlink msg of ip_tunnel_encap.
Reduces duplicate code, no actual functional changes.

Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h  |  3 +++
 net/ipv4/ip_tunnel_core.c | 35 +++++++++++++++++++++++++++++++++++
 net/ipv4/ipip.c           | 38 ++------------------------------------
 net/ipv6/ip6_tunnel.c     | 37 ++-----------------------------------
 net/ipv6/sit.c            | 38 ++------------------------------------
 5 files changed, 44 insertions(+), 107 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index ced80e2f8b58..51da2957cf48 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -302,6 +302,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 		      struct ip_tunnel_parm *p, __u32 fwmark);
 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
+bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
+				   struct ip_tunnel_encap *encap);
+
 extern const struct header_ops ip_tunnel_header_ops;
 __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb);
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index cc1caab4a654..6d08f7e39191 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -1079,3 +1079,38 @@ EXPORT_SYMBOL(ip_tunnel_parse_protocol);
 
 const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
 EXPORT_SYMBOL(ip_tunnel_header_ops);
+
+/* This function returns true when ENCAP attributes are present in the nl msg */
+bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
+				   struct ip_tunnel_encap *encap)
+{
+	bool ret = false;
+
+	memset(encap, 0, sizeof(*encap));
+
+	if (!data)
+		return ret;
+
+	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
+		ret = true;
+		encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
+		ret = true;
+		encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
+		ret = true;
+		encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
+	}
+
+	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
+		ret = true;
+		encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 123ea63a04cb..7c64ca06adf3 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -448,40 +448,6 @@ static void ipip_netlink_parms(struct nlattr *data[],
 		*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
 }
 
-/* This function returns true when ENCAP attributes are present in the nl msg */
-static bool ipip_netlink_encap_parms(struct nlattr *data[],
-				     struct ip_tunnel_encap *ipencap)
-{
-	bool ret = false;
-
-	memset(ipencap, 0, sizeof(*ipencap));
-
-	if (!data)
-		return ret;
-
-	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
-		ret = true;
-		ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
-		ret = true;
-		ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
-		ret = true;
-		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
-		ret = true;
-		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
-	}
-
-	return ret;
-}
-
 static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct nlattr *tb[], struct nlattr *data[],
 			struct netlink_ext_ack *extack)
@@ -491,7 +457,7 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
 	struct ip_tunnel_encap ipencap;
 	__u32 fwmark = 0;
 
-	if (ipip_netlink_encap_parms(data, &ipencap)) {
+	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 		int err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
@@ -512,7 +478,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 	bool collect_md;
 	__u32 fwmark = t->fwmark;
 
-	if (ipip_netlink_encap_parms(data, &ipencap)) {
+	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 		int err = ip_tunnel_encap_setup(t, &ipencap);
 
 		if (err < 0)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9e97f3b4c7e8..cc5d5e75b658 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1988,39 +1988,6 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
 		parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
 }
 
-static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
-					struct ip_tunnel_encap *ipencap)
-{
-	bool ret = false;
-
-	memset(ipencap, 0, sizeof(*ipencap));
-
-	if (!data)
-		return ret;
-
-	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
-		ret = true;
-		ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
-		ret = true;
-		ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
-		ret = true;
-		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
-		ret = true;
-		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
-	}
-
-	return ret;
-}
-
 static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 			   struct nlattr *tb[], struct nlattr *data[],
 			   struct netlink_ext_ack *extack)
@@ -2033,7 +2000,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
 
 	nt = netdev_priv(dev);
 
-	if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 		err = ip6_tnl_encap_setup(nt, &ipencap);
 		if (err < 0)
 			return err;
@@ -2070,7 +2037,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (dev == ip6n->fb_tnl_dev)
 		return -EINVAL;
 
-	if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 		int err = ip6_tnl_encap_setup(t, &ipencap);
 
 		if (err < 0)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 98f1cf40746f..a8a258f672fa 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1534,40 +1534,6 @@ static void ipip6_netlink_parms(struct nlattr *data[],
 		*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
 }
 
-/* This function returns true when ENCAP attributes are present in the nl msg */
-static bool ipip6_netlink_encap_parms(struct nlattr *data[],
-				      struct ip_tunnel_encap *ipencap)
-{
-	bool ret = false;
-
-	memset(ipencap, 0, sizeof(*ipencap));
-
-	if (!data)
-		return ret;
-
-	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
-		ret = true;
-		ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
-		ret = true;
-		ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
-		ret = true;
-		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
-	}
-
-	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
-		ret = true;
-		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
-	}
-
-	return ret;
-}
-
 #ifdef CONFIG_IPV6_SIT_6RD
 /* This function returns true when 6RD attributes are present in the nl msg */
 static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
@@ -1619,7 +1585,7 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
 
 	nt = netdev_priv(dev);
 
-	if (ipip6_netlink_encap_parms(data, &ipencap)) {
+	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 		err = ip_tunnel_encap_setup(nt, &ipencap);
 		if (err < 0)
 			return err;
@@ -1671,7 +1637,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (dev == sitn->fb_tunnel_dev)
 		return -EINVAL;
 
-	if (ipip6_netlink_encap_parms(data, &ipencap)) {
+	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
 		err = ip_tunnel_encap_setup(t, &ipencap);
 		if (err < 0)
 			return err;
-- 
cgit v1.2.3


From b86fca800a6a3d439c454b462f7f067a18234e60 Mon Sep 17 00:00:00 2001
From: Liu Jian <liujian56@huawei.com>
Date: Thu, 29 Sep 2022 21:52:03 +0800
Subject: net: Add helper function to parse netlink msg of ip_tunnel_parm

Add ip_tunnel_netlink_parms to parse netlink msg of ip_tunnel_parm.
Reduces duplicate code, no actual functional changes.

Signed-off-by: Liu Jian <liujian56@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip_tunnels.h  |  3 +++
 net/ipv4/ip_tunnel_core.c | 32 ++++++++++++++++++++++++++++++++
 net/ipv4/ipip.c           | 24 +-----------------------
 net/ipv6/sit.c            | 27 +--------------------------
 4 files changed, 37 insertions(+), 49 deletions(-)

diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 51da2957cf48..fca357679816 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -305,6 +305,9 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
 				   struct ip_tunnel_encap *encap);
 
+void ip_tunnel_netlink_parms(struct nlattr *data[],
+			     struct ip_tunnel_parm *parms);
+
 extern const struct header_ops ip_tunnel_header_ops;
 __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb);
 
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6d08f7e39191..92c02c886fe7 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -1114,3 +1114,35 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
 	return ret;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
+
+void ip_tunnel_netlink_parms(struct nlattr *data[],
+			     struct ip_tunnel_parm *parms)
+{
+	if (data[IFLA_IPTUN_LINK])
+		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+	if (data[IFLA_IPTUN_LOCAL])
+		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+
+	if (data[IFLA_IPTUN_REMOTE])
+		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+
+	if (data[IFLA_IPTUN_TTL]) {
+		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+		if (parms->iph.ttl)
+			parms->iph.frag_off = htons(IP_DF);
+	}
+
+	if (data[IFLA_IPTUN_TOS])
+		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
+		parms->iph.frag_off = htons(IP_DF);
+
+	if (data[IFLA_IPTUN_FLAGS])
+		parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+
+	if (data[IFLA_IPTUN_PROTO])
+		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7c64ca06adf3..180f9daf5bec 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -417,29 +417,7 @@ static void ipip_netlink_parms(struct nlattr *data[],
 	if (!data)
 		return;
 
-	if (data[IFLA_IPTUN_LINK])
-		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
-
-	if (data[IFLA_IPTUN_LOCAL])
-		parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
-
-	if (data[IFLA_IPTUN_REMOTE])
-		parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
-
-	if (data[IFLA_IPTUN_TTL]) {
-		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
-		if (parms->iph.ttl)
-			parms->iph.frag_off = htons(IP_DF);
-	}
-
-	if (data[IFLA_IPTUN_TOS])
-		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
-
-	if (data[IFLA_IPTUN_PROTO])
-		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
-
-	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
-		parms->iph.frag_off = htons(IP_DF);
+	ip_tunnel_netlink_parms(data, parms);
 
 	if (data[IFLA_IPTUN_COLLECT_METADATA])
 		*collect_md = true;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a8a258f672fa..d27683e3fc97 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1503,32 +1503,7 @@ static void ipip6_netlink_parms(struct nlattr *data[],
 	if (!data)
 		return;
 
-	if (data[IFLA_IPTUN_LINK])
-		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
-
-	if (data[IFLA_IPTUN_LOCAL])
-		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
-
-	if (data[IFLA_IPTUN_REMOTE])
-		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
-
-	if (data[IFLA_IPTUN_TTL]) {
-		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
-		if (parms->iph.ttl)
-			parms->iph.frag_off = htons(IP_DF);
-	}
-
-	if (data[IFLA_IPTUN_TOS])
-		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
-
-	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
-		parms->iph.frag_off = htons(IP_DF);
-
-	if (data[IFLA_IPTUN_FLAGS])
-		parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
-
-	if (data[IFLA_IPTUN_PROTO])
-		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+	ip_tunnel_netlink_parms(data, parms);
 
 	if (data[IFLA_IPTUN_FWMARK])
 		*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
-- 
cgit v1.2.3


From 7a62ed61367b8fd01bae1e18e30602c25060d824 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Thu, 29 Sep 2022 08:52:04 -0700
Subject: af_unix: Fix memory leaks of the whole sk due to OOB skb.

syzbot reported a sequence of memory leaks, and one of them indicated we
failed to free a whole sk:

  unreferenced object 0xffff8880126e0000 (size 1088):
    comm "syz-executor419", pid 326, jiffies 4294773607 (age 12.609s)
    hex dump (first 32 bytes):
      00 00 00 00 00 00 00 00 7d 00 00 00 00 00 00 00  ........}.......
      01 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
    backtrace:
      [<000000006fefe750>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:1970
      [<0000000074006db5>] sk_alloc+0x3b/0x800 net/core/sock.c:2029
      [<00000000728cd434>] unix_create1+0xaf/0x920 net/unix/af_unix.c:928
      [<00000000a279a139>] unix_create+0x113/0x1d0 net/unix/af_unix.c:997
      [<0000000068259812>] __sock_create+0x2ab/0x550 net/socket.c:1516
      [<00000000da1521e1>] sock_create net/socket.c:1566 [inline]
      [<00000000da1521e1>] __sys_socketpair+0x1a8/0x550 net/socket.c:1698
      [<000000007ab259e1>] __do_sys_socketpair net/socket.c:1751 [inline]
      [<000000007ab259e1>] __se_sys_socketpair net/socket.c:1748 [inline]
      [<000000007ab259e1>] __x64_sys_socketpair+0x97/0x100 net/socket.c:1748
      [<000000007dedddc1>] do_syscall_x64 arch/x86/entry/common.c:50 [inline]
      [<000000007dedddc1>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80
      [<000000009456679f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd

We can reproduce this issue by creating two AF_UNIX SOCK_STREAM sockets,
send()ing an OOB skb to each other, and close()ing them without consuming
the OOB skbs.

  int skpair[2];

  socketpair(AF_UNIX, SOCK_STREAM, 0, skpair);

  send(skpair[0], "x", 1, MSG_OOB);
  send(skpair[1], "x", 1, MSG_OOB);

  close(skpair[0]);
  close(skpair[1]);

Currently, we free an OOB skb in unix_sock_destructor() which is called via
__sk_free(), but it's too late because the receiver's unix_sk(sk)->oob_skb
is accounted against the sender's sk->sk_wmem_alloc and __sk_free() is
called only when sk->sk_wmem_alloc is 0.

In the repro sequences, we do not consume the OOB skb, so both two sk's
sock_put() never reach __sk_free() due to the positive sk->sk_wmem_alloc.
Then, no one can consume the OOB skb nor call __sk_free(), and we finally
leak the two whole sk.

Thus, we must free the unconsumed OOB skb earlier when close()ing the
socket.

Fixes: 314001f0bf92 ("af_unix: Add OOB support")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bf338b782fc4..d686804119c9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -569,12 +569,6 @@ static void unix_sock_destructor(struct sock *sk)
 
 	skb_queue_purge(&sk->sk_receive_queue);
 
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-	if (u->oob_skb) {
-		kfree_skb(u->oob_skb);
-		u->oob_skb = NULL;
-	}
-#endif
 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
@@ -620,6 +614,13 @@ static void unix_release_sock(struct sock *sk, int embrion)
 
 	unix_state_unlock(sk);
 
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (u->oob_skb) {
+		kfree_skb(u->oob_skb);
+		u->oob_skb = NULL;
+	}
+#endif
+
 	wake_up_interruptible_all(&u->peer_wait);
 
 	if (skpair != NULL) {
-- 
cgit v1.2.3


From 73ea735073599430818e89b8901452287a15a718 Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Thu, 29 Sep 2022 11:19:47 -0700
Subject: net: sparx5: Fix return type of sparx5_port_xmit_impl

The ndo_start_xmit field in net_device_ops is expected to be of type
netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev).

The mismatched return type breaks forward edge kCFI since the underlying
function definition does not match the function hook definition.

The return type of sparx5_port_xmit_impl should be changed from int to
netdev_tx_t.

Reported-by: Dan Carpenter <error27@gmail.com>
Link: https://github.com/ClangBuiltLinux/linux/issues/1703
Cc: llvm@lists.linux.dev
Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/sparx5/sparx5_main.h   | 2 +-
 drivers/net/ethernet/microchip/sparx5/sparx5_packet.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
index b197129044b5..c9a77a14d3d7 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_main.h
@@ -291,7 +291,7 @@ struct frame_info {
 void sparx5_xtr_flush(struct sparx5 *sparx5, u8 grp);
 void sparx5_ifh_parse(u32 *ifh, struct frame_info *info);
 irqreturn_t sparx5_xtr_handler(int irq, void *_priv);
-int sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev);
 int sparx5_manual_injection_mode(struct sparx5 *sparx5);
 void sparx5_port_inj_timer_setup(struct sparx5_port *port);
 
diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
index 21844beba72d..83c16ca5b30f 100644
--- a/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
+++ b/drivers/net/ethernet/microchip/sparx5/sparx5_packet.c
@@ -222,13 +222,13 @@ static int sparx5_inject(struct sparx5 *sparx5,
 	return NETDEV_TX_OK;
 }
 
-int sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev)
+netdev_tx_t sparx5_port_xmit_impl(struct sk_buff *skb, struct net_device *dev)
 {
 	struct net_device_stats *stats = &dev->stats;
 	struct sparx5_port *port = netdev_priv(dev);
 	struct sparx5 *sparx5 = port->sparx5;
 	u32 ifh[IFH_LEN];
-	int ret;
+	netdev_tx_t ret;
 
 	memset(ifh, 0, IFH_LEN * 4);
 	sparx5_set_port_ifh(ifh, port->portno);
-- 
cgit v1.2.3


From 1645f44dd5b846a473d7789fe622c278eb880d48 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 30 Sep 2022 16:20:59 +0200
Subject: net: phylink: add ability to validate a set of interface modes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rather than having the ability to validate all supported interface
modes or a single interface mode, introduce the ability to validate
a subset of supported modes.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
[ rebased on current net-next ]
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index d0af026c9afa..2cf388fad1be 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -637,8 +637,9 @@ static int phylink_validate_mac_and_pcs(struct phylink *pl,
 	return phylink_is_empty_linkmode(supported) ? -EINVAL : 0;
 }
 
-static int phylink_validate_any(struct phylink *pl, unsigned long *supported,
-				struct phylink_link_state *state)
+static int phylink_validate_mask(struct phylink *pl, unsigned long *supported,
+				 struct phylink_link_state *state,
+				 const unsigned long *interfaces)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(all_adv) = { 0, };
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(all_s) = { 0, };
@@ -647,7 +648,7 @@ static int phylink_validate_any(struct phylink *pl, unsigned long *supported,
 	int intf;
 
 	for (intf = 0; intf < PHY_INTERFACE_MODE_MAX; intf++) {
-		if (test_bit(intf, pl->config->supported_interfaces)) {
+		if (test_bit(intf, interfaces)) {
 			linkmode_copy(s, supported);
 
 			t = *state;
@@ -668,12 +669,14 @@ static int phylink_validate_any(struct phylink *pl, unsigned long *supported,
 static int phylink_validate(struct phylink *pl, unsigned long *supported,
 			    struct phylink_link_state *state)
 {
-	if (!phy_interface_empty(pl->config->supported_interfaces)) {
+	const unsigned long *interfaces = pl->config->supported_interfaces;
+
+	if (!phy_interface_empty(interfaces)) {
 		if (state->interface == PHY_INTERFACE_MODE_NA)
-			return phylink_validate_any(pl, supported, state);
+			return phylink_validate_mask(pl, supported, state,
+						     interfaces);
 
-		if (!test_bit(state->interface,
-			      pl->config->supported_interfaces))
+		if (!test_bit(state->interface, interfaces))
 			return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From fd580c9830316edad6f8b1d9f542563730658efe Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 30 Sep 2022 16:21:00 +0200
Subject: net: sfp: augment SFP parsing with phy_interface_t bitmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We currently parse the SFP EEPROM to a bitmap of ethtool link modes,
and then attempt to convert the link modes to a PHY interface mode.
While this works at present, there are cases where this is sub-optimal.
For example, where a module can operate with several different PHY
interface modes.

To start addressing this, arrange for the SFP EEPROM parsing to also
provide a bitmap of the possible PHY interface modes.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/at803x.c          |  3 +-
 drivers/net/phy/marvell-88x2222.c |  3 +-
 drivers/net/phy/marvell.c         |  3 +-
 drivers/net/phy/marvell10g.c      |  3 +-
 drivers/net/phy/phylink.c         |  4 ++-
 drivers/net/phy/sfp-bus.c         | 75 +++++++++++++++++++++++++++++----------
 drivers/net/phy/sfp.c             |  7 ++--
 drivers/net/phy/sfp.h             |  3 +-
 include/linux/sfp.h               |  5 +--
 9 files changed, 78 insertions(+), 28 deletions(-)

diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 11ebd59bf2eb..9e9adde335c8 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -676,6 +676,7 @@ static int at803x_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 	struct phy_device *phydev = upstream;
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(phy_support);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(sfp_support);
+	DECLARE_PHY_INTERFACE_MASK(interfaces);
 	phy_interface_t iface;
 
 	linkmode_zero(phy_support);
@@ -686,7 +687,7 @@ static int at803x_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 	phylink_set(phy_support, Asym_Pause);
 
 	linkmode_zero(sfp_support);
-	sfp_parse_support(phydev->sfp_bus, id, sfp_support);
+	sfp_parse_support(phydev->sfp_bus, id, sfp_support, interfaces);
 	/* Some modules support 10G modes as well as others we support.
 	 * Mask out non-supported modes so the correct interface is picked.
 	 */
diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c
index f070776ca904..fd9ad4820192 100644
--- a/drivers/net/phy/marvell-88x2222.c
+++ b/drivers/net/phy/marvell-88x2222.c
@@ -478,6 +478,7 @@ static int mv2222_config_init(struct phy_device *phydev)
 
 static int mv2222_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 {
+	DECLARE_PHY_INTERFACE_MASK(interfaces);
 	struct phy_device *phydev = upstream;
 	phy_interface_t sfp_interface;
 	struct mv2222_data *priv;
@@ -489,7 +490,7 @@ static int mv2222_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 	priv = (struct mv2222_data *)phydev->priv;
 	dev = &phydev->mdio.dev;
 
-	sfp_parse_support(phydev->sfp_bus, id, sfp_supported);
+	sfp_parse_support(phydev->sfp_bus, id, sfp_supported, interfaces);
 	phydev->port = sfp_parse_port(phydev->sfp_bus, id, sfp_supported);
 	sfp_interface = sfp_select_interface(phydev->sfp_bus, sfp_supported);
 
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index a3e810705ce2..2810f4f9da0c 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2845,6 +2845,7 @@ static int marvell_probe(struct phy_device *phydev)
 
 static int m88e1510_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 {
+	DECLARE_PHY_INTERFACE_MASK(interfaces);
 	struct phy_device *phydev = upstream;
 	phy_interface_t interface;
 	struct device *dev;
@@ -2856,7 +2857,7 @@ static int m88e1510_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 
 	dev = &phydev->mdio.dev;
 
-	sfp_parse_support(phydev->sfp_bus, id, supported);
+	sfp_parse_support(phydev->sfp_bus, id, supported, interfaces);
 	interface = sfp_select_interface(phydev->sfp_bus, supported);
 
 	dev_info(dev, "%s SFP module inserted\n", phy_modes(interface));
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 2b7d0720720b..05a5ed089965 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -466,9 +466,10 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 {
 	struct phy_device *phydev = upstream;
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(support) = { 0, };
+	DECLARE_PHY_INTERFACE_MASK(interfaces);
 	phy_interface_t iface;
 
-	sfp_parse_support(phydev->sfp_bus, id, support);
+	sfp_parse_support(phydev->sfp_bus, id, support, interfaces);
 	iface = sfp_select_interface(phydev->sfp_bus, support);
 
 	if (iface != PHY_INTERFACE_MODE_10GBASER) {
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 2cf388fad1be..b76bf8df83ff 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -77,6 +77,7 @@ struct phylink {
 
 	struct sfp_bus *sfp_bus;
 	bool sfp_may_have_phy;
+	DECLARE_PHY_INTERFACE_MASK(sfp_interfaces);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(sfp_support);
 	u8 sfp_port;
 };
@@ -2898,7 +2899,8 @@ static int phylink_sfp_module_insert(void *upstream,
 	ASSERT_RTNL();
 
 	linkmode_zero(support);
-	sfp_parse_support(pl->sfp_bus, id, support);
+	phy_interface_zero(pl->sfp_interfaces);
+	sfp_parse_support(pl->sfp_bus, id, support, pl->sfp_interfaces);
 	pl->sfp_port = sfp_parse_port(pl->sfp_bus, id, support);
 
 	/* If this module may have a PHY connecting later, defer until later */
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 0a9099c77694..29e3fa86bac3 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -139,12 +139,14 @@ EXPORT_SYMBOL_GPL(sfp_may_have_phy);
  * @bus: a pointer to the &struct sfp_bus structure for the sfp module
  * @id: a pointer to the module's &struct sfp_eeprom_id
  * @support: pointer to an array of unsigned long for the ethtool support mask
+ * @interfaces: pointer to an array of unsigned long for phy interface modes
+ *		mask
  *
  * Parse the EEPROM identification information and derive the supported
  * ethtool link modes for the module.
  */
 void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
-		       unsigned long *support)
+		       unsigned long *support, unsigned long *interfaces)
 {
 	unsigned int br_min, br_nom, br_max;
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = { 0, };
@@ -171,54 +173,81 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 	}
 
 	/* Set ethtool support from the compliance fields. */
-	if (id->base.e10g_base_sr)
+	if (id->base.e10g_base_sr) {
 		phylink_set(modes, 10000baseSR_Full);
-	if (id->base.e10g_base_lr)
+		__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
+	}
+	if (id->base.e10g_base_lr) {
 		phylink_set(modes, 10000baseLR_Full);
-	if (id->base.e10g_base_lrm)
+		__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
+	}
+	if (id->base.e10g_base_lrm) {
 		phylink_set(modes, 10000baseLRM_Full);
-	if (id->base.e10g_base_er)
+		__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
+	}
+	if (id->base.e10g_base_er) {
 		phylink_set(modes, 10000baseER_Full);
+		__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
+	}
 	if (id->base.e1000_base_sx ||
 	    id->base.e1000_base_lx ||
-	    id->base.e1000_base_cx)
+	    id->base.e1000_base_cx) {
 		phylink_set(modes, 1000baseX_Full);
+		__set_bit(PHY_INTERFACE_MODE_1000BASEX, interfaces);
+	}
 	if (id->base.e1000_base_t) {
 		phylink_set(modes, 1000baseT_Half);
 		phylink_set(modes, 1000baseT_Full);
+		__set_bit(PHY_INTERFACE_MODE_1000BASEX, interfaces);
+		__set_bit(PHY_INTERFACE_MODE_SGMII, interfaces);
 	}
 
 	/* 1000Base-PX or 1000Base-BX10 */
 	if ((id->base.e_base_px || id->base.e_base_bx10) &&
-	    br_min <= 1300 && br_max >= 1200)
+	    br_min <= 1300 && br_max >= 1200) {
 		phylink_set(modes, 1000baseX_Full);
+		__set_bit(PHY_INTERFACE_MODE_1000BASEX, interfaces);
+	}
 
 	/* 100Base-FX, 100Base-LX, 100Base-PX, 100Base-BX10 */
-	if (id->base.e100_base_fx || id->base.e100_base_lx)
+	if (id->base.e100_base_fx || id->base.e100_base_lx) {
 		phylink_set(modes, 100baseFX_Full);
-	if ((id->base.e_base_px || id->base.e_base_bx10) && br_nom == 100)
+		__set_bit(PHY_INTERFACE_MODE_100BASEX, interfaces);
+	}
+	if ((id->base.e_base_px || id->base.e_base_bx10) && br_nom == 100) {
 		phylink_set(modes, 100baseFX_Full);
+		__set_bit(PHY_INTERFACE_MODE_100BASEX, interfaces);
+	}
 
 	/* For active or passive cables, select the link modes
 	 * based on the bit rates and the cable compliance bytes.
 	 */
 	if ((id->base.sfp_ct_passive || id->base.sfp_ct_active) && br_nom) {
 		/* This may look odd, but some manufacturers use 12000MBd */
-		if (br_min <= 12000 && br_max >= 10300)
+		if (br_min <= 12000 && br_max >= 10300) {
 			phylink_set(modes, 10000baseCR_Full);
-		if (br_min <= 3200 && br_max >= 3100)
+			__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
+		}
+		if (br_min <= 3200 && br_max >= 3100) {
 			phylink_set(modes, 2500baseX_Full);
-		if (br_min <= 1300 && br_max >= 1200)
+			__set_bit(PHY_INTERFACE_MODE_2500BASEX, interfaces);
+		}
+		if (br_min <= 1300 && br_max >= 1200) {
 			phylink_set(modes, 1000baseX_Full);
+			__set_bit(PHY_INTERFACE_MODE_1000BASEX, interfaces);
+		}
 	}
 	if (id->base.sfp_ct_passive) {
-		if (id->base.passive.sff8431_app_e)
+		if (id->base.passive.sff8431_app_e) {
 			phylink_set(modes, 10000baseCR_Full);
+			__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
+		}
 	}
 	if (id->base.sfp_ct_active) {
 		if (id->base.active.sff8431_app_e ||
 		    id->base.active.sff8431_lim) {
 			phylink_set(modes, 10000baseCR_Full);
+			__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
 		}
 	}
 
@@ -243,12 +272,14 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 	case SFF8024_ECC_10GBASE_T_SFI:
 	case SFF8024_ECC_10GBASE_T_SR:
 		phylink_set(modes, 10000baseT_Full);
+		__set_bit(PHY_INTERFACE_MODE_10GBASER, interfaces);
 		break;
 	case SFF8024_ECC_5GBASE_T:
 		phylink_set(modes, 5000baseT_Full);
 		break;
 	case SFF8024_ECC_2_5GBASE_T:
 		phylink_set(modes, 2500baseT_Full);
+		__set_bit(PHY_INTERFACE_MODE_2500BASEX, interfaces);
 		break;
 	default:
 		dev_warn(bus->sfp_dev,
@@ -261,10 +292,14 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 	if (id->base.fc_speed_100 ||
 	    id->base.fc_speed_200 ||
 	    id->base.fc_speed_400) {
-		if (id->base.br_nominal >= 31)
+		if (id->base.br_nominal >= 31) {
 			phylink_set(modes, 2500baseX_Full);
-		if (id->base.br_nominal >= 12)
+			__set_bit(PHY_INTERFACE_MODE_2500BASEX, interfaces);
+		}
+		if (id->base.br_nominal >= 12) {
 			phylink_set(modes, 1000baseX_Full);
+			__set_bit(PHY_INTERFACE_MODE_1000BASEX, interfaces);
+		}
 	}
 
 	/* If we haven't discovered any modes that this module supports, try
@@ -277,14 +312,18 @@ void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 	 * 2500BASE-X, so we allow some slack here.
 	 */
 	if (bitmap_empty(modes, __ETHTOOL_LINK_MODE_MASK_NBITS) && br_nom) {
-		if (br_min <= 1300 && br_max >= 1200)
+		if (br_min <= 1300 && br_max >= 1200) {
 			phylink_set(modes, 1000baseX_Full);
-		if (br_min <= 3200 && br_max >= 2500)
+			__set_bit(PHY_INTERFACE_MODE_1000BASEX, interfaces);
+		}
+		if (br_min <= 3200 && br_max >= 2500) {
 			phylink_set(modes, 2500baseX_Full);
+			__set_bit(PHY_INTERFACE_MODE_2500BASEX, interfaces);
+		}
 	}
 
 	if (bus->sfp_quirk && bus->sfp_quirk->modes)
-		bus->sfp_quirk->modes(id, modes);
+		bus->sfp_quirk->modes(id, modes, interfaces);
 
 	linkmode_or(support, support, modes);
 
diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index cb1dbd0d9701..b150e4765819 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -331,13 +331,16 @@ static void sfp_fixup_halny_gsfp(struct sfp *sfp)
 }
 
 static void sfp_quirk_2500basex(const struct sfp_eeprom_id *id,
-				unsigned long *modes)
+				unsigned long *modes,
+				unsigned long *interfaces)
 {
 	linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, modes);
+	__set_bit(PHY_INTERFACE_MODE_2500BASEX, interfaces);
 }
 
 static void sfp_quirk_ubnt_uf_instant(const struct sfp_eeprom_id *id,
-				      unsigned long *modes)
+				      unsigned long *modes,
+				      unsigned long *interfaces)
 {
 	/* Ubiquiti U-Fiber Instant module claims that support all transceiver
 	 * types including 10G Ethernet which is not truth. So clear all claimed
diff --git a/drivers/net/phy/sfp.h b/drivers/net/phy/sfp.h
index 7ad06deae76c..6cf1643214d3 100644
--- a/drivers/net/phy/sfp.h
+++ b/drivers/net/phy/sfp.h
@@ -9,7 +9,8 @@ struct sfp;
 struct sfp_quirk {
 	const char *vendor;
 	const char *part;
-	void (*modes)(const struct sfp_eeprom_id *id, unsigned long *modes);
+	void (*modes)(const struct sfp_eeprom_id *id, unsigned long *modes,
+		      unsigned long *interfaces);
 	void (*fixup)(struct sfp *sfp);
 };
 
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 302094b855fb..d1f343853b6c 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -535,7 +535,7 @@ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 		   unsigned long *support);
 bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id);
 void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
-		       unsigned long *support);
+		       unsigned long *support, unsigned long *interfaces);
 phy_interface_t sfp_select_interface(struct sfp_bus *bus,
 				     unsigned long *link_modes);
 
@@ -568,7 +568,8 @@ static inline bool sfp_may_have_phy(struct sfp_bus *bus,
 
 static inline void sfp_parse_support(struct sfp_bus *bus,
 				     const struct sfp_eeprom_id *id,
-				     unsigned long *support)
+				     unsigned long *support,
+				     unsigned long *interfaces)
 {
 }
 
-- 
cgit v1.2.3


From f81fa96d8a6c7a7723b7cfa2ef8f6e514843d577 Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 30 Sep 2022 16:21:01 +0200
Subject: net: phylink: use phy_interface_t bitmaps for optical modules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Where a MAC provides a phy_interface_t bitmap, use these bitmaps to
select the operating interface mode for optical SFP modules, rather
than using the linkmode bitmaps.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 164 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 134 insertions(+), 30 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index b76bf8df83ff..ab32ef767d69 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -2803,6 +2803,70 @@ static void phylink_sfp_detach(void *upstream, struct sfp_bus *bus)
 	pl->netdev->sfp_bus = NULL;
 }
 
+static const phy_interface_t phylink_sfp_interface_preference[] = {
+	PHY_INTERFACE_MODE_25GBASER,
+	PHY_INTERFACE_MODE_USXGMII,
+	PHY_INTERFACE_MODE_10GBASER,
+	PHY_INTERFACE_MODE_5GBASER,
+	PHY_INTERFACE_MODE_2500BASEX,
+	PHY_INTERFACE_MODE_SGMII,
+	PHY_INTERFACE_MODE_1000BASEX,
+	PHY_INTERFACE_MODE_100BASEX,
+};
+
+static phy_interface_t phylink_choose_sfp_interface(struct phylink *pl,
+						    const unsigned long *intf)
+{
+	phy_interface_t interface;
+	size_t i;
+
+	interface = PHY_INTERFACE_MODE_NA;
+	for (i = 0; i < ARRAY_SIZE(phylink_sfp_interface_preference); i++)
+		if (test_bit(phylink_sfp_interface_preference[i], intf)) {
+			interface = phylink_sfp_interface_preference[i];
+			break;
+		}
+
+	return interface;
+}
+
+static void phylink_sfp_set_config(struct phylink *pl, u8 mode,
+				   unsigned long *supported,
+				   struct phylink_link_state *state)
+{
+	bool changed = false;
+
+	phylink_dbg(pl, "requesting link mode %s/%s with support %*pb\n",
+		    phylink_an_mode_str(mode), phy_modes(state->interface),
+		    __ETHTOOL_LINK_MODE_MASK_NBITS, supported);
+
+	if (!linkmode_equal(pl->supported, supported)) {
+		linkmode_copy(pl->supported, supported);
+		changed = true;
+	}
+
+	if (!linkmode_equal(pl->link_config.advertising, state->advertising)) {
+		linkmode_copy(pl->link_config.advertising, state->advertising);
+		changed = true;
+	}
+
+	if (pl->cur_link_an_mode != mode ||
+	    pl->link_config.interface != state->interface) {
+		pl->cur_link_an_mode = mode;
+		pl->link_config.interface = state->interface;
+
+		changed = true;
+
+		phylink_info(pl, "switched to %s/%s link mode\n",
+			     phylink_an_mode_str(mode),
+			     phy_modes(state->interface));
+	}
+
+	if (changed && !test_bit(PHYLINK_DISABLE_STOPPED,
+				 &pl->phylink_disable_state))
+		phylink_mac_initial_config(pl, false);
+}
+
 static int phylink_sfp_config(struct phylink *pl, u8 mode,
 			      const unsigned long *supported,
 			      const unsigned long *advertising)
@@ -2811,7 +2875,6 @@ static int phylink_sfp_config(struct phylink *pl, u8 mode,
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(support);
 	struct phylink_link_state config;
 	phy_interface_t iface;
-	bool changed;
 	int ret;
 
 	linkmode_copy(support, supported);
@@ -2854,61 +2917,103 @@ static int phylink_sfp_config(struct phylink *pl, u8 mode,
 		return ret;
 	}
 
-	phylink_dbg(pl, "requesting link mode %s/%s with support %*pb\n",
-		    phylink_an_mode_str(mode), phy_modes(config.interface),
-		    __ETHTOOL_LINK_MODE_MASK_NBITS, support);
-
 	if (phy_interface_mode_is_8023z(iface) && pl->phydev)
 		return -EINVAL;
 
-	changed = !linkmode_equal(pl->supported, support) ||
-		  !linkmode_equal(pl->link_config.advertising,
-				  config.advertising);
-	if (changed) {
-		linkmode_copy(pl->supported, support);
-		linkmode_copy(pl->link_config.advertising, config.advertising);
+	pl->link_port = pl->sfp_port;
+
+	phylink_sfp_set_config(pl, mode, support, &config);
+
+	return 0;
+}
+
+static int phylink_sfp_config_optical(struct phylink *pl)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(support);
+	DECLARE_PHY_INTERFACE_MASK(interfaces);
+	struct phylink_link_state config;
+	phy_interface_t interface;
+	int ret;
+
+	phylink_dbg(pl, "optical SFP: interfaces=[mac=%*pbl, sfp=%*pbl]\n",
+		    (int)PHY_INTERFACE_MODE_MAX,
+		    pl->config->supported_interfaces,
+		    (int)PHY_INTERFACE_MODE_MAX,
+		    pl->sfp_interfaces);
+
+	/* Find the union of the supported interfaces by the PCS/MAC and
+	 * the SFP module.
+	 */
+	phy_interface_and(interfaces, pl->config->supported_interfaces,
+			  pl->sfp_interfaces);
+	if (phy_interface_empty(interfaces)) {
+		phylink_err(pl, "unsupported SFP module: no common interface modes\n");
+		return -EINVAL;
 	}
 
-	if (pl->cur_link_an_mode != mode ||
-	    pl->link_config.interface != config.interface) {
-		pl->link_config.interface = config.interface;
-		pl->cur_link_an_mode = mode;
+	memset(&config, 0, sizeof(config));
+	linkmode_copy(support, pl->sfp_support);
+	linkmode_copy(config.advertising, pl->sfp_support);
+	config.speed = SPEED_UNKNOWN;
+	config.duplex = DUPLEX_UNKNOWN;
+	config.pause = MLO_PAUSE_AN;
+	config.an_enabled = true;
 
-		changed = true;
+	/* For all the interfaces that are supported, reduce the sfp_support
+	 * mask to only those link modes that can be supported.
+	 */
+	ret = phylink_validate_mask(pl, pl->sfp_support, &config, interfaces);
+	if (ret) {
+		phylink_err(pl, "unsupported SFP module: validation with support %*pb failed\n",
+			    __ETHTOOL_LINK_MODE_MASK_NBITS, support);
+		return ret;
+	}
 
-		phylink_info(pl, "switched to %s/%s link mode\n",
-			     phylink_an_mode_str(mode),
-			     phy_modes(config.interface));
+	interface = phylink_choose_sfp_interface(pl, interfaces);
+	if (interface == PHY_INTERFACE_MODE_NA) {
+		phylink_err(pl, "failed to select SFP interface\n");
+		return -EINVAL;
+	}
+
+	phylink_dbg(pl, "optical SFP: chosen %s interface\n",
+		    phy_modes(interface));
+
+	config.interface = interface;
+
+	/* Ignore errors if we're expecting a PHY to attach later */
+	ret = phylink_validate(pl, support, &config);
+	if (ret) {
+		phylink_err(pl, "validation with support %*pb failed: %pe\n",
+			    __ETHTOOL_LINK_MODE_MASK_NBITS, support,
+			    ERR_PTR(ret));
+		return ret;
 	}
 
 	pl->link_port = pl->sfp_port;
 
-	if (changed && !test_bit(PHYLINK_DISABLE_STOPPED,
-				 &pl->phylink_disable_state))
-		phylink_mac_initial_config(pl, false);
+	phylink_sfp_set_config(pl, MLO_AN_INBAND, pl->sfp_support, &config);
 
-	return ret;
+	return 0;
 }
 
 static int phylink_sfp_module_insert(void *upstream,
 				     const struct sfp_eeprom_id *id)
 {
 	struct phylink *pl = upstream;
-	unsigned long *support = pl->sfp_support;
 
 	ASSERT_RTNL();
 
-	linkmode_zero(support);
+	linkmode_zero(pl->sfp_support);
 	phy_interface_zero(pl->sfp_interfaces);
-	sfp_parse_support(pl->sfp_bus, id, support, pl->sfp_interfaces);
-	pl->sfp_port = sfp_parse_port(pl->sfp_bus, id, support);
+	sfp_parse_support(pl->sfp_bus, id, pl->sfp_support, pl->sfp_interfaces);
+	pl->sfp_port = sfp_parse_port(pl->sfp_bus, id, pl->sfp_support);
 
 	/* If this module may have a PHY connecting later, defer until later */
 	pl->sfp_may_have_phy = sfp_may_have_phy(pl->sfp_bus, id);
 	if (pl->sfp_may_have_phy)
 		return 0;
 
-	return phylink_sfp_config(pl, MLO_AN_INBAND, support, support);
+	return phylink_sfp_config_optical(pl);
 }
 
 static int phylink_sfp_module_start(void *upstream)
@@ -2927,8 +3032,7 @@ static int phylink_sfp_module_start(void *upstream)
 	if (!pl->sfp_may_have_phy)
 		return 0;
 
-	return phylink_sfp_config(pl, MLO_AN_INBAND,
-				  pl->sfp_support, pl->sfp_support);
+	return phylink_sfp_config_optical(pl);
 }
 
 static void phylink_sfp_module_stop(void *upstream)
-- 
cgit v1.2.3


From e60846370ca96a042d0e203782b84ed9558a8546 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Fri, 30 Sep 2022 16:21:02 +0200
Subject: net: phylink: rename phylink_sfp_config()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

phylink_sfp_config() now only deals with configuring the MAC for a
SFP containing a PHY. Rename it to be specific.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index ab32ef767d69..f6e9231f0cbe 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -2867,9 +2867,8 @@ static void phylink_sfp_set_config(struct phylink *pl, u8 mode,
 		phylink_mac_initial_config(pl, false);
 }
 
-static int phylink_sfp_config(struct phylink *pl, u8 mode,
-			      const unsigned long *supported,
-			      const unsigned long *advertising)
+static int phylink_sfp_config_phy(struct phylink *pl, u8 mode,
+				  struct phy_device *phy)
 {
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(support1);
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(support);
@@ -2877,10 +2876,10 @@ static int phylink_sfp_config(struct phylink *pl, u8 mode,
 	phy_interface_t iface;
 	int ret;
 
-	linkmode_copy(support, supported);
+	linkmode_copy(support, phy->supported);
 
 	memset(&config, 0, sizeof(config));
-	linkmode_copy(config.advertising, advertising);
+	linkmode_copy(config.advertising, phy->advertising);
 	config.interface = PHY_INTERFACE_MODE_NA;
 	config.speed = SPEED_UNKNOWN;
 	config.duplex = DUPLEX_UNKNOWN;
@@ -3093,7 +3092,7 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy)
 		mode = MLO_AN_INBAND;
 
 	/* Do the initial configuration */
-	ret = phylink_sfp_config(pl, mode, phy->supported, phy->advertising);
+	ret = phylink_sfp_config_phy(pl, mode, phy);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.2.3


From eca68a3c7d05b38b4e728cead0c49718f2bc1d5a Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:03 +0200
Subject: net: phylink: pass supported host PHY interface modes to phylib for
 SFP's PHYs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pass the supported PHY interface types to phylib if the PHY we are
connecting is inside a SFP, so that the PHY driver can select an
appropriate host configuration mode for their interface according to
the host capabilities.

For example the Marvell 88X3310 PHY inside RollBall SFP modules
defaults to 10gbase-r mode on host's side, and the marvell10g
driver currently does not change this setting. But a host may not
support 10gbase-r. For example Turris Omnia only supports sgmii,
1000base-x and 2500base-x modes. The PHY can be configured to use
those modes, but in order for the PHY driver to do that, it needs
to know which modes are supported.

Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 17 +++++++++++++++++
 include/linux/phy.h       |  4 ++++
 2 files changed, 21 insertions(+)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index f6e9231f0cbe..9ff8eb516666 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -2814,6 +2814,8 @@ static const phy_interface_t phylink_sfp_interface_preference[] = {
 	PHY_INTERFACE_MODE_100BASEX,
 };
 
+static DECLARE_PHY_INTERFACE_MASK(phylink_sfp_interfaces);
+
 static phy_interface_t phylink_choose_sfp_interface(struct phylink *pl,
 						    const unsigned long *intf)
 {
@@ -3091,6 +3093,10 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy)
 	else
 		mode = MLO_AN_INBAND;
 
+	/* Set the PHY's host supported interfaces */
+	phy_interface_and(phy->host_interfaces, phylink_sfp_interfaces,
+			  pl->config->supported_interfaces);
+
 	/* Do the initial configuration */
 	ret = phylink_sfp_config_phy(pl, mode, phy);
 	if (ret < 0)
@@ -3444,4 +3450,15 @@ void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs,
 }
 EXPORT_SYMBOL_GPL(phylink_mii_c45_pcs_get_state);
 
+static int __init phylink_init(void)
+{
+	for (int i = 0; i < ARRAY_SIZE(phylink_sfp_interface_preference); ++i)
+		__set_bit(phylink_sfp_interface_preference[i],
+			  phylink_sfp_interfaces);
+
+	return 0;
+}
+
+module_init(phylink_init);
+
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 9c66f357f489..d65fc76fe0ae 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -571,6 +571,7 @@ struct macsec_ops;
  * @advertising: Currently advertised linkmodes
  * @adv_old: Saved advertised while power saving for WoL
  * @lp_advertising: Current link partner advertised linkmodes
+ * @host_interfaces: PHY interface modes supported by host
  * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited
  * @autoneg: Flag autoneg being used
  * @rate_matching: Current rate matching mode
@@ -670,6 +671,9 @@ struct phy_device {
 	/* used with phy_speed_down */
 	__ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old);
 
+	/* Host supported PHY interface types. Should be ignored if empty. */
+	DECLARE_PHY_INTERFACE_MASK(host_interfaces);
+
 	/* Energy efficient ethernet modes which should be prohibited */
 	u32 eee_broken_modes;
 
-- 
cgit v1.2.3


From 3891569b2fc378e7fb882f5dbdc001ee8f78f024 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:04 +0200
Subject: net: phy: marvell10g: Use tabs instead of spaces for indentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some register definitions were defined with spaces used for indentation.
Change them to tabs.

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 05a5ed089965..06d0fe4b76c3 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -117,16 +117,16 @@ enum {
 	MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER_NO_SGMII_AN	= 0x5,
 	MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER_RATE_MATCH	= 0x6,
 	MV_V2_33X0_PORT_CTRL_MACTYPE_USXGMII			= 0x7,
-	MV_V2_PORT_INTR_STS     = 0xf040,
-	MV_V2_PORT_INTR_MASK    = 0xf043,
-	MV_V2_PORT_INTR_STS_WOL_EN      = BIT(8),
-	MV_V2_MAGIC_PKT_WORD0   = 0xf06b,
-	MV_V2_MAGIC_PKT_WORD1   = 0xf06c,
-	MV_V2_MAGIC_PKT_WORD2   = 0xf06d,
+	MV_V2_PORT_INTR_STS		= 0xf040,
+	MV_V2_PORT_INTR_MASK		= 0xf043,
+	MV_V2_PORT_INTR_STS_WOL_EN	= BIT(8),
+	MV_V2_MAGIC_PKT_WORD0		= 0xf06b,
+	MV_V2_MAGIC_PKT_WORD1		= 0xf06c,
+	MV_V2_MAGIC_PKT_WORD2		= 0xf06d,
 	/* Wake on LAN registers */
-	MV_V2_WOL_CTRL          = 0xf06e,
-	MV_V2_WOL_CTRL_CLEAR_STS        = BIT(15),
-	MV_V2_WOL_CTRL_MAGIC_PKT_EN     = BIT(0),
+	MV_V2_WOL_CTRL			= 0xf06e,
+	MV_V2_WOL_CTRL_CLEAR_STS	= BIT(15),
+	MV_V2_WOL_CTRL_MAGIC_PKT_EN	= BIT(0),
 	/* Temperature control/read registers (88X3310 only) */
 	MV_V2_TEMP_CTRL		= 0xf08a,
 	MV_V2_TEMP_CTRL_MASK	= 0xc000,
-- 
cgit v1.2.3


From d6d29292640d3f778a28a74c53ae1733c023392f Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Fri, 30 Sep 2022 16:21:05 +0200
Subject: net: phy: marvell10g: select host interface configuration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Select the host interface configuration according to the capabilities of
the host if the host provided them. This is currently provided only when
connecting PHY that is inside a SFP.

The PHY supports several configurations of host communication:
- always communicate with host in 10gbase-r, even if copper speed is
  lower (rate matching mode),
- the same as above but use xaui/rxaui instead of 10gbase-r,
- switch host SerDes mode between 10gbase-r, 5gbase-r, 2500base-x and
  sgmii according to copper speed,
- the same as above but use xaui/rxaui instead of 10gbase-r.

This mode of host communication, called MACTYPE, is by default selected
by strapping pins, but it can be changed in software.

This adds support for selecting this mode according to which modes are
supported by the host.

This allows the kernel to:
- support SFP modules with 88X33X0 or 88E21X0 inside them

Note: we use mv3310_select_mactype() for both 88X3310 and 88X3340,
although 88X3340 does not support XAUI. This is not a problem because
88X3340 does not declare XAUI in it's supported_interfaces, and so this
function will never choose that MACTYPE.

Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
[ rebase, updated, also added support for 88E21X0 ]
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 112 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 06d0fe4b76c3..383a9c9f36e5 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -96,6 +96,11 @@ enum {
 	MV_PCS_PORT_INFO_NPORTS_MASK	= 0x0380,
 	MV_PCS_PORT_INFO_NPORTS_SHIFT	= 7,
 
+	/* SerDes reinitialization 88E21X0 */
+	MV_AN_21X0_SERDES_CTRL2	= 0x800f,
+	MV_AN_21X0_SERDES_CTRL2_AUTO_INIT_DIS	= BIT(13),
+	MV_AN_21X0_SERDES_CTRL2_RUN_INIT	= BIT(15),
+
 	/* These registers appear at 0x800X and 0xa00X - the 0xa00X control
 	 * registers appear to set themselves to the 0x800X when AN is
 	 * restarted, but status registers appear readable from either.
@@ -140,6 +145,8 @@ struct mv3310_chip {
 	bool (*has_downshift)(struct phy_device *phydev);
 	void (*init_supported_interfaces)(unsigned long *mask);
 	int (*get_mactype)(struct phy_device *phydev);
+	int (*set_mactype)(struct phy_device *phydev, int mactype);
+	int (*select_mactype)(unsigned long *interfaces);
 	int (*init_interface)(struct phy_device *phydev, int mactype);
 
 #ifdef CONFIG_HWMON
@@ -594,6 +601,49 @@ static int mv2110_get_mactype(struct phy_device *phydev)
 	return mactype & MV_PMA_21X0_PORT_CTRL_MACTYPE_MASK;
 }
 
+static int mv2110_set_mactype(struct phy_device *phydev, int mactype)
+{
+	int err, val;
+
+	mactype &= MV_PMA_21X0_PORT_CTRL_MACTYPE_MASK;
+	err = phy_modify_mmd(phydev, MDIO_MMD_PMAPMD, MV_PMA_21X0_PORT_CTRL,
+			     MV_PMA_21X0_PORT_CTRL_SWRST |
+			     MV_PMA_21X0_PORT_CTRL_MACTYPE_MASK,
+			     MV_PMA_21X0_PORT_CTRL_SWRST | mactype);
+	if (err)
+		return err;
+
+	err = phy_set_bits_mmd(phydev, MDIO_MMD_AN, MV_AN_21X0_SERDES_CTRL2,
+			       MV_AN_21X0_SERDES_CTRL2_AUTO_INIT_DIS |
+			       MV_AN_21X0_SERDES_CTRL2_RUN_INIT);
+	if (err)
+		return err;
+
+	err = phy_read_mmd_poll_timeout(phydev, MDIO_MMD_AN,
+					MV_AN_21X0_SERDES_CTRL2, val,
+					!(val &
+					  MV_AN_21X0_SERDES_CTRL2_RUN_INIT),
+					5000, 100000, true);
+	if (err)
+		return err;
+
+	return phy_clear_bits_mmd(phydev, MDIO_MMD_AN, MV_AN_21X0_SERDES_CTRL2,
+				  MV_AN_21X0_SERDES_CTRL2_AUTO_INIT_DIS);
+}
+
+static int mv2110_select_mactype(unsigned long *interfaces)
+{
+	if (test_bit(PHY_INTERFACE_MODE_USXGMII, interfaces))
+		return MV_PMA_21X0_PORT_CTRL_MACTYPE_USXGMII;
+	else if (test_bit(PHY_INTERFACE_MODE_SGMII, interfaces) &&
+		 !test_bit(PHY_INTERFACE_MODE_10GBASER, interfaces))
+		return MV_PMA_21X0_PORT_CTRL_MACTYPE_5GBASER;
+	else if (test_bit(PHY_INTERFACE_MODE_10GBASER, interfaces))
+		return MV_PMA_21X0_PORT_CTRL_MACTYPE_10GBASER_RATE_MATCH;
+	else
+		return -1;
+}
+
 static int mv3310_get_mactype(struct phy_device *phydev)
 {
 	int mactype;
@@ -605,6 +655,46 @@ static int mv3310_get_mactype(struct phy_device *phydev)
 	return mactype & MV_V2_33X0_PORT_CTRL_MACTYPE_MASK;
 }
 
+static int mv3310_set_mactype(struct phy_device *phydev, int mactype)
+{
+	int ret;
+
+	mactype &= MV_V2_33X0_PORT_CTRL_MACTYPE_MASK;
+	ret = phy_modify_mmd_changed(phydev, MDIO_MMD_VEND2, MV_V2_PORT_CTRL,
+				     MV_V2_33X0_PORT_CTRL_MACTYPE_MASK,
+				     mactype);
+	if (ret <= 0)
+		return ret;
+
+	return phy_set_bits_mmd(phydev, MDIO_MMD_VEND2, MV_V2_PORT_CTRL,
+				MV_V2_33X0_PORT_CTRL_SWRST);
+}
+
+static int mv3310_select_mactype(unsigned long *interfaces)
+{
+	if (test_bit(PHY_INTERFACE_MODE_USXGMII, interfaces))
+		return MV_V2_33X0_PORT_CTRL_MACTYPE_USXGMII;
+	else if (test_bit(PHY_INTERFACE_MODE_SGMII, interfaces) &&
+		 test_bit(PHY_INTERFACE_MODE_10GBASER, interfaces))
+		return MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER;
+	else if (test_bit(PHY_INTERFACE_MODE_SGMII, interfaces) &&
+		 test_bit(PHY_INTERFACE_MODE_RXAUI, interfaces))
+		return MV_V2_33X0_PORT_CTRL_MACTYPE_RXAUI;
+	else if (test_bit(PHY_INTERFACE_MODE_SGMII, interfaces) &&
+		 test_bit(PHY_INTERFACE_MODE_XAUI, interfaces))
+		return MV_V2_3310_PORT_CTRL_MACTYPE_XAUI;
+	else if (test_bit(PHY_INTERFACE_MODE_10GBASER, interfaces))
+		return MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER_RATE_MATCH;
+	else if (test_bit(PHY_INTERFACE_MODE_RXAUI, interfaces))
+		return MV_V2_33X0_PORT_CTRL_MACTYPE_RXAUI_RATE_MATCH;
+	else if (test_bit(PHY_INTERFACE_MODE_XAUI, interfaces))
+		return MV_V2_3310_PORT_CTRL_MACTYPE_XAUI_RATE_MATCH;
+	else if (test_bit(PHY_INTERFACE_MODE_SGMII, interfaces))
+		return MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER;
+	else
+		return -1;
+}
+
 static int mv2110_init_interface(struct phy_device *phydev, int mactype)
 {
 	struct mv3310_priv *priv = dev_get_drvdata(&phydev->mdio.dev);
@@ -688,6 +778,20 @@ static int mv3310_config_init(struct phy_device *phydev)
 	if (err)
 		return err;
 
+	/* If host provided host supported interface modes, try to select the
+	 * best one
+	 */
+	if (!phy_interface_empty(phydev->host_interfaces)) {
+		mactype = chip->select_mactype(phydev->host_interfaces);
+		if (mactype >= 0) {
+			phydev_info(phydev, "Changing MACTYPE to %i\n",
+				    mactype);
+			err = chip->set_mactype(phydev, mactype);
+			if (err)
+				return err;
+		}
+	}
+
 	mactype = chip->get_mactype(phydev);
 	if (mactype < 0)
 		return mactype;
@@ -1050,6 +1154,8 @@ static const struct mv3310_chip mv3310_type = {
 	.has_downshift = mv3310_has_downshift,
 	.init_supported_interfaces = mv3310_init_supported_interfaces,
 	.get_mactype = mv3310_get_mactype,
+	.set_mactype = mv3310_set_mactype,
+	.select_mactype = mv3310_select_mactype,
 	.init_interface = mv3310_init_interface,
 
 #ifdef CONFIG_HWMON
@@ -1061,6 +1167,8 @@ static const struct mv3310_chip mv3340_type = {
 	.has_downshift = mv3310_has_downshift,
 	.init_supported_interfaces = mv3340_init_supported_interfaces,
 	.get_mactype = mv3310_get_mactype,
+	.set_mactype = mv3310_set_mactype,
+	.select_mactype = mv3310_select_mactype,
 	.init_interface = mv3340_init_interface,
 
 #ifdef CONFIG_HWMON
@@ -1071,6 +1179,8 @@ static const struct mv3310_chip mv3340_type = {
 static const struct mv3310_chip mv2110_type = {
 	.init_supported_interfaces = mv2110_init_supported_interfaces,
 	.get_mactype = mv2110_get_mactype,
+	.set_mactype = mv2110_set_mactype,
+	.select_mactype = mv2110_select_mactype,
 	.init_interface = mv2110_init_interface,
 
 #ifdef CONFIG_HWMON
@@ -1081,6 +1191,8 @@ static const struct mv3310_chip mv2110_type = {
 static const struct mv3310_chip mv2111_type = {
 	.init_supported_interfaces = mv2111_init_supported_interfaces,
 	.get_mactype = mv2110_get_mactype,
+	.set_mactype = mv2110_set_mactype,
+	.select_mactype = mv2110_select_mactype,
 	.init_interface = mv2110_init_interface,
 
 #ifdef CONFIG_HWMON
-- 
cgit v1.2.3


From 31eb8907aa5b9e9be1a63f2ac574973715172ab4 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:06 +0200
Subject: net: phylink: allow attaching phy for SFP modules on 802.3z mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some SFPs may contain an internal PHY which may in some cases want to
connect with the host interface in 1000base-x/2500base-x mode.
Do not fail if such PHY is being attached in one of these PHY interface
modes.

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
Reviewed-by: Pali Rohár <pali@kernel.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 9ff8eb516666..75464df191ef 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1669,7 +1669,7 @@ static int phylink_attach_phy(struct phylink *pl, struct phy_device *phy,
 {
 	if (WARN_ON(pl->cfg_link_an_mode == MLO_AN_FIXED ||
 		    (pl->cfg_link_an_mode == MLO_AN_INBAND &&
-		     phy_interface_mode_is_8023z(interface))))
+		     phy_interface_mode_is_8023z(interface) && !pl->sfp_bus)))
 		return -EINVAL;
 
 	if (pl->phydev)
@@ -2918,9 +2918,6 @@ static int phylink_sfp_config_phy(struct phylink *pl, u8 mode,
 		return ret;
 	}
 
-	if (phy_interface_mode_is_8023z(iface) && pl->phydev)
-		return -EINVAL;
-
 	pl->link_port = pl->sfp_port;
 
 	phylink_sfp_set_config(pl, mode, support, &config);
-- 
cgit v1.2.3


From 13c8adcf221f1ff407115d3269e0fb57e8cecf82 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:07 +0200
Subject: net: sfp: Add and use macros for SFP quirks definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add macros SFP_QUIRK(), SFP_QUIRK_M() and SFP_QUIRK_F() for defining SFP
quirk table entries. Use them to deduplicate the code a little bit.

Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp.c | 61 ++++++++++++++++++++++-----------------------------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index b150e4765819..3201e2726e61 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -350,42 +350,33 @@ static void sfp_quirk_ubnt_uf_instant(const struct sfp_eeprom_id *id,
 	linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, modes);
 }
 
+#define SFP_QUIRK(_v, _p, _m, _f) \
+	{ .vendor = _v, .part = _p, .modes = _m, .fixup = _f, }
+#define SFP_QUIRK_M(_v, _p, _m) SFP_QUIRK(_v, _p, _m, NULL)
+#define SFP_QUIRK_F(_v, _p, _f) SFP_QUIRK(_v, _p, NULL, _f)
+
 static const struct sfp_quirk sfp_quirks[] = {
-	{
-		// Alcatel Lucent G-010S-P can operate at 2500base-X, but
-		// incorrectly report 2500MBd NRZ in their EEPROM
-		.vendor = "ALCATELLUCENT",
-		.part = "G010SP",
-		.modes = sfp_quirk_2500basex,
-	}, {
-		// Alcatel Lucent G-010S-A can operate at 2500base-X, but
-		// report 3.2GBd NRZ in their EEPROM
-		.vendor = "ALCATELLUCENT",
-		.part = "3FE46541AA",
-		.modes = sfp_quirk_2500basex,
-		.fixup = sfp_fixup_long_startup,
-	}, {
-		.vendor = "HALNy",
-		.part = "HL-GSFP",
-		.fixup = sfp_fixup_halny_gsfp,
-	}, {
-		// Huawei MA5671A can operate at 2500base-X, but report 1.2GBd
-		// NRZ in their EEPROM
-		.vendor = "HUAWEI",
-		.part = "MA5671A",
-		.modes = sfp_quirk_2500basex,
-		.fixup = sfp_fixup_ignore_tx_fault,
-	}, {
-		// Lantech 8330-262D-E can operate at 2500base-X, but
-		// incorrectly report 2500MBd NRZ in their EEPROM
-		.vendor = "Lantech",
-		.part = "8330-262D-E",
-		.modes = sfp_quirk_2500basex,
-	}, {
-		.vendor = "UBNT",
-		.part = "UF-INSTANT",
-		.modes = sfp_quirk_ubnt_uf_instant,
-	}
+	// Alcatel Lucent G-010S-P can operate at 2500base-X, but incorrectly
+	// report 2500MBd NRZ in their EEPROM
+	SFP_QUIRK_M("ALCATELLUCENT", "G010SP", sfp_quirk_2500basex),
+
+	// Alcatel Lucent G-010S-A can operate at 2500base-X, but report 3.2GBd
+	// NRZ in their EEPROM
+	SFP_QUIRK("ALCATELLUCENT", "3FE46541AA", sfp_quirk_2500basex,
+		  sfp_fixup_long_startup),
+
+	SFP_QUIRK_F("HALNy", "HL-GSFP", sfp_fixup_halny_gsfp),
+
+	// Huawei MA5671A can operate at 2500base-X, but report 1.2GBd NRZ in
+	// their EEPROM
+	SFP_QUIRK("HUAWEI", "MA5671A", sfp_quirk_2500basex,
+		  sfp_fixup_ignore_tx_fault),
+
+	// Lantech 8330-262D-E can operate at 2500base-X, but incorrectly report
+	// 2500MBd NRZ in their EEPROM
+	SFP_QUIRK_M("Lantech", "8330-262D-E", sfp_quirk_2500basex),
+
+	SFP_QUIRK_M("UBNT", "UF-INSTANT", sfp_quirk_ubnt_uf_instant),
 };
 
 static size_t sfp_strlen(const char *str, size_t maxlen)
-- 
cgit v1.2.3


From e85b1347ace677c3822c12d9332dfaaffe594da6 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:08 +0200
Subject: net: sfp: create/destroy I2C mdiobus before PHY probe/after PHY
 release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of configuring the I2C mdiobus when SFP driver is probed,
create/destroy the mdiobus before the PHY is probed for/after it is
released.

This way we can tell the mdio-i2c code which protocol to use for each
SFP transceiver.

Move the code that determines MDIO I2C protocol from
sfp_sm_probe_for_phy() to sfp_sm_mod_probe(), where most of the SFP ID
parsing is done. Don't allocate I2C bus if no PHY is expected.

Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp.c         | 64 ++++++++++++++++++++++++++++++++++---------
 include/linux/mdio/mdio-i2c.h |  6 ++++
 2 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 3201e2726e61..ccd7710685f2 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -218,6 +218,7 @@ struct sfp {
 	struct i2c_adapter *i2c;
 	struct mii_bus *i2c_mii;
 	struct sfp_bus *sfp_bus;
+	enum mdio_i2c_proto mdio_protocol;
 	struct phy_device *mod_phy;
 	const struct sff_data *type;
 	size_t i2c_block_size;
@@ -530,9 +531,6 @@ static int sfp_i2c_write(struct sfp *sfp, bool a2, u8 dev_addr, void *buf,
 
 static int sfp_i2c_configure(struct sfp *sfp, struct i2c_adapter *i2c)
 {
-	struct mii_bus *i2c_mii;
-	int ret;
-
 	if (!i2c_check_functionality(i2c, I2C_FUNC_I2C))
 		return -EINVAL;
 
@@ -540,7 +538,15 @@ static int sfp_i2c_configure(struct sfp *sfp, struct i2c_adapter *i2c)
 	sfp->read = sfp_i2c_read;
 	sfp->write = sfp_i2c_write;
 
-	i2c_mii = mdio_i2c_alloc(sfp->dev, i2c);
+	return 0;
+}
+
+static int sfp_i2c_mdiobus_create(struct sfp *sfp)
+{
+	struct mii_bus *i2c_mii;
+	int ret;
+
+	i2c_mii = mdio_i2c_alloc(sfp->dev, sfp->i2c);
 	if (IS_ERR(i2c_mii))
 		return PTR_ERR(i2c_mii);
 
@@ -558,6 +564,12 @@ static int sfp_i2c_configure(struct sfp *sfp, struct i2c_adapter *i2c)
 	return 0;
 }
 
+static void sfp_i2c_mdiobus_destroy(struct sfp *sfp)
+{
+	mdiobus_unregister(sfp->i2c_mii);
+	sfp->i2c_mii = NULL;
+}
+
 /* Interface */
 static int sfp_read(struct sfp *sfp, bool a2, u8 addr, void *buf, size_t len)
 {
@@ -1674,6 +1686,14 @@ static void sfp_sm_fault(struct sfp *sfp, unsigned int next_state, bool warn)
 	}
 }
 
+static int sfp_sm_add_mdio_bus(struct sfp *sfp)
+{
+	if (sfp->mdio_protocol != MDIO_I2C_NONE)
+		return sfp_i2c_mdiobus_create(sfp);
+
+	return 0;
+}
+
 /* Probe a SFP for a PHY device if the module supports copper - the PHY
  * normally sits at I2C bus address 0x56, and may either be a clause 22
  * or clause 45 PHY.
@@ -1689,19 +1709,19 @@ static int sfp_sm_probe_for_phy(struct sfp *sfp)
 {
 	int err = 0;
 
-	switch (sfp->id.base.extended_cc) {
-	case SFF8024_ECC_10GBASE_T_SFI:
-	case SFF8024_ECC_10GBASE_T_SR:
-	case SFF8024_ECC_5GBASE_T:
-	case SFF8024_ECC_2_5GBASE_T:
-		err = sfp_sm_probe_phy(sfp, true);
+	switch (sfp->mdio_protocol) {
+	case MDIO_I2C_NONE:
 		break;
 
-	default:
-		if (sfp->id.base.e1000_base_t)
-			err = sfp_sm_probe_phy(sfp, false);
+	case MDIO_I2C_MARVELL_C22:
+		err = sfp_sm_probe_phy(sfp, false);
+		break;
+
+	case MDIO_I2C_C45:
+		err = sfp_sm_probe_phy(sfp, true);
 		break;
 	}
+
 	return err;
 }
 
@@ -2028,6 +2048,16 @@ static int sfp_sm_mod_probe(struct sfp *sfp, bool report)
 
 	sfp->tx_fault_ignore = false;
 
+	if (sfp->id.base.extended_cc == SFF8024_ECC_10GBASE_T_SFI ||
+	    sfp->id.base.extended_cc == SFF8024_ECC_10GBASE_T_SR ||
+	    sfp->id.base.extended_cc == SFF8024_ECC_5GBASE_T ||
+	    sfp->id.base.extended_cc == SFF8024_ECC_2_5GBASE_T)
+		sfp->mdio_protocol = MDIO_I2C_C45;
+	else if (sfp->id.base.e1000_base_t)
+		sfp->mdio_protocol = MDIO_I2C_MARVELL_C22;
+	else
+		sfp->mdio_protocol = MDIO_I2C_NONE;
+
 	sfp->quirk = sfp_lookup_quirk(&id);
 	if (sfp->quirk && sfp->quirk->fixup)
 		sfp->quirk->fixup(sfp);
@@ -2204,6 +2234,8 @@ static void sfp_sm_main(struct sfp *sfp, unsigned int event)
 			sfp_module_stop(sfp->sfp_bus);
 		if (sfp->mod_phy)
 			sfp_sm_phy_detach(sfp);
+		if (sfp->i2c_mii)
+			sfp_i2c_mdiobus_destroy(sfp);
 		sfp_module_tx_disable(sfp);
 		sfp_soft_stop_poll(sfp);
 		sfp_sm_next(sfp, SFP_S_DOWN, 0);
@@ -2266,6 +2298,12 @@ static void sfp_sm_main(struct sfp *sfp, unsigned int event)
 				     sfp->sm_fault_retries == N_FAULT_INIT);
 		} else if (event == SFP_E_TIMEOUT || event == SFP_E_TX_CLEAR) {
 	init_done:
+			/* Create mdiobus and start trying for PHY */
+			ret = sfp_sm_add_mdio_bus(sfp);
+			if (ret < 0) {
+				sfp_sm_next(sfp, SFP_S_FAIL, 0);
+				break;
+			}
 			sfp->sm_phy_retries = R_PHY_RETRY;
 			goto phy_probe;
 		}
diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h
index b1d27f7cd23f..3bde1a555a49 100644
--- a/include/linux/mdio/mdio-i2c.h
+++ b/include/linux/mdio/mdio-i2c.h
@@ -11,6 +11,12 @@ struct device;
 struct i2c_adapter;
 struct mii_bus;
 
+enum mdio_i2c_proto {
+	MDIO_I2C_NONE,
+	MDIO_I2C_MARVELL_C22,
+	MDIO_I2C_C45,
+};
+
 struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c);
 
 #endif
-- 
cgit v1.2.3


From 09bbedac72d5a9267088c15d1a71c8c3a8fb47e7 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:09 +0200
Subject: net: phy: mdio-i2c: support I2C MDIO protocol for RollBall SFP
 modules
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some multigig SFPs from RollBall and Hilink do not expose functional
MDIO access to the internal PHY of the SFP via I2C address 0x56
(although there seems to be read-only clause 22 access on this address).

Instead these SFPs PHY can be accessed via I2C via the SFP Enhanced
Digital Diagnostic Interface - I2C address 0x51. The SFP_PAGE has to be
selected to 3 and the password must be filled with 0xff bytes for this
PHY communication to work.

This extends the mdio-i2c driver to support this protocol by adding a
special parameter to mdio_i2c_alloc function via which this RollBall
protocol can be selected.

Signed-off-by: Marek Behún <kabel@kernel.org>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/mdio/mdio-i2c.c   | 310 +++++++++++++++++++++++++++++++++++++++++-
 drivers/net/phy/sfp.c         |   6 +-
 include/linux/mdio/mdio-i2c.h |   4 +-
 3 files changed, 313 insertions(+), 7 deletions(-)

diff --git a/drivers/net/mdio/mdio-i2c.c b/drivers/net/mdio/mdio-i2c.c
index 09200a70b315..bf8bf5e20faf 100644
--- a/drivers/net/mdio/mdio-i2c.c
+++ b/drivers/net/mdio/mdio-i2c.c
@@ -3,6 +3,7 @@
  * MDIO I2C bridge
  *
  * Copyright (C) 2015-2016 Russell King
+ * Copyright (C) 2021 Marek Behun
  *
  * Network PHYs can appear on I2C buses when they are part of SFP module.
  * This driver exposes these PHYs to the networking PHY code, allowing
@@ -12,6 +13,7 @@
 #include <linux/i2c.h>
 #include <linux/mdio/mdio-i2c.h>
 #include <linux/phy.h>
+#include <linux/sfp.h>
 
 /*
  * I2C bus addresses 0x50 and 0x51 are normally an EEPROM, which is
@@ -28,7 +30,7 @@ static unsigned int i2c_mii_phy_addr(int phy_id)
 	return phy_id + 0x40;
 }
 
-static int i2c_mii_read(struct mii_bus *bus, int phy_id, int reg)
+static int i2c_mii_read_default(struct mii_bus *bus, int phy_id, int reg)
 {
 	struct i2c_adapter *i2c = bus->priv;
 	struct i2c_msg msgs[2];
@@ -62,7 +64,8 @@ static int i2c_mii_read(struct mii_bus *bus, int phy_id, int reg)
 	return data[0] << 8 | data[1];
 }
 
-static int i2c_mii_write(struct mii_bus *bus, int phy_id, int reg, u16 val)
+static int i2c_mii_write_default(struct mii_bus *bus, int phy_id, int reg,
+				 u16 val)
 {
 	struct i2c_adapter *i2c = bus->priv;
 	struct i2c_msg msg;
@@ -91,9 +94,288 @@ static int i2c_mii_write(struct mii_bus *bus, int phy_id, int reg, u16 val)
 	return ret < 0 ? ret : 0;
 }
 
-struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c)
+/* RollBall SFPs do not access internal PHY via I2C address 0x56, but
+ * instead via address 0x51, when SFP page is set to 0x03 and password to
+ * 0xffffffff.
+ *
+ * address  size  contents  description
+ * -------  ----  --------  -----------
+ * 0x80     1     CMD       0x01/0x02/0x04 for write/read/done
+ * 0x81     1     DEV       Clause 45 device
+ * 0x82     2     REG       Clause 45 register
+ * 0x84     2     VAL       Register value
+ */
+#define ROLLBALL_PHY_I2C_ADDR		0x51
+
+#define ROLLBALL_PASSWORD		(SFP_VSL + 3)
+
+#define ROLLBALL_CMD_ADDR		0x80
+#define ROLLBALL_DATA_ADDR		0x81
+
+#define ROLLBALL_CMD_WRITE		0x01
+#define ROLLBALL_CMD_READ		0x02
+#define ROLLBALL_CMD_DONE		0x04
+
+#define SFP_PAGE_ROLLBALL_MDIO		3
+
+static int __i2c_transfer_err(struct i2c_adapter *i2c, struct i2c_msg *msgs,
+			      int num)
+{
+	int ret;
+
+	ret = __i2c_transfer(i2c, msgs, num);
+	if (ret < 0)
+		return ret;
+	else if (ret != num)
+		return -EIO;
+	else
+		return 0;
+}
+
+static int __i2c_rollball_get_page(struct i2c_adapter *i2c, int bus_addr,
+				   u8 *page)
+{
+	struct i2c_msg msgs[2];
+	u8 addr = SFP_PAGE;
+
+	msgs[0].addr = bus_addr;
+	msgs[0].flags = 0;
+	msgs[0].len = 1;
+	msgs[0].buf = &addr;
+
+	msgs[1].addr = bus_addr;
+	msgs[1].flags = I2C_M_RD;
+	msgs[1].len = 1;
+	msgs[1].buf = page;
+
+	return __i2c_transfer_err(i2c, msgs, 2);
+}
+
+static int __i2c_rollball_set_page(struct i2c_adapter *i2c, int bus_addr,
+				   u8 page)
+{
+	struct i2c_msg msg;
+	u8 buf[2];
+
+	buf[0] = SFP_PAGE;
+	buf[1] = page;
+
+	msg.addr = bus_addr;
+	msg.flags = 0;
+	msg.len = 2;
+	msg.buf = buf;
+
+	return __i2c_transfer_err(i2c, &msg, 1);
+}
+
+/* In order to not interfere with other SFP code (which possibly may manipulate
+ * SFP_PAGE), for every transfer we do this:
+ *   1. lock the bus
+ *   2. save content of SFP_PAGE
+ *   3. set SFP_PAGE to 3
+ *   4. do the transfer
+ *   5. restore original SFP_PAGE
+ *   6. unlock the bus
+ * Note that one might think that steps 2 to 5 could be theoretically done all
+ * in one call to i2c_transfer (by constructing msgs array in such a way), but
+ * unfortunately tests show that this does not work :-( Changed SFP_PAGE does
+ * not take into account until i2c_transfer() is done.
+ */
+static int i2c_transfer_rollball(struct i2c_adapter *i2c,
+				 struct i2c_msg *msgs, int num)
+{
+	int ret, main_err = 0;
+	u8 saved_page;
+
+	i2c_lock_bus(i2c, I2C_LOCK_SEGMENT);
+
+	/* save original page */
+	ret = __i2c_rollball_get_page(i2c, msgs->addr, &saved_page);
+	if (ret)
+		goto unlock;
+
+	/* change to RollBall MDIO page */
+	ret = __i2c_rollball_set_page(i2c, msgs->addr, SFP_PAGE_ROLLBALL_MDIO);
+	if (ret)
+		goto unlock;
+
+	/* do the transfer; we try to restore original page if this fails */
+	ret = __i2c_transfer_err(i2c, msgs, num);
+	if (ret)
+		main_err = ret;
+
+	/* restore original page */
+	ret = __i2c_rollball_set_page(i2c, msgs->addr, saved_page);
+
+unlock:
+	i2c_unlock_bus(i2c, I2C_LOCK_SEGMENT);
+
+	return main_err ? : ret;
+}
+
+static int i2c_rollball_mii_poll(struct mii_bus *bus, int bus_addr, u8 *buf,
+				 size_t len)
+{
+	struct i2c_adapter *i2c = bus->priv;
+	struct i2c_msg msgs[2];
+	u8 cmd_addr, tmp, *res;
+	int i, ret;
+
+	cmd_addr = ROLLBALL_CMD_ADDR;
+
+	res = buf ? buf : &tmp;
+	len = buf ? len : 1;
+
+	msgs[0].addr = bus_addr;
+	msgs[0].flags = 0;
+	msgs[0].len = 1;
+	msgs[0].buf = &cmd_addr;
+
+	msgs[1].addr = bus_addr;
+	msgs[1].flags = I2C_M_RD;
+	msgs[1].len = len;
+	msgs[1].buf = res;
+
+	/* By experiment it takes up to 70 ms to access a register for these
+	 * SFPs. Sleep 20ms between iterations and try 10 times.
+	 */
+	i = 10;
+	do {
+		msleep(20);
+
+		ret = i2c_transfer_rollball(i2c, msgs, ARRAY_SIZE(msgs));
+		if (ret)
+			return ret;
+
+		if (*res == ROLLBALL_CMD_DONE)
+			return 0;
+	} while (i-- > 0);
+
+	dev_dbg(&bus->dev, "poll timed out\n");
+
+	return -ETIMEDOUT;
+}
+
+static int i2c_rollball_mii_cmd(struct mii_bus *bus, int bus_addr, u8 cmd,
+				u8 *data, size_t len)
+{
+	struct i2c_adapter *i2c = bus->priv;
+	struct i2c_msg msgs[2];
+	u8 cmdbuf[2];
+
+	cmdbuf[0] = ROLLBALL_CMD_ADDR;
+	cmdbuf[1] = cmd;
+
+	msgs[0].addr = bus_addr;
+	msgs[0].flags = 0;
+	msgs[0].len = len;
+	msgs[0].buf = data;
+
+	msgs[1].addr = bus_addr;
+	msgs[1].flags = 0;
+	msgs[1].len = sizeof(cmdbuf);
+	msgs[1].buf = cmdbuf;
+
+	return i2c_transfer_rollball(i2c, msgs, ARRAY_SIZE(msgs));
+}
+
+static int i2c_mii_read_rollball(struct mii_bus *bus, int phy_id, int reg)
+{
+	u8 buf[4], res[6];
+	int bus_addr, ret;
+	u16 val;
+
+	if (!(reg & MII_ADDR_C45))
+		return -EOPNOTSUPP;
+
+	bus_addr = i2c_mii_phy_addr(phy_id);
+	if (bus_addr != ROLLBALL_PHY_I2C_ADDR)
+		return 0xffff;
+
+	buf[0] = ROLLBALL_DATA_ADDR;
+	buf[1] = (reg >> 16) & 0x1f;
+	buf[2] = (reg >> 8) & 0xff;
+	buf[3] = reg & 0xff;
+
+	ret = i2c_rollball_mii_cmd(bus, bus_addr, ROLLBALL_CMD_READ, buf,
+				   sizeof(buf));
+	if (ret < 0)
+		return ret;
+
+	ret = i2c_rollball_mii_poll(bus, bus_addr, res, sizeof(res));
+	if (ret == -ETIMEDOUT)
+		return 0xffff;
+	else if (ret < 0)
+		return ret;
+
+	val = res[4] << 8 | res[5];
+
+	return val;
+}
+
+static int i2c_mii_write_rollball(struct mii_bus *bus, int phy_id, int reg,
+				  u16 val)
+{
+	int bus_addr, ret;
+	u8 buf[6];
+
+	if (!(reg & MII_ADDR_C45))
+		return -EOPNOTSUPP;
+
+	bus_addr = i2c_mii_phy_addr(phy_id);
+	if (bus_addr != ROLLBALL_PHY_I2C_ADDR)
+		return 0;
+
+	buf[0] = ROLLBALL_DATA_ADDR;
+	buf[1] = (reg >> 16) & 0x1f;
+	buf[2] = (reg >> 8) & 0xff;
+	buf[3] = reg & 0xff;
+	buf[4] = val >> 8;
+	buf[5] = val & 0xff;
+
+	ret = i2c_rollball_mii_cmd(bus, bus_addr, ROLLBALL_CMD_WRITE, buf,
+				   sizeof(buf));
+	if (ret < 0)
+		return ret;
+
+	ret = i2c_rollball_mii_poll(bus, bus_addr, NULL, 0);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static int i2c_mii_init_rollball(struct i2c_adapter *i2c)
+{
+	struct i2c_msg msg;
+	u8 pw[5];
+	int ret;
+
+	pw[0] = ROLLBALL_PASSWORD;
+	pw[1] = 0xff;
+	pw[2] = 0xff;
+	pw[3] = 0xff;
+	pw[4] = 0xff;
+
+	msg.addr = ROLLBALL_PHY_I2C_ADDR;
+	msg.flags = 0;
+	msg.len = sizeof(pw);
+	msg.buf = pw;
+
+	ret = i2c_transfer(i2c, &msg, 1);
+	if (ret < 0)
+		return ret;
+	else if (ret != 1)
+		return -EIO;
+	else
+		return 0;
+}
+
+struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c,
+			       enum mdio_i2c_proto protocol)
 {
 	struct mii_bus *mii;
+	int ret;
 
 	if (!i2c_check_functionality(i2c, I2C_FUNC_I2C))
 		return ERR_PTR(-EINVAL);
@@ -104,10 +386,28 @@ struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c)
 
 	snprintf(mii->id, MII_BUS_ID_SIZE, "i2c:%s", dev_name(parent));
 	mii->parent = parent;
-	mii->read = i2c_mii_read;
-	mii->write = i2c_mii_write;
 	mii->priv = i2c;
 
+	switch (protocol) {
+	case MDIO_I2C_ROLLBALL:
+		ret = i2c_mii_init_rollball(i2c);
+		if (ret < 0) {
+			dev_err(parent,
+				"Cannot initialize RollBall MDIO I2C protocol: %d\n",
+				ret);
+			mdiobus_free(mii);
+			return ERR_PTR(ret);
+		}
+
+		mii->read = i2c_mii_read_rollball;
+		mii->write = i2c_mii_write_rollball;
+		break;
+	default:
+		mii->read = i2c_mii_read_default;
+		mii->write = i2c_mii_write_default;
+		break;
+	}
+
 	return mii;
 }
 EXPORT_SYMBOL_GPL(mdio_i2c_alloc);
diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index ccd7710685f2..20f48464a06a 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -546,7 +546,7 @@ static int sfp_i2c_mdiobus_create(struct sfp *sfp)
 	struct mii_bus *i2c_mii;
 	int ret;
 
-	i2c_mii = mdio_i2c_alloc(sfp->dev, sfp->i2c);
+	i2c_mii = mdio_i2c_alloc(sfp->dev, sfp->i2c, sfp->mdio_protocol);
 	if (IS_ERR(i2c_mii))
 		return PTR_ERR(i2c_mii);
 
@@ -1720,6 +1720,10 @@ static int sfp_sm_probe_for_phy(struct sfp *sfp)
 	case MDIO_I2C_C45:
 		err = sfp_sm_probe_phy(sfp, true);
 		break;
+
+	case MDIO_I2C_ROLLBALL:
+		err = -EOPNOTSUPP;
+		break;
 	}
 
 	return err;
diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h
index 3bde1a555a49..65b550a6fc32 100644
--- a/include/linux/mdio/mdio-i2c.h
+++ b/include/linux/mdio/mdio-i2c.h
@@ -15,8 +15,10 @@ enum mdio_i2c_proto {
 	MDIO_I2C_NONE,
 	MDIO_I2C_MARVELL_C22,
 	MDIO_I2C_C45,
+	MDIO_I2C_ROLLBALL,
 };
 
-struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c);
+struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c,
+			       enum mdio_i2c_proto protocol);
 
 #endif
-- 
cgit v1.2.3


From 324e88cbe3b7be03af67828469cedb52c8610bd1 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Fri, 30 Sep 2022 16:21:10 +0200
Subject: net: sfp: add support for multigig RollBall transceivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for multigig copper SFP modules from RollBall/Hilink.
These modules have a specific way to access clause 45 registers of the
internal PHY.

We also need to wait at least 22 seconds after deasserting TX disable
before accessing the PHY. The code waits for 25 seconds just to be sure.

Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp.c | 49 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c
index 20f48464a06a..40c9a64c5e30 100644
--- a/drivers/net/phy/sfp.c
+++ b/drivers/net/phy/sfp.c
@@ -166,6 +166,7 @@ static const enum gpiod_flags gpio_flags[] = {
  * on board (for a copper SFP) time to initialise.
  */
 #define T_WAIT			msecs_to_jiffies(50)
+#define T_WAIT_ROLLBALL		msecs_to_jiffies(25000)
 #define T_START_UP		msecs_to_jiffies(300)
 #define T_START_UP_BAD_GPON	msecs_to_jiffies(60000)
 
@@ -205,8 +206,11 @@ static const enum gpiod_flags gpio_flags[] = {
 
 /* SFP modules appear to always have their PHY configured for bus address
  * 0x56 (which with mdio-i2c, translates to a PHY address of 22).
+ * RollBall SFPs access phy via SFP Enhanced Digital Diagnostic Interface
+ * via address 0x51 (mdio-i2c will use RollBall protocol on this address).
  */
-#define SFP_PHY_ADDR	22
+#define SFP_PHY_ADDR		22
+#define SFP_PHY_ADDR_ROLLBALL	17
 
 struct sff_data {
 	unsigned int gpios;
@@ -252,6 +256,7 @@ struct sfp {
 	struct sfp_eeprom_id id;
 	unsigned int module_power_mW;
 	unsigned int module_t_start_up;
+	unsigned int module_t_wait;
 	bool tx_fault_ignore;
 
 	const struct sfp_quirk *quirk;
@@ -331,6 +336,22 @@ static void sfp_fixup_halny_gsfp(struct sfp *sfp)
 	sfp->state_hw_mask &= ~(SFP_F_TX_FAULT | SFP_F_LOS);
 }
 
+static void sfp_fixup_rollball(struct sfp *sfp)
+{
+	sfp->mdio_protocol = MDIO_I2C_ROLLBALL;
+	sfp->module_t_wait = T_WAIT_ROLLBALL;
+}
+
+static void sfp_fixup_rollball_cc(struct sfp *sfp)
+{
+	sfp_fixup_rollball(sfp);
+
+	/* Some RollBall SFPs may have wrong (zero) extended compliance code
+	 * burned in EEPROM. For PHY probing we need the correct one.
+	 */
+	sfp->id.base.extended_cc = SFF8024_ECC_10GBASE_T_SFI;
+}
+
 static void sfp_quirk_2500basex(const struct sfp_eeprom_id *id,
 				unsigned long *modes,
 				unsigned long *interfaces)
@@ -378,6 +399,12 @@ static const struct sfp_quirk sfp_quirks[] = {
 	SFP_QUIRK_M("Lantech", "8330-262D-E", sfp_quirk_2500basex),
 
 	SFP_QUIRK_M("UBNT", "UF-INSTANT", sfp_quirk_ubnt_uf_instant),
+
+	SFP_QUIRK_F("OEM", "SFP-10G-T", sfp_fixup_rollball_cc),
+	SFP_QUIRK_F("OEM", "RTSFP-10", sfp_fixup_rollball_cc),
+	SFP_QUIRK_F("OEM", "RTSFP-10G", sfp_fixup_rollball_cc),
+	SFP_QUIRK_F("Turris", "RTSFP-10", sfp_fixup_rollball),
+	SFP_QUIRK_F("Turris", "RTSFP-10G", sfp_fixup_rollball),
 };
 
 static size_t sfp_strlen(const char *str, size_t maxlen)
@@ -1585,12 +1612,12 @@ static void sfp_sm_phy_detach(struct sfp *sfp)
 	sfp->mod_phy = NULL;
 }
 
-static int sfp_sm_probe_phy(struct sfp *sfp, bool is_c45)
+static int sfp_sm_probe_phy(struct sfp *sfp, int addr, bool is_c45)
 {
 	struct phy_device *phy;
 	int err;
 
-	phy = get_phy_device(sfp->i2c_mii, SFP_PHY_ADDR, is_c45);
+	phy = get_phy_device(sfp->i2c_mii, addr, is_c45);
 	if (phy == ERR_PTR(-ENODEV))
 		return PTR_ERR(phy);
 	if (IS_ERR(phy)) {
@@ -1714,15 +1741,15 @@ static int sfp_sm_probe_for_phy(struct sfp *sfp)
 		break;
 
 	case MDIO_I2C_MARVELL_C22:
-		err = sfp_sm_probe_phy(sfp, false);
+		err = sfp_sm_probe_phy(sfp, SFP_PHY_ADDR, false);
 		break;
 
 	case MDIO_I2C_C45:
-		err = sfp_sm_probe_phy(sfp, true);
+		err = sfp_sm_probe_phy(sfp, SFP_PHY_ADDR, true);
 		break;
 
 	case MDIO_I2C_ROLLBALL:
-		err = -EOPNOTSUPP;
+		err = sfp_sm_probe_phy(sfp, SFP_PHY_ADDR_ROLLBALL, true);
 		break;
 	}
 
@@ -2049,6 +2076,7 @@ static int sfp_sm_mod_probe(struct sfp *sfp, bool report)
 		sfp->state_hw_mask |= SFP_F_LOS;
 
 	sfp->module_t_start_up = T_START_UP;
+	sfp->module_t_wait = T_WAIT;
 
 	sfp->tx_fault_ignore = false;
 
@@ -2263,9 +2291,10 @@ static void sfp_sm_main(struct sfp *sfp, unsigned int event)
 
 		/* We need to check the TX_FAULT state, which is not defined
 		 * while TX_DISABLE is asserted. The earliest we want to do
-		 * anything (such as probe for a PHY) is 50ms.
+		 * anything (such as probe for a PHY) is 50ms (or more on
+		 * specific modules).
 		 */
-		sfp_sm_next(sfp, SFP_S_WAIT, T_WAIT);
+		sfp_sm_next(sfp, SFP_S_WAIT, sfp->module_t_wait);
 		break;
 
 	case SFP_S_WAIT:
@@ -2279,8 +2308,8 @@ static void sfp_sm_main(struct sfp *sfp, unsigned int event)
 			 * deasserting.
 			 */
 			timeout = sfp->module_t_start_up;
-			if (timeout > T_WAIT)
-				timeout -= T_WAIT;
+			if (timeout > sfp->module_t_wait)
+				timeout -= sfp->module_t_wait;
 			else
 				timeout = 1;
 
-- 
cgit v1.2.3


From 69800e516e968fa1cf9202b872dc86515eeeebe6 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 30 Sep 2022 08:59:31 -0700
Subject: mptcp: propagate fastclose error

When an mptcp socket is closed due to an incoming FASTCLOSE
option, so specific sk_err is set and later syscall will
fail usually with EPIPE.

Align the current fastclose error handling with TCP reset,
properly setting the socket error according to the current
msk state and propagating such error.

Additionally sendmsg() is currently not handling properly
the sk_err, always returning EPIPE.

Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/protocol.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 760404b15cd0..cad0346c9281 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1707,7 +1707,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			goto out;
 		} else if (ret) {
 			release_sock(ssk);
-			goto out;
+			goto do_error;
 		}
 		release_sock(ssk);
 	}
@@ -1717,9 +1717,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 		ret = sk_stream_wait_connect(sk, &timeo);
 		if (ret)
-			goto out;
+			goto do_error;
 	}
 
+	ret = -EPIPE;
+	if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)))
+		goto do_error;
+
 	pfrag = sk_page_frag(sk);
 
 	while (msg_data_left(msg)) {
@@ -1728,11 +1732,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		bool dfrag_collapsed;
 		size_t psize, offset;
 
-		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
-			ret = -EPIPE;
-			goto out;
-		}
-
 		/* reuse tail pfrag, if possible, or carve a new one from the
 		 * page allocator
 		 */
@@ -1764,7 +1763,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		if (copy_page_from_iter(dfrag->page, offset, psize,
 					&msg->msg_iter) != psize) {
 			ret = -EFAULT;
-			goto out;
+			goto do_error;
 		}
 
 		/* data successfully copied into the write queue */
@@ -1796,7 +1795,7 @@ wait_for_memory:
 		__mptcp_push_pending(sk, msg->msg_flags);
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret)
-			goto out;
+			goto do_error;
 	}
 
 	if (copied)
@@ -1804,7 +1803,14 @@ wait_for_memory:
 
 out:
 	release_sock(sk);
-	return copied ? : ret;
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+
+	copied = sk_stream_error(sk, msg->msg_flags, ret);
+	goto out;
 }
 
 static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
@@ -2441,12 +2447,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 		unlock_sock_fast(tcp_sk, slow);
 	}
 
+	/* Mirror the tcp_reset() error propagation */
+	switch (sk->sk_state) {
+	case TCP_SYN_SENT:
+		sk->sk_err = ECONNREFUSED;
+		break;
+	case TCP_CLOSE_WAIT:
+		sk->sk_err = EPIPE;
+		break;
+	case TCP_CLOSE:
+		return;
+	default:
+		sk->sk_err = ECONNRESET;
+	}
+
 	inet_sk_state_store(sk, TCP_CLOSE);
 	sk->sk_shutdown = SHUTDOWN_MASK;
 	smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
 	set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags);
 
-	mptcp_close_wake_up(sk);
+	/* the calling mptcp_worker will properly destroy the socket */
+	if (sock_flag(sk, SOCK_DEAD))
+		return;
+
+	sk->sk_state_change(sk);
+	sk_error_report(sk);
 }
 
 static void __mptcp_retrans(struct sock *sk)
-- 
cgit v1.2.3


From d21f8348551831efd917528afc545832a372426e Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 30 Sep 2022 08:59:32 -0700
Subject: mptcp: use fastclose on more edge scenarios

Daire reported a user-space application hang-up when the
peer is forcibly closed before the data transfer completion.

The relevant application expects the peer to either
do an application-level clean shutdown or a transport-level
connection reset.

We can accommodate a such user by extending the fastclose
usage: at fd close time, if the msk socket has some unread
data, and at FIN_WAIT timeout.

Note that at MPTCP close time we must ensure that the TCP
subflows will reset: set the linger socket option to a suitable
value.

Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/protocol.c | 63 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index cad0346c9281..acf44075ba40 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2313,8 +2313,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 
 	lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
 
-	if (flags & MPTCP_CF_FASTCLOSE)
+	if (flags & MPTCP_CF_FASTCLOSE) {
+		/* be sure to force the tcp_disconnect() path,
+		 * to generate the egress reset
+		 */
+		ssk->sk_lingertime = 0;
+		sock_set_flag(ssk, SOCK_LINGER);
 		subflow->send_fastclose = 1;
+	}
 
 	need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
 	if (!dispose_it) {
@@ -2577,6 +2583,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
 	mptcp_reset_timeout(msk, 0);
 }
 
+static void mptcp_do_fastclose(struct sock *sk)
+{
+	struct mptcp_subflow_context *subflow, *tmp;
+	struct mptcp_sock *msk = mptcp_sk(sk);
+
+	mptcp_for_each_subflow_safe(msk, subflow, tmp)
+		__mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow),
+				  subflow, MPTCP_CF_FASTCLOSE);
+}
+
 static void mptcp_worker(struct work_struct *work)
 {
 	struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -2605,11 +2621,15 @@ static void mptcp_worker(struct work_struct *work)
 	 * closed, but we need the msk around to reply to incoming DATA_FIN,
 	 * even if it is orphaned and in FIN_WAIT2 state
 	 */
-	if (sock_flag(sk, SOCK_DEAD) &&
-	    (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) {
-		inet_sk_state_store(sk, TCP_CLOSE);
-		__mptcp_destroy_sock(sk);
-		goto unlock;
+	if (sock_flag(sk, SOCK_DEAD)) {
+		if (mptcp_check_close_timeout(sk)) {
+			inet_sk_state_store(sk, TCP_CLOSE);
+			mptcp_do_fastclose(sk);
+		}
+		if (sk->sk_state == TCP_CLOSE) {
+			__mptcp_destroy_sock(sk);
+			goto unlock;
+		}
 	}
 
 	if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@@ -2850,6 +2870,18 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	sock_put(sk);
 }
 
+static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
+{
+	/* Concurrent splices from sk_receive_queue into receive_queue will
+	 * always show at least one non-empty queue when checked in this order.
+	 */
+	if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
+	    skb_queue_empty_lockless(&msk->receive_queue))
+		return 0;
+
+	return EPOLLIN | EPOLLRDNORM;
+}
+
 bool __mptcp_close(struct sock *sk, long timeout)
 {
 	struct mptcp_subflow_context *subflow;
@@ -2863,8 +2895,13 @@ bool __mptcp_close(struct sock *sk, long timeout)
 		goto cleanup;
 	}
 
-	if (mptcp_close_state(sk))
+	if (mptcp_check_readable(msk)) {
+		/* the msk has read data, do the MPTCP equivalent of TCP reset */
+		inet_sk_state_store(sk, TCP_CLOSE);
+		mptcp_do_fastclose(sk);
+	} else if (mptcp_close_state(sk)) {
 		__mptcp_wr_shutdown(sk);
+	}
 
 	sk_stream_wait_close(sk, timeout);
 
@@ -3681,18 +3718,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 	return err;
 }
 
-static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
-{
-	/* Concurrent splices from sk_receive_queue into receive_queue will
-	 * always show at least one non-empty queue when checked in this order.
-	 */
-	if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) &&
-	    skb_queue_empty_lockless(&msk->receive_queue))
-		return 0;
-
-	return EPOLLIN | EPOLLRDNORM;
-}
-
 static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
 {
 	struct sock *sk = (struct sock *)msk;
-- 
cgit v1.2.3


From 6bf41020b72b1c5ab96acc309a135153abbe20c8 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 30 Sep 2022 08:59:33 -0700
Subject: selftests: mptcp: update and extend fastclose test-cases

After the previous patches, the MPTCP protocol can generate
fast-closes on both ends of the connection. Rework the relevant
test-case to carefully trigger the fast-close code-path on a
single end at the time, while ensuring than a predictable amount
of data is spooled on both ends.

Additionally add another test-cases for the passive socket
fast-close.

Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/net/mptcp/mptcp_connect.c | 65 ++++++++++++++--
 tools/testing/selftests/net/mptcp/mptcp_join.sh   | 90 ++++++++++++++++++-----
 2 files changed, 130 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
index 24d4e9cb617e..e54653ea2ed4 100644
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.c
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -72,6 +72,8 @@ static int cfg_wait;
 static uint32_t cfg_mark;
 static char *cfg_input;
 static int cfg_repeat = 1;
+static int cfg_truncate;
+static int cfg_rcv_trunc;
 
 struct cfg_cmsg_types {
 	unsigned int cmsg_enabled:1;
@@ -95,11 +97,15 @@ static struct cfg_sockopt_types cfg_sockopt_types;
 
 static void die_usage(void)
 {
-	fprintf(stderr, "Usage: mptcp_connect [-6] [-c cmsg] [-i file] [-I num] [-j] [-l] "
+	fprintf(stderr, "Usage: mptcp_connect [-6] [-c cmsg] [-f offset] [-i file] [-I num] [-j] [-l] "
 		"[-m mode] [-M mark] [-o option] [-p port] [-P mode] [-j] [-l] [-r num] "
 		"[-s MPTCP|TCP] [-S num] [-r num] [-t num] [-T num] [-u] [-w sec] connect_address\n");
 	fprintf(stderr, "\t-6 use ipv6\n");
 	fprintf(stderr, "\t-c cmsg -- test cmsg type <cmsg>\n");
+	fprintf(stderr, "\t-f offset -- stop the I/O after receiving and sending the specified amount "
+		"of bytes. If there are unread bytes in the receive queue, that will cause a MPTCP "
+		"fastclose at close/shutdown. If offset is negative, expect the peer to close before "
+		"all the local data as been sent, thus toleration errors on write and EPIPE signals\n");
 	fprintf(stderr, "\t-i file -- read the data to send from the given file instead of stdin");
 	fprintf(stderr, "\t-I num -- repeat the transfer 'num' times. In listen mode accepts num "
 		"incoming connections, in client mode, disconnect and reconnect to the server\n");
@@ -382,7 +388,7 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len)
 
 	bw = write(fd, buf, do_w);
 	if (bw < 0)
-		perror("write");
+		return bw;
 
 	/* let the join handshake complete, before going on */
 	if (cfg_join && first) {
@@ -571,7 +577,7 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 		.fd = peerfd,
 		.events = POLLIN | POLLOUT,
 	};
-	unsigned int woff = 0, wlen = 0;
+	unsigned int woff = 0, wlen = 0, total_wlen = 0, total_rlen = 0;
 	char wbuf[8192];
 
 	set_nonblock(peerfd, true);
@@ -597,7 +603,16 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 		}
 
 		if (fds.revents & POLLIN) {
-			len = do_rnd_read(peerfd, rbuf, sizeof(rbuf));
+			ssize_t rb = sizeof(rbuf);
+
+			/* limit the total amount of read data to the trunc value*/
+			if (cfg_truncate > 0) {
+				if (rb + total_rlen > cfg_truncate)
+					rb = cfg_truncate - total_rlen;
+				len = read(peerfd, rbuf, rb);
+			} else {
+				len = do_rnd_read(peerfd, rbuf, sizeof(rbuf));
+			}
 			if (len == 0) {
 				/* no more data to receive:
 				 * peer has closed its write side
@@ -612,10 +627,13 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 
 			/* Else, still have data to transmit */
 			} else if (len < 0) {
+				if (cfg_rcv_trunc)
+					return 0;
 				perror("read");
 				return 3;
 			}
 
+			total_rlen += len;
 			do_write(outfd, rbuf, len);
 		}
 
@@ -628,12 +646,21 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 			if (wlen > 0) {
 				ssize_t bw;
 
+				/* limit the total amount of written data to the trunc value */
+				if (cfg_truncate > 0 && wlen + total_wlen > cfg_truncate)
+					wlen = cfg_truncate - total_wlen;
+
 				bw = do_rnd_write(peerfd, wbuf + woff, wlen);
-				if (bw < 0)
+				if (bw < 0) {
+					if (cfg_rcv_trunc)
+						return 0;
+					perror("write");
 					return 111;
+				}
 
 				woff += bw;
 				wlen -= bw;
+				total_wlen += bw;
 			} else if (wlen == 0) {
 				/* We have no more data to send. */
 				fds.events &= ~POLLOUT;
@@ -652,10 +679,16 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd, bool *in_closed_after
 		}
 
 		if (fds.revents & (POLLERR | POLLNVAL)) {
+			if (cfg_rcv_trunc)
+				return 0;
 			fprintf(stderr, "Unexpected revents: "
 				"POLLERR/POLLNVAL(%x)\n", fds.revents);
 			return 5;
 		}
+
+		if (cfg_truncate > 0 && total_wlen >= cfg_truncate &&
+		    total_rlen >= cfg_truncate)
+			break;
 	}
 
 	/* leave some time for late join/announce */
@@ -1160,11 +1193,13 @@ again:
 	}
 
 	/* close the client socket open only if we are not going to reconnect */
-	ret = copyfd_io(fd_in, fd, 1, cfg_repeat == 1);
+	ret = copyfd_io(fd_in, fd, 1, 0);
 	if (ret)
 		return ret;
 
-	if (--cfg_repeat > 0) {
+	if (cfg_truncate > 0) {
+		xdisconnect(fd, peer->ai_addrlen);
+	} else if (--cfg_repeat > 0) {
 		xdisconnect(fd, peer->ai_addrlen);
 
 		/* the socket could be unblocking at this point, we need the
@@ -1176,7 +1211,10 @@ again:
 		if (cfg_input)
 			close(fd_in);
 		goto again;
+	} else {
+		close(fd);
 	}
+
 	return 0;
 }
 
@@ -1262,8 +1300,19 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "6c:hi:I:jlm:M:o:p:P:r:R:s:S:t:T:w:")) != -1) {
+	while ((c = getopt(argc, argv, "6c:f:hi:I:jlm:M:o:p:P:r:R:s:S:t:T:w:")) != -1) {
 		switch (c) {
+		case 'f':
+			cfg_truncate = atoi(optarg);
+
+			/* when receiving a fastclose, ignore PIPE signals and
+			 * all the I/O errors later in the code
+			 */
+			if (cfg_truncate < 0) {
+				cfg_rcv_trunc = true;
+				signal(SIGPIPE, handle_signal);
+			}
+			break;
 		case 'j':
 			cfg_join = true;
 			cfg_mode = CFG_MODE_POLL;
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
index 2957fe414639..f3dd5f2a0272 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -346,10 +346,21 @@ check_transfer()
 	local in=$1
 	local out=$2
 	local what=$3
+	local bytes=$4
 	local i a b
 
 	local line
-	cmp -l "$in" "$out" | while read -r i a b; do
+	if [ -n "$bytes" ]; then
+		# when truncating we must check the size explicitly
+		local out_size=$(wc -c $out | awk '{print $1}')
+		if [ $out_size -ne $bytes ]; then
+			echo "[ FAIL ] $what output file has wrong size ($out_size, $bytes)"
+			fail_test
+			return 1
+		fi
+		bytes="--bytes=${bytes}"
+	fi
+	cmp -l "$in" "$out" ${bytes} | while read -r i a b; do
 		local sum=$((0${a} + 0${b}))
 		if [ $check_invert -eq 0 ] || [ $sum -ne $((0xff)) ]; then
 			echo "[ FAIL ] $what does not match (in, out):"
@@ -707,9 +718,31 @@ do_transfer()
 	fi
 
 	local flags="subflow"
+	local extra_cl_args=""
+	local extra_srv_args=""
+	local trunc_size=""
 	if [[ "${addr_nr_ns2}" = "fastclose_"* ]]; then
+		if [ ${test_link_fail} -le 1 ]; then
+			echo "fastclose tests need test_link_fail argument"
+			fail_test
+			return 1
+		fi
+
 		# disconnect
-		extra_args="$extra_args -I ${addr_nr_ns2:10}"
+		trunc_size=${test_link_fail}
+		local side=${addr_nr_ns2:10}
+
+		if [ ${side} = "client" ]; then
+			extra_cl_args="-f ${test_link_fail}"
+			extra_srv_args="-f -1"
+		elif [ ${side} = "server" ]; then
+			extra_srv_args="-f ${test_link_fail}"
+			extra_cl_args="-f -1"
+		else
+			echo "wrong/unknown fastclose spec ${side}"
+			fail_test
+			return 1
+		fi
 		addr_nr_ns2=0
 	elif [[ "${addr_nr_ns2}" = "userspace_"* ]]; then
 		userspace_pm=1
@@ -737,39 +770,41 @@ do_transfer()
 		local_addr="0.0.0.0"
 	fi
 
+	extra_srv_args="$extra_args $extra_srv_args"
 	if [ "$test_link_fail" -gt 1 ];then
 		timeout ${timeout_test} \
 			ip netns exec ${listener_ns} \
 				./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
-					$extra_args ${local_addr} < "$sinfail" > "$sout" &
+					$extra_srv_args ${local_addr} < "$sinfail" > "$sout" &
 	else
 		timeout ${timeout_test} \
 			ip netns exec ${listener_ns} \
 				./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \
-					$extra_args ${local_addr} < "$sin" > "$sout" &
+					$extra_srv_args ${local_addr} < "$sin" > "$sout" &
 	fi
 	local spid=$!
 
 	wait_local_port_listen "${listener_ns}" "${port}"
 
+	extra_cl_args="$extra_args $extra_cl_args"
 	if [ "$test_link_fail" -eq 0 ];then
 		timeout ${timeout_test} \
 			ip netns exec ${connector_ns} \
 				./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
-					$extra_args $connect_addr < "$cin" > "$cout" &
+					$extra_cl_args $connect_addr < "$cin" > "$cout" &
 	elif [ "$test_link_fail" -eq 1 ] || [ "$test_link_fail" -eq 2 ];then
 		( cat "$cinfail" ; sleep 2; link_failure $listener_ns ; cat "$cinfail" ) | \
 			tee "$cinsent" | \
 			timeout ${timeout_test} \
 				ip netns exec ${connector_ns} \
 					./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
-						$extra_args $connect_addr > "$cout" &
+						$extra_cl_args $connect_addr > "$cout" &
 	else
 		tee "$cinsent" < "$cinfail" | \
 			timeout ${timeout_test} \
 				ip netns exec ${connector_ns} \
 					./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \
-						$extra_args $connect_addr > "$cout" &
+						$extra_cl_args $connect_addr > "$cout" &
 	fi
 	local cpid=$!
 
@@ -971,15 +1006,15 @@ do_transfer()
 	fi
 
 	if [ "$test_link_fail" -gt 1 ];then
-		check_transfer $sinfail $cout "file received by client"
+		check_transfer $sinfail $cout "file received by client" $trunc_size
 	else
-		check_transfer $sin $cout "file received by client"
+		check_transfer $sin $cout "file received by client" $trunc_size
 	fi
 	retc=$?
 	if [ "$test_link_fail" -eq 0 ];then
-		check_transfer $cin $sout "file received by server"
+		check_transfer $cin $sout "file received by server" $trunc_size
 	else
-		check_transfer $cinsent $sout "file received by server"
+		check_transfer $cinsent $sout "file received by server" $trunc_size
 	fi
 	rets=$?
 
@@ -1188,12 +1223,23 @@ chk_fclose_nr()
 {
 	local fclose_tx=$1
 	local fclose_rx=$2
+	local ns_invert=$3
 	local count
 	local dump_stats
+	local ns_tx=$ns2
+	local ns_rx=$ns1
+	local extra_msg="   "
+
+	if [[ $ns_invert = "invert" ]]; then
+		ns_tx=$ns1
+		ns_rx=$ns2
+		extra_msg=${extra_msg}"invert"
+	fi
 
 	printf "%-${nr_blank}s %s" " " "ctx"
-	count=$(ip netns exec $ns2 nstat -as | grep MPTcpExtMPFastcloseTx | awk '{print $2}')
+	count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPFastcloseTx | awk '{print $2}')
 	[ -z "$count" ] && count=0
+	[ "$count" != "$fclose_tx" ] && extra_msg="$extra_msg,tx=$count"
 	if [ "$count" != "$fclose_tx" ]; then
 		echo "[fail] got $count MP_FASTCLOSE[s] TX expected $fclose_tx"
 		fail_test
@@ -1203,17 +1249,20 @@ chk_fclose_nr()
 	fi
 
 	echo -n " - fclzrx"
-	count=$(ip netns exec $ns1 nstat -as | grep MPTcpExtMPFastcloseRx | awk '{print $2}')
+	count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPFastcloseRx | awk '{print $2}')
 	[ -z "$count" ] && count=0
+	[ "$count" != "$fclose_rx" ] && extra_msg="$extra_msg,rx=$count"
 	if [ "$count" != "$fclose_rx" ]; then
 		echo "[fail] got $count MP_FASTCLOSE[s] RX expected $fclose_rx"
 		fail_test
 		dump_stats=1
 	else
-		echo "[ ok ]"
+		echo -n "[ ok ]"
 	fi
 
 	[ "${dump_stats}" = 1 ] && dump_stats
+
+	echo "$extra_msg"
 }
 
 chk_rst_nr()
@@ -1236,7 +1285,7 @@ chk_rst_nr()
 	printf "%-${nr_blank}s %s" " " "rtx"
 	count=$(ip netns exec $ns_tx nstat -as | grep MPTcpExtMPRstTx | awk '{print $2}')
 	[ -z "$count" ] && count=0
-	if [ "$count" != "$rst_tx" ]; then
+	if [ $count -lt $rst_tx ]; then
 		echo "[fail] got $count MP_RST[s] TX expected $rst_tx"
 		fail_test
 		dump_stats=1
@@ -1247,7 +1296,7 @@ chk_rst_nr()
 	echo -n " - rstrx "
 	count=$(ip netns exec $ns_rx nstat -as | grep MPTcpExtMPRstRx | awk '{print $2}')
 	[ -z "$count" ] && count=0
-	if [ "$count" != "$rst_rx" ]; then
+	if [ "$count" -lt "$rst_rx" ]; then
 		echo "[fail] got $count MP_RST[s] RX expected $rst_rx"
 		fail_test
 		dump_stats=1
@@ -2801,11 +2850,18 @@ fullmesh_tests()
 fastclose_tests()
 {
 	if reset "fastclose test"; then
-		run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_2
+		run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_client
 		chk_join_nr 0 0 0
 		chk_fclose_nr 1 1
 		chk_rst_nr 1 1 invert
 	fi
+
+	if reset "fastclose server test"; then
+		run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_server
+		chk_join_nr 0 0 0
+		chk_fclose_nr 1 1 invert
+		chk_rst_nr 1 1
+	fi
 }
 
 pedit_action_pkts()
-- 
cgit v1.2.3


From d89e3ed76b6efd1a4fd213feebedd01337a16b5a Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 30 Sep 2022 08:59:34 -0700
Subject: mptcp: update misleading comments.

The MPTCP data path is quite complex and hard to understend even
without some foggy comments referring to modified code and/or
completely misleading from the beginning.

Update a few of them to more accurately describing the current
status.

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mptcp/protocol.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index acf44075ba40..f599ad44ed24 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -662,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 
 		skb = skb_peek(&ssk->sk_receive_queue);
 		if (!skb) {
-			/* if no data is found, a racing workqueue/recvmsg
-			 * already processed the new data, stop here or we
-			 * can enter an infinite loop
+			/* With racing move_skbs_to_msk() and __mptcp_move_skbs(),
+			 * a different CPU can have already processed the pending
+			 * data, stop here or we can enter an infinite loop
 			 */
 			if (!moved)
 				done = true;
@@ -672,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		}
 
 		if (__mptcp_check_fallback(msk)) {
-			/* if we are running under the workqueue, TCP could have
-			 * collapsed skbs between dummy map creation and now
-			 * be sure to adjust the size
+			/* Under fallback skbs have no MPTCP extension and TCP could
+			 * collapse them between the dummy map creation and the
+			 * current dequeue. Be sure to adjust the map size.
 			 */
 			map_remaining = skb->len;
 			subflow->map_data_len = skb->len;
@@ -3768,7 +3768,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
 		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
 
-	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	/* This barrier is coupled with smp_wmb() in __mptcp_error_report() */
 	smp_rmb();
 	if (sk->sk_err)
 		mask |= EPOLLERR;
-- 
cgit v1.2.3


From 9e6fd874c7bb47b6a4295abc4c81b2f41b97e970 Mon Sep 17 00:00:00 2001
From: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Date: Fri, 30 Sep 2022 12:48:43 +0800
Subject: net: prestera: acl: Add check for kmemdup

As the kemdup could return NULL, it should be better to check the return
value and return error if fails.
Moreover, the return value of prestera_acl_ruleset_keymask_set() should
be checked by cascade.

Fixes: 604ba230902d ("net: prestera: flower template support")
Signed-off-by: Jiasheng Jiang <jiasheng@iscas.ac.cn>
Reviewed-by: Taras Chornyi<tchornyi@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/prestera/prestera_acl.c    | 8 ++++++--
 drivers/net/ethernet/marvell/prestera/prestera_acl.h    | 4 ++--
 drivers/net/ethernet/marvell/prestera/prestera_flower.c | 6 +++++-
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_acl.c b/drivers/net/ethernet/marvell/prestera/prestera_acl.c
index 3d4b85f2d541..f6b2933859d0 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_acl.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_acl.c
@@ -178,10 +178,14 @@ err_rhashtable_init:
 	return ERR_PTR(err);
 }
 
-void prestera_acl_ruleset_keymask_set(struct prestera_acl_ruleset *ruleset,
-				      void *keymask)
+int prestera_acl_ruleset_keymask_set(struct prestera_acl_ruleset *ruleset,
+				     void *keymask)
 {
 	ruleset->keymask = kmemdup(keymask, ACL_KEYMASK_SIZE, GFP_KERNEL);
+	if (!ruleset->keymask)
+		return -ENOMEM;
+
+	return 0;
 }
 
 int prestera_acl_ruleset_offload(struct prestera_acl_ruleset *ruleset)
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_acl.h b/drivers/net/ethernet/marvell/prestera/prestera_acl.h
index 03fc5b9dc925..131bfbc87cd7 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_acl.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_acl.h
@@ -185,8 +185,8 @@ struct prestera_acl_ruleset *
 prestera_acl_ruleset_lookup(struct prestera_acl *acl,
 			    struct prestera_flow_block *block,
 			    u32 chain_index);
-void prestera_acl_ruleset_keymask_set(struct prestera_acl_ruleset *ruleset,
-				      void *keymask);
+int prestera_acl_ruleset_keymask_set(struct prestera_acl_ruleset *ruleset,
+				     void *keymask);
 bool prestera_acl_ruleset_is_offload(struct prestera_acl_ruleset *ruleset);
 int prestera_acl_ruleset_offload(struct prestera_acl_ruleset *ruleset);
 void prestera_acl_ruleset_put(struct prestera_acl_ruleset *ruleset);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_flower.c b/drivers/net/ethernet/marvell/prestera/prestera_flower.c
index 19d3b55c578e..cf551a8379ac 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_flower.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_flower.c
@@ -452,7 +452,9 @@ int prestera_flower_tmplt_create(struct prestera_flow_block *block,
 	}
 
 	/* preserve keymask/template to this ruleset */
-	prestera_acl_ruleset_keymask_set(ruleset, rule.re_key.match.mask);
+	err = prestera_acl_ruleset_keymask_set(ruleset, rule.re_key.match.mask);
+	if (err)
+		goto err_ruleset_keymask_set;
 
 	/* skip error, as it is not possible to reject template operation,
 	 * so, keep the reference to the ruleset for rules to be added
@@ -468,6 +470,8 @@ int prestera_flower_tmplt_create(struct prestera_flow_block *block,
 	list_add_rcu(&template->list, &block->template_list);
 	return 0;
 
+err_ruleset_keymask_set:
+	prestera_acl_ruleset_put(ruleset);
 err_ruleset_get:
 	kfree(template);
 err_malloc:
-- 
cgit v1.2.3


From 5eddb24901ee49eee23c0bfce6af2e83fd5679bd Mon Sep 17 00:00:00 2001
From: Coco Li <lixiaoyan@google.com>
Date: Fri, 30 Sep 2022 15:09:05 -0700
Subject: gro: add support of (hw)gro packets to gro stack

Current GRO stack only supports incoming packets containing
one frame/MSS.

This patch changes GRO to accept packets that are already GRO.

HW-GRO (aka RSC for some vendors) is very often limited in presence
of interleaved packets. Linux SW GRO stack can complete the job
and provide larger GRO packets, thus reducing rate of ACK packets
and cpu overhead.

This also means BIG TCP can still be used, even if HW-GRO/RSC was
able to cook ~64 KB GRO packets.

v2: fix logic in tcp_gro_receive()

    Only support TCP for the moment (Paolo)

Co-Developed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Coco Li <lixiaoyan@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/gro.c         | 18 ++++++++++++++----
 net/ipv4/tcp_offload.c | 17 +++++++++++++++--
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/net/core/gro.c b/net/core/gro.c
index b4190eb08467..bc9451743307 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	unsigned int gro_max_size;
 	unsigned int new_truesize;
 	struct sk_buff *lp;
+	int segs;
 
 	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
 	gro_max_size = READ_ONCE(p->dev->gro_max_size);
@@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 			return -E2BIG;
 	}
 
+	segs = NAPI_GRO_CB(skb)->count;
 	lp = NAPI_GRO_CB(p)->last;
 	pinfo = skb_shinfo(lp);
 
@@ -265,7 +267,7 @@ merge:
 	lp = p;
 
 done:
-	NAPI_GRO_CB(p)->count++;
+	NAPI_GRO_CB(p)->count += segs;
 	p->data_len += len;
 	p->truesize += delta_truesize;
 	p->len += len;
@@ -496,8 +498,15 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 		BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
 					 sizeof(u32))); /* Avoid slow unaligned acc */
 		*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
-		NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
+		NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
 		NAPI_GRO_CB(skb)->is_atomic = 1;
+		NAPI_GRO_CB(skb)->count = 1;
+		if (unlikely(skb_is_gso(skb))) {
+			NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
+			/* Only support TCP at the moment. */
+			if (!skb_is_gso_tcp(skb))
+				NAPI_GRO_CB(skb)->flush = 1;
+		}
 
 		/* Setup for GRO checksum validation */
 		switch (skb->ip_summed) {
@@ -545,10 +554,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	else
 		gro_list->count++;
 
-	NAPI_GRO_CB(skb)->count = 1;
 	NAPI_GRO_CB(skb)->age = jiffies;
 	NAPI_GRO_CB(skb)->last = skb;
-	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+	if (!skb_is_gso(skb))
+		skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 	list_add(&skb->list, &gro_list->list);
 	ret = GRO_HELD;
 
@@ -660,6 +669,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 
 	skb->encapsulation = 0;
 	skb_shinfo(skb)->gso_type = 0;
+	skb_shinfo(skb)->gso_size = 0;
 	if (unlikely(skb->slow_gro)) {
 		skb_orphan(skb);
 		skb_ext_reset(skb);
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index a844a0d38482..45dda7889387 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -255,7 +255,15 @@ found:
 
 	mss = skb_shinfo(p)->gso_size;
 
-	flush |= (len - 1) >= mss;
+	/* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
+	 * If it is a single frame, do not aggregate it if its length
+	 * is bigger than our mss.
+	 */
+	if (unlikely(skb_is_gso(skb)))
+		flush |= (mss != skb_shinfo(skb)->gso_size);
+	else
+		flush |= (len - 1) >= mss;
+
 	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
 #ifdef CONFIG_TLS_DEVICE
 	flush |= p->decrypted ^ skb->decrypted;
@@ -269,7 +277,12 @@ found:
 	tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
 
 out_check_final:
-	flush = len < mss;
+	/* Force a flush if last segment is smaller than mss. */
+	if (unlikely(skb_is_gso(skb)))
+		flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
+	else
+		flush = len < mss;
+
 	flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
 					TCP_FLAG_RST | TCP_FLAG_SYN |
 					TCP_FLAG_FIN));
-- 
cgit v1.2.3


From cb4b12071a4b68df323c339f60805834246b3e9e Mon Sep 17 00:00:00 2001
From: Raju Lakkaraju <Raju.Lakkaraju@microchip.com>
Date: Fri, 30 Sep 2022 14:57:40 +0530
Subject: eth: lan743x: reject extts for non-pci11x1x devices

Remove PTP_PF_EXTTS support for non-PCI11x1x devices since they do not support
the PTP-IO Input event triggered timestamping mechanisms added

Fixes: 60942c397af6 ("net: lan743x: Add support for PTP-IO Event Input External Timestamp (extts)")
Signed-off-by: Raju Lakkaraju <Raju.Lakkaraju@microchip.com>
Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/lan743x_ptp.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/microchip/lan743x_ptp.c b/drivers/net/ethernet/microchip/lan743x_ptp.c
index 6a11e2ceb013..da3ea905adbb 100644
--- a/drivers/net/ethernet/microchip/lan743x_ptp.c
+++ b/drivers/net/ethernet/microchip/lan743x_ptp.c
@@ -1049,6 +1049,10 @@ static int lan743x_ptpci_verify_pin_config(struct ptp_clock_info *ptp,
 					   enum ptp_pin_function func,
 					   unsigned int chan)
 {
+	struct lan743x_ptp *lan_ptp =
+		container_of(ptp, struct lan743x_ptp, ptp_clock_info);
+	struct lan743x_adapter *adapter =
+		container_of(lan_ptp, struct lan743x_adapter, ptp);
 	int result = 0;
 
 	/* Confirm the requested function is supported. Parameter
@@ -1057,7 +1061,10 @@ static int lan743x_ptpci_verify_pin_config(struct ptp_clock_info *ptp,
 	switch (func) {
 	case PTP_PF_NONE:
 	case PTP_PF_PEROUT:
+		break;
 	case PTP_PF_EXTTS:
+		if (!adapter->is_pci11x1x)
+			result = -1;
 		break;
 	case PTP_PF_PHYSYNC:
 	default:
-- 
cgit v1.2.3


From b43f9acbb8942b05252be83ac25a81cec70cc192 Mon Sep 17 00:00:00 2001
From: Jianglei Nie <niejianglei2021@163.com>
Date: Fri, 30 Sep 2022 14:28:43 +0800
Subject: bnx2x: fix potential memory leak in bnx2x_tpa_stop()

bnx2x_tpa_stop() allocates a memory chunk from new_data with
bnx2x_frag_alloc(). The new_data should be freed when gets some error.
But when "pad + len > fp->rx_buf_size" is true, bnx2x_tpa_stop() returns
without releasing the new_data, which will lead to a memory leak.

We should free the new_data with bnx2x_frag_free() when "pad + len >
fp->rx_buf_size" is true.

Fixes: 07b0f00964def8af9321cfd6c4a7e84f6362f728 ("bnx2x: fix possible panic under memory stress")
Signed-off-by: Jianglei Nie <niejianglei2021@163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 712b5595bc39..24bfc65e28e1 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -789,6 +789,7 @@ static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			BNX2X_ERR("skb_put is about to fail...  pad %d  len %d  rx_buf_size %d\n",
 				  pad, len, fp->rx_buf_size);
 			bnx2x_panic();
+			bnx2x_frag_free(fp, new_data);
 			return;
 		}
 #endif
-- 
cgit v1.2.3


From 9bc61c04ff6cce6a3756b86e6b34914f7b39d734 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 30 Sep 2022 16:37:30 +0200
Subject: net: Remove DECnet leftovers from flow.h.

DECnet was removed by commit 1202cdd66531 ("Remove DECnet support from
kernel"). Let's also revome its flow structure.

Compile-tested only (allmodconfig).

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 987bd511d652..2f0da4f0318b 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -54,11 +54,6 @@ union flowi_uli {
 		__u8	code;
 	} icmpt;
 
-	struct {
-		__le16	dport;
-		__le16	sport;
-	} dnports;
-
 	__be32		gre_key;
 
 	struct {
@@ -156,27 +151,11 @@ struct flowi6 {
 	__u32			mp_hash;
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
-struct flowidn {
-	struct flowi_common	__fl_common;
-#define flowidn_oif		__fl_common.flowic_oif
-#define flowidn_iif		__fl_common.flowic_iif
-#define flowidn_mark		__fl_common.flowic_mark
-#define flowidn_scope		__fl_common.flowic_scope
-#define flowidn_proto		__fl_common.flowic_proto
-#define flowidn_flags		__fl_common.flowic_flags
-	__le16			daddr;
-	__le16			saddr;
-	union flowi_uli		uli;
-#define fld_sport		uli.ports.sport
-#define fld_dport		uli.ports.dport
-} __attribute__((__aligned__(BITS_PER_LONG/8)));
-
 struct flowi {
 	union {
 		struct flowi_common	__fl_common;
 		struct flowi4		ip4;
 		struct flowi6		ip6;
-		struct flowidn		dn;
 	} u;
 #define flowi_oif	u.__fl_common.flowic_oif
 #define flowi_iif	u.__fl_common.flowic_iif
@@ -211,11 +190,6 @@ static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6)
 	return &(fl6->__fl_common);
 }
 
-static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn)
-{
-	return container_of(fldn, struct flowi, u.dn);
-}
-
 __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);
 
 #endif
-- 
cgit v1.2.3


From 95698ff6177b5f1f13f251da60e7348413046ae4 Mon Sep 17 00:00:00 2001
From: Shenwei Wang <shenwei.wang@nxp.com>
Date: Fri, 30 Sep 2022 15:44:27 -0500
Subject: net: fec: using page pool to manage RX buffers

This patch optimizes the RX buffer management by using the page
pool. The purpose for this change is to prepare for the following
XDP support. The current driver uses one frame per page for easy
management.

Added __maybe_unused attribute to the following functions to avoid
the compiling warning. Those functions will be removed by a separate
patch once this page pool solution is accepted.
 - fec_enet_new_rxbdp
 - fec_enet_copybreak

The following are the comparing result between page pool implementation
and the original implementation (non page pool).

 --- small packet (64 bytes) testing are almost the same
 --- no matter what the implementation is
 --- on both i.MX8 and i.MX6SX platforms.

shenwei@5810:~/pktgen$ iperf -c 10.81.16.245 -w 2m -i 1 -l 64
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 39728 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec  37.0 MBytes   311 Mbits/sec
[  1] 1.0000-2.0000 sec  36.6 MBytes   307 Mbits/sec
[  1] 2.0000-3.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 3.0000-4.0000 sec  37.1 MBytes   312 Mbits/sec
[  1] 4.0000-5.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 5.0000-6.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 6.0000-7.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 7.0000-8.0000 sec  37.2 MBytes   312 Mbits/sec
[  1] 0.0000-8.0943 sec   299 MBytes   310 Mbits/sec

 --- Page Pool implementation on i.MX8 ----

shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 43204 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec   111 MBytes   933 Mbits/sec
[  1] 1.0000-2.0000 sec   111 MBytes   934 Mbits/sec
[  1] 2.0000-3.0000 sec   112 MBytes   935 Mbits/sec
[  1] 3.0000-4.0000 sec   111 MBytes   933 Mbits/sec
[  1] 4.0000-5.0000 sec   111 MBytes   934 Mbits/sec
[  1] 5.0000-6.0000 sec   111 MBytes   933 Mbits/sec
[  1] 6.0000-7.0000 sec   111 MBytes   931 Mbits/sec
[  1] 7.0000-8.0000 sec   112 MBytes   935 Mbits/sec
[  1] 8.0000-9.0000 sec   111 MBytes   933 Mbits/sec
[  1] 9.0000-10.0000 sec   112 MBytes   935 Mbits/sec
[  1] 0.0000-10.0077 sec  1.09 GBytes   933 Mbits/sec

 --- Non Page Pool implementation on i.MX8 ----

shenwei@5810:~$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 49154 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec   104 MBytes   868 Mbits/sec
[  1] 1.0000-2.0000 sec   105 MBytes   878 Mbits/sec
[  1] 2.0000-3.0000 sec   105 MBytes   881 Mbits/sec
[  1] 3.0000-4.0000 sec   105 MBytes   879 Mbits/sec
[  1] 4.0000-5.0000 sec   105 MBytes   878 Mbits/sec
[  1] 5.0000-6.0000 sec   105 MBytes   878 Mbits/sec
[  1] 6.0000-7.0000 sec   104 MBytes   875 Mbits/sec
[  1] 7.0000-8.0000 sec   104 MBytes   875 Mbits/sec
[  1] 8.0000-9.0000 sec   104 MBytes   873 Mbits/sec
[  1] 9.0000-10.0000 sec   104 MBytes   875 Mbits/sec
[  1] 0.0000-10.0073 sec  1.02 GBytes   875 Mbits/sec

 --- Page Pool implementation on i.MX6SX ----

shenwei@5810:~/pktgen$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 57288 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec  78.8 MBytes   661 Mbits/sec
[  1] 1.0000-2.0000 sec  82.5 MBytes   692 Mbits/sec
[  1] 2.0000-3.0000 sec  82.4 MBytes   691 Mbits/sec
[  1] 3.0000-4.0000 sec  82.4 MBytes   691 Mbits/sec
[  1] 4.0000-5.0000 sec  82.5 MBytes   692 Mbits/sec
[  1] 5.0000-6.0000 sec  82.4 MBytes   691 Mbits/sec
[  1] 6.0000-7.0000 sec  82.5 MBytes   692 Mbits/sec
[  1] 7.0000-8.0000 sec  82.4 MBytes   691 Mbits/sec
[  1] 8.0000-9.0000 sec  82.4 MBytes   691 Mbits/sec
[  1] 9.0000-9.5506 sec  45.0 MBytes   686 Mbits/sec
[  1] 0.0000-9.5506 sec   783 MBytes   688 Mbits/sec

 --- Non Page Pool implementation on i.MX6SX ----

shenwei@5810:~/pktgen$ iperf -c 10.81.16.245 -w 2m -i 1
------------------------------------------------------------
Client connecting to 10.81.16.245, TCP port 5001
TCP window size:  416 KByte (WARNING: requested 1.91 MByte)
------------------------------------------------------------
[  1] local 10.81.17.20 port 36486 connected with 10.81.16.245 port 5001
[ ID] Interval       Transfer     Bandwidth
[  1] 0.0000-1.0000 sec  70.5 MBytes   591 Mbits/sec
[  1] 1.0000-2.0000 sec  64.5 MBytes   541 Mbits/sec
[  1] 2.0000-3.0000 sec  73.6 MBytes   618 Mbits/sec
[  1] 3.0000-4.0000 sec  73.6 MBytes   618 Mbits/sec
[  1] 4.0000-5.0000 sec  72.9 MBytes   611 Mbits/sec
[  1] 5.0000-6.0000 sec  73.4 MBytes   616 Mbits/sec
[  1] 6.0000-7.0000 sec  73.5 MBytes   617 Mbits/sec
[  1] 7.0000-8.0000 sec  73.4 MBytes   616 Mbits/sec
[  1] 8.0000-9.0000 sec  73.4 MBytes   616 Mbits/sec
[  1] 9.0000-10.0000 sec  73.9 MBytes   620 Mbits/sec
[  1] 0.0000-10.0174 sec   723 MBytes   605 Mbits/sec

Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/Kconfig    |   1 +
 drivers/net/ethernet/freescale/fec.h      |  21 +++-
 drivers/net/ethernet/freescale/fec_main.c | 155 +++++++++++++++++++-----------
 3 files changed, 119 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig
index b7bf45cec29d..ce866ae3df03 100644
--- a/drivers/net/ethernet/freescale/Kconfig
+++ b/drivers/net/ethernet/freescale/Kconfig
@@ -28,6 +28,7 @@ config FEC
 	depends on PTP_1588_CLOCK_OPTIONAL
 	select CRC32
 	select PHYLIB
+	select PAGE_POOL
 	imply NET_SELFTESTS
 	help
 	  Say Y here if you want to use the built-in 10/100 Fast ethernet
diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h
index b0100fe3c9e4..33f84a30e167 100644
--- a/drivers/net/ethernet/freescale/fec.h
+++ b/drivers/net/ethernet/freescale/fec.h
@@ -17,6 +17,7 @@
 #include <linux/clocksource.h>
 #include <linux/net_tstamp.h>
 #include <linux/pm_qos.h>
+#include <linux/bpf.h>
 #include <linux/ptp_clock_kernel.h>
 #include <linux/timecounter.h>
 #include <dt-bindings/firmware/imx/rsrc.h>
@@ -346,8 +347,11 @@ struct bufdesc_ex {
  * the skbuffer directly.
  */
 
+#define FEC_ENET_XDP_HEADROOM	(XDP_PACKET_HEADROOM)
+
 #define FEC_ENET_RX_PAGES	256
-#define FEC_ENET_RX_FRSIZE	2048
+#define FEC_ENET_RX_FRSIZE	(PAGE_SIZE - FEC_ENET_XDP_HEADROOM \
+		- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 #define FEC_ENET_RX_FRPPG	(PAGE_SIZE / FEC_ENET_RX_FRSIZE)
 #define RX_RING_SIZE		(FEC_ENET_RX_FRPPG * FEC_ENET_RX_PAGES)
 #define FEC_ENET_TX_FRSIZE	2048
@@ -517,6 +521,12 @@ struct bufdesc_prop {
 	unsigned char dsize_log2;
 };
 
+struct fec_enet_priv_txrx_info {
+	int	offset;
+	struct	page *page;
+	struct  sk_buff *skb;
+};
+
 struct fec_enet_priv_tx_q {
 	struct bufdesc_prop bd;
 	unsigned char *tx_bounce[TX_RING_SIZE];
@@ -532,7 +542,14 @@ struct fec_enet_priv_tx_q {
 
 struct fec_enet_priv_rx_q {
 	struct bufdesc_prop bd;
-	struct  sk_buff *rx_skbuff[RX_RING_SIZE];
+	struct  fec_enet_priv_txrx_info rx_skb_info[RX_RING_SIZE];
+
+	/* page_pool */
+	struct page_pool *page_pool;
+	struct xdp_rxq_info xdp_rxq;
+
+	/* rx queue number, in the range 0-7 */
+	u8 id;
 };
 
 struct fec_stop_mode_gpr {
diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index ff1950e96c6c..98d5cd313fdd 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -66,6 +66,8 @@
 #include <linux/mfd/syscon.h>
 #include <linux/regmap.h>
 #include <soc/imx/cpuidle.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
 
 #include <asm/cacheflush.h>
 
@@ -422,6 +424,48 @@ fec_enet_clear_csum(struct sk_buff *skb, struct net_device *ndev)
 	return 0;
 }
 
+static int
+fec_enet_create_page_pool(struct fec_enet_private *fep,
+			  struct fec_enet_priv_rx_q *rxq, int size)
+{
+	struct page_pool_params pp_params = {
+		.order = 0,
+		.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
+		.pool_size = size,
+		.nid = dev_to_node(&fep->pdev->dev),
+		.dev = &fep->pdev->dev,
+		.dma_dir = DMA_FROM_DEVICE,
+		.offset = FEC_ENET_XDP_HEADROOM,
+		.max_len = FEC_ENET_RX_FRSIZE,
+	};
+	int err;
+
+	rxq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rxq->page_pool)) {
+		err = PTR_ERR(rxq->page_pool);
+		rxq->page_pool = NULL;
+		return err;
+	}
+
+	err = xdp_rxq_info_reg(&rxq->xdp_rxq, fep->netdev, rxq->id, 0);
+	if (err < 0)
+		goto err_free_pp;
+
+	err = xdp_rxq_info_reg_mem_model(&rxq->xdp_rxq, MEM_TYPE_PAGE_POOL,
+					 rxq->page_pool);
+	if (err)
+		goto err_unregister_rxq;
+
+	return 0;
+
+err_unregister_rxq:
+	xdp_rxq_info_unreg(&rxq->xdp_rxq);
+err_free_pp:
+	page_pool_destroy(rxq->page_pool);
+	rxq->page_pool = NULL;
+	return err;
+}
+
 static struct bufdesc *
 fec_enet_txq_submit_frag_skb(struct fec_enet_priv_tx_q *txq,
 			     struct sk_buff *skb,
@@ -1450,7 +1494,7 @@ static void fec_enet_tx(struct net_device *ndev)
 		fec_enet_tx_queue(ndev, i);
 }
 
-static int
+static int __maybe_unused
 fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
 {
 	struct  fec_enet_private *fep = netdev_priv(ndev);
@@ -1470,8 +1514,9 @@ fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff
 	return 0;
 }
 
-static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
-			       struct bufdesc *bdp, u32 length, bool swap)
+static bool __maybe_unused
+fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
+		   struct bufdesc *bdp, u32 length, bool swap)
 {
 	struct  fec_enet_private *fep = netdev_priv(ndev);
 	struct sk_buff *new_skb;
@@ -1496,6 +1541,21 @@ static bool fec_enet_copybreak(struct net_device *ndev, struct sk_buff **skb,
 	return true;
 }
 
+static void fec_enet_update_cbd(struct fec_enet_priv_rx_q *rxq,
+				struct bufdesc *bdp, int index)
+{
+	struct page *new_page;
+	dma_addr_t phys_addr;
+
+	new_page = page_pool_dev_alloc_pages(rxq->page_pool);
+	WARN_ON(!new_page);
+	rxq->rx_skb_info[index].page = new_page;
+
+	rxq->rx_skb_info[index].offset = FEC_ENET_XDP_HEADROOM;
+	phys_addr = page_pool_get_dma_addr(new_page) + FEC_ENET_XDP_HEADROOM;
+	bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
+}
+
 /* During a receive, the bd_rx.cur points to the current incoming buffer.
  * When we update through the ring, if the next incoming buffer has
  * not been given to the system, we just set the empty indicator,
@@ -1508,7 +1568,6 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	struct fec_enet_priv_rx_q *rxq;
 	struct bufdesc *bdp;
 	unsigned short status;
-	struct  sk_buff *skb_new = NULL;
 	struct  sk_buff *skb;
 	ushort	pkt_len;
 	__u8 *data;
@@ -1517,8 +1576,8 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 	bool	vlan_packet_rcvd = false;
 	u16	vlan_tag;
 	int	index = 0;
-	bool	is_copybreak;
 	bool	need_swap = fep->quirks & FEC_QUIRK_SWAP_FRAME;
+	struct page *page;
 
 #ifdef CONFIG_M532x
 	flush_cache_all();
@@ -1570,31 +1629,25 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		ndev->stats.rx_bytes += pkt_len;
 
 		index = fec_enet_get_bd_index(bdp, &rxq->bd);
-		skb = rxq->rx_skbuff[index];
+		page = rxq->rx_skb_info[index].page;
+		dma_sync_single_for_cpu(&fep->pdev->dev,
+					fec32_to_cpu(bdp->cbd_bufaddr),
+					pkt_len,
+					DMA_FROM_DEVICE);
+		prefetch(page_address(page));
+		fec_enet_update_cbd(rxq, bdp, index);
 
 		/* The packet length includes FCS, but we don't want to
 		 * include that when passing upstream as it messes up
 		 * bridging applications.
 		 */
-		is_copybreak = fec_enet_copybreak(ndev, &skb, bdp, pkt_len - 4,
-						  need_swap);
-		if (!is_copybreak) {
-			skb_new = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE);
-			if (unlikely(!skb_new)) {
-				ndev->stats.rx_dropped++;
-				goto rx_processing_done;
-			}
-			dma_unmap_single(&fep->pdev->dev,
-					 fec32_to_cpu(bdp->cbd_bufaddr),
-					 FEC_ENET_RX_FRSIZE - fep->rx_align,
-					 DMA_FROM_DEVICE);
-		}
-
-		prefetch(skb->data - NET_IP_ALIGN);
+		skb = build_skb(page_address(page), PAGE_SIZE);
+		skb_reserve(skb, FEC_ENET_XDP_HEADROOM);
 		skb_put(skb, pkt_len - 4);
+		skb_mark_for_recycle(skb);
 		data = skb->data;
 
-		if (!is_copybreak && need_swap)
+		if (need_swap)
 			swap_buffer(data, pkt_len);
 
 #if !defined(CONFIG_M5272)
@@ -1649,16 +1702,6 @@ fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
 		skb_record_rx_queue(skb, queue_id);
 		napi_gro_receive(&fep->napi, skb);
 
-		if (is_copybreak) {
-			dma_sync_single_for_device(&fep->pdev->dev,
-						   fec32_to_cpu(bdp->cbd_bufaddr),
-						   FEC_ENET_RX_FRSIZE - fep->rx_align,
-						   DMA_FROM_DEVICE);
-		} else {
-			rxq->rx_skbuff[index] = skb_new;
-			fec_enet_new_rxbdp(ndev, bdp, skb_new);
-		}
-
 rx_processing_done:
 		/* Clear the status flags for this buffer */
 		status &= ~BD_ENET_RX_STATS;
@@ -3002,26 +3045,19 @@ static void fec_enet_free_buffers(struct net_device *ndev)
 	struct fec_enet_private *fep = netdev_priv(ndev);
 	unsigned int i;
 	struct sk_buff *skb;
-	struct bufdesc	*bdp;
 	struct fec_enet_priv_tx_q *txq;
 	struct fec_enet_priv_rx_q *rxq;
 	unsigned int q;
 
 	for (q = 0; q < fep->num_rx_queues; q++) {
 		rxq = fep->rx_queue[q];
-		bdp = rxq->bd.base;
-		for (i = 0; i < rxq->bd.ring_size; i++) {
-			skb = rxq->rx_skbuff[i];
-			rxq->rx_skbuff[i] = NULL;
-			if (skb) {
-				dma_unmap_single(&fep->pdev->dev,
-						 fec32_to_cpu(bdp->cbd_bufaddr),
-						 FEC_ENET_RX_FRSIZE - fep->rx_align,
-						 DMA_FROM_DEVICE);
-				dev_kfree_skb(skb);
-			}
-			bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
-		}
+		for (i = 0; i < rxq->bd.ring_size; i++)
+			page_pool_release_page(rxq->page_pool, rxq->rx_skb_info[i].page);
+
+		if (xdp_rxq_info_is_reg(&rxq->xdp_rxq))
+			xdp_rxq_info_unreg(&rxq->xdp_rxq);
+		page_pool_destroy(rxq->page_pool);
+		rxq->page_pool = NULL;
 	}
 
 	for (q = 0; q < fep->num_tx_queues; q++) {
@@ -3111,24 +3147,31 @@ static int
 fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
 {
 	struct fec_enet_private *fep = netdev_priv(ndev);
-	unsigned int i;
-	struct sk_buff *skb;
-	struct bufdesc	*bdp;
 	struct fec_enet_priv_rx_q *rxq;
+	dma_addr_t phys_addr;
+	struct bufdesc	*bdp;
+	struct page *page;
+	int i, err;
 
 	rxq = fep->rx_queue[queue];
 	bdp = rxq->bd.base;
+
+	err = fec_enet_create_page_pool(fep, rxq, rxq->bd.ring_size);
+	if (err < 0) {
+		netdev_err(ndev, "%s failed queue %d (%d)\n", __func__, queue, err);
+		return err;
+	}
+
 	for (i = 0; i < rxq->bd.ring_size; i++) {
-		skb = __netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE, GFP_KERNEL);
-		if (!skb)
+		page = page_pool_dev_alloc_pages(rxq->page_pool);
+		if (!page)
 			goto err_alloc;
 
-		if (fec_enet_new_rxbdp(ndev, bdp, skb)) {
-			dev_kfree_skb(skb);
-			goto err_alloc;
-		}
+		phys_addr = page_pool_get_dma_addr(page) + FEC_ENET_XDP_HEADROOM;
+		bdp->cbd_bufaddr = cpu_to_fec32(phys_addr);
 
-		rxq->rx_skbuff[i] = skb;
+		rxq->rx_skb_info[i].page = page;
+		rxq->rx_skb_info[i].offset = FEC_ENET_XDP_HEADROOM;
 		bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
 
 		if (fep->bufdesc_ex) {
-- 
cgit v1.2.3


From 5390334b59a3a011f1e9eff2d3023b0e407dd61b Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Fri, 30 Sep 2022 10:35:39 +0200
Subject: net: lan966x: Add port police support using tc-matchall

Add support for port police. It is possible to police only on the
ingress side. To be able to add police support also it was required to
add tc-matchall classifier offload support.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/lan966x/Makefile    |   3 +-
 .../net/ethernet/microchip/lan966x/lan966x_main.h  |  24 +++
 .../ethernet/microchip/lan966x/lan966x_police.c    | 235 +++++++++++++++++++++
 .../net/ethernet/microchip/lan966x/lan966x_regs.h  |  72 +++++++
 .../net/ethernet/microchip/lan966x/lan966x_tc.c    |  50 +++++
 .../microchip/lan966x/lan966x_tc_matchall.c        |  85 ++++++++
 6 files changed, 468 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/microchip/lan966x/lan966x_police.c
 create mode 100644 drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c

diff --git a/drivers/net/ethernet/microchip/lan966x/Makefile b/drivers/net/ethernet/microchip/lan966x/Makefile
index 7360c1c7b53c..d00f7b67b6ec 100644
--- a/drivers/net/ethernet/microchip/lan966x/Makefile
+++ b/drivers/net/ethernet/microchip/lan966x/Makefile
@@ -10,4 +10,5 @@ lan966x-switch-objs  := lan966x_main.o lan966x_phylink.o lan966x_port.o \
 			lan966x_vlan.o lan966x_fdb.o lan966x_mdb.o \
 			lan966x_ptp.o lan966x_fdma.o lan966x_lag.o \
 			lan966x_tc.o lan966x_mqprio.o lan966x_taprio.o \
-			lan966x_tbf.o lan966x_cbs.o lan966x_ets.o
+			lan966x_tbf.o lan966x_cbs.o lan966x_ets.o \
+			lan966x_tc_matchall.o lan966x_police.o
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
index 78665eb9a3f1..10ffc6a76d39 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
@@ -276,6 +276,12 @@ struct lan966x_port_config {
 	bool autoneg;
 };
 
+struct lan966x_port_tc {
+	bool ingress_shared_block;
+	unsigned long police_id;
+	struct flow_stats police_stat;
+};
+
 struct lan966x_port {
 	struct net_device *dev;
 	struct lan966x *lan966x;
@@ -302,6 +308,8 @@ struct lan966x_port {
 	struct net_device *bond;
 	bool lag_tx_active;
 	enum netdev_lag_hash hash_type;
+
+	struct lan966x_port_tc tc;
 };
 
 extern const struct phylink_mac_ops lan966x_phylink_mac_ops;
@@ -481,6 +489,22 @@ int lan966x_ets_add(struct lan966x_port *port,
 int lan966x_ets_del(struct lan966x_port *port,
 		    struct tc_ets_qopt_offload *qopt);
 
+int lan966x_tc_matchall(struct lan966x_port *port,
+			struct tc_cls_matchall_offload *f,
+			bool ingress);
+
+int lan966x_police_port_add(struct lan966x_port *port,
+			    struct flow_action *action,
+			    struct flow_action_entry *act,
+			    unsigned long police_id,
+			    bool ingress,
+			    struct netlink_ext_ack *extack);
+int lan966x_police_port_del(struct lan966x_port *port,
+			    unsigned long police_id,
+			    struct netlink_ext_ack *extack);
+void lan966x_police_port_stats(struct lan966x_port *port,
+			       struct flow_stats *stats);
+
 static inline void __iomem *lan_addr(void __iomem *base[],
 				     int id, int tinst, int tcnt,
 				     int gbase, int ginst,
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_police.c b/drivers/net/ethernet/microchip/lan966x/lan966x_police.c
new file mode 100644
index 000000000000..a9aec900d608
--- /dev/null
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_police.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include "lan966x_main.h"
+
+/* 0-8 : 9 port policers */
+#define POL_IDX_PORT	0
+
+/* Policer order: Serial (QoS -> Port -> VCAP) */
+#define POL_ORDER	0x1d3
+
+struct lan966x_tc_policer {
+	/* kilobit per second */
+	u32 rate;
+	/* bytes */
+	u32 burst;
+};
+
+static int lan966x_police_add(struct lan966x_port *port,
+			      struct lan966x_tc_policer *pol,
+			      u16 pol_idx)
+{
+	struct lan966x *lan966x = port->lan966x;
+
+	/* Rate unit is 33 1/3 kpps */
+	pol->rate = DIV_ROUND_UP(pol->rate * 3, 100);
+	/* Avoid zero burst size */
+	pol->burst = pol->burst ?: 1;
+	/* Unit is 4kB */
+	pol->burst = DIV_ROUND_UP(pol->burst, 4096);
+
+	if (pol->rate > GENMASK(15, 0) ||
+	    pol->burst > GENMASK(6, 0))
+		return -EINVAL;
+
+	lan_wr(ANA_POL_MODE_DROP_ON_YELLOW_ENA_SET(0) |
+	       ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA_SET(0) |
+	       ANA_POL_MODE_IPG_SIZE_SET(20) |
+	       ANA_POL_MODE_FRM_MODE_SET(1) |
+	       ANA_POL_MODE_OVERSHOOT_ENA_SET(1),
+	       lan966x, ANA_POL_MODE(pol_idx));
+
+	lan_wr(ANA_POL_PIR_STATE_PIR_LVL_SET(0),
+	       lan966x, ANA_POL_PIR_STATE(pol_idx));
+
+	lan_wr(ANA_POL_PIR_CFG_PIR_RATE_SET(pol->rate) |
+	       ANA_POL_PIR_CFG_PIR_BURST_SET(pol->burst),
+	       lan966x, ANA_POL_PIR_CFG(pol_idx));
+
+	return 0;
+}
+
+static int lan966x_police_del(struct lan966x_port *port,
+			      u16 pol_idx)
+{
+	struct lan966x *lan966x = port->lan966x;
+
+	lan_wr(ANA_POL_MODE_DROP_ON_YELLOW_ENA_SET(0) |
+	       ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA_SET(0) |
+	       ANA_POL_MODE_IPG_SIZE_SET(20) |
+	       ANA_POL_MODE_FRM_MODE_SET(2) |
+	       ANA_POL_MODE_OVERSHOOT_ENA_SET(1),
+	       lan966x, ANA_POL_MODE(pol_idx));
+
+	lan_wr(ANA_POL_PIR_STATE_PIR_LVL_SET(0),
+	       lan966x, ANA_POL_PIR_STATE(pol_idx));
+
+	lan_wr(ANA_POL_PIR_CFG_PIR_RATE_SET(GENMASK(14, 0)) |
+	       ANA_POL_PIR_CFG_PIR_BURST_SET(0),
+	       lan966x, ANA_POL_PIR_CFG(pol_idx));
+
+	return 0;
+}
+
+static int lan966x_police_validate(struct lan966x_port *port,
+				   const struct flow_action *action,
+				   const struct flow_action_entry *act,
+				   unsigned long police_id,
+				   bool ingress,
+				   struct netlink_ext_ack *extack)
+{
+	if (act->police.exceed.act_id != FLOW_ACTION_DROP) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Offload not supported when exceed action is not drop");
+		return -EOPNOTSUPP;
+	}
+
+	if (act->police.notexceed.act_id != FLOW_ACTION_PIPE &&
+	    act->police.notexceed.act_id != FLOW_ACTION_ACCEPT) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Offload not supported when conform action is not pipe or ok");
+		return -EOPNOTSUPP;
+	}
+
+	if (act->police.notexceed.act_id == FLOW_ACTION_ACCEPT &&
+	    !flow_action_is_last_entry(action, act)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Offload not supported when conform action is ok, but action is not last");
+		return -EOPNOTSUPP;
+	}
+
+	if (act->police.peakrate_bytes_ps ||
+	    act->police.avrate || act->police.overhead) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Offload not supported when peakrate/avrate/overhead is configured");
+		return -EOPNOTSUPP;
+	}
+
+	if (act->police.rate_pkt_ps) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "QoS offload not support packets per second");
+		return -EOPNOTSUPP;
+	}
+
+	if (!ingress) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Policer is not supported on egress");
+		return -EOPNOTSUPP;
+	}
+
+	if (port->tc.ingress_shared_block) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Policer is not supported on shared ingress blocks");
+		return -EOPNOTSUPP;
+	}
+
+	if (port->tc.police_id && port->tc.police_id != police_id) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only one policer per port is supported");
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
+int lan966x_police_port_add(struct lan966x_port *port,
+			    struct flow_action *action,
+			    struct flow_action_entry *act,
+			    unsigned long police_id,
+			    bool ingress,
+			    struct netlink_ext_ack *extack)
+{
+	struct lan966x *lan966x = port->lan966x;
+	struct rtnl_link_stats64 new_stats;
+	struct lan966x_tc_policer pol;
+	struct flow_stats *old_stats;
+	int err;
+
+	err = lan966x_police_validate(port, action, act, police_id, ingress,
+				      extack);
+	if (err)
+		return err;
+
+	memset(&pol, 0, sizeof(pol));
+
+	pol.rate = div_u64(act->police.rate_bytes_ps, 1000) * 8;
+	pol.burst = act->police.burst;
+
+	err = lan966x_police_add(port, &pol, POL_IDX_PORT + port->chip_port);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Failed to add policer to port");
+		return err;
+	}
+
+	lan_rmw(ANA_POL_CFG_PORT_POL_ENA_SET(1) |
+		ANA_POL_CFG_POL_ORDER_SET(POL_ORDER),
+		ANA_POL_CFG_PORT_POL_ENA |
+		ANA_POL_CFG_POL_ORDER,
+		lan966x, ANA_POL_CFG(port->chip_port));
+
+	port->tc.police_id = police_id;
+
+	/* Setup initial stats */
+	old_stats = &port->tc.police_stat;
+	lan966x_stats_get(port->dev, &new_stats);
+	old_stats->bytes = new_stats.rx_bytes;
+	old_stats->pkts = new_stats.rx_packets;
+	old_stats->drops = new_stats.rx_dropped;
+	old_stats->lastused = jiffies;
+
+	return 0;
+}
+
+int lan966x_police_port_del(struct lan966x_port *port,
+			    unsigned long police_id,
+			    struct netlink_ext_ack *extack)
+{
+	struct lan966x *lan966x = port->lan966x;
+	int err;
+
+	if (port->tc.police_id != police_id) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Invalid policer id");
+		return -EINVAL;
+	}
+
+	err = lan966x_police_del(port, port->tc.police_id);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Failed to add policer to port");
+		return err;
+	}
+
+	lan_rmw(ANA_POL_CFG_PORT_POL_ENA_SET(0) |
+		ANA_POL_CFG_POL_ORDER_SET(POL_ORDER),
+		ANA_POL_CFG_PORT_POL_ENA |
+		ANA_POL_CFG_POL_ORDER,
+		lan966x, ANA_POL_CFG(port->chip_port));
+
+	port->tc.police_id = 0;
+
+	return 0;
+}
+
+void lan966x_police_port_stats(struct lan966x_port *port,
+			       struct flow_stats *stats)
+{
+	struct rtnl_link_stats64 new_stats;
+	struct flow_stats *old_stats;
+
+	old_stats = &port->tc.police_stat;
+	lan966x_stats_get(port->dev, &new_stats);
+
+	flow_stats_update(stats,
+			  new_stats.rx_bytes - old_stats->bytes,
+			  new_stats.rx_packets - old_stats->pkts,
+			  new_stats.rx_dropped - old_stats->drops,
+			  old_stats->lastused,
+			  FLOW_ACTION_HW_STATS_IMMEDIATE);
+
+	old_stats->bytes = new_stats.rx_bytes;
+	old_stats->pkts = new_stats.rx_packets;
+	old_stats->drops = new_stats.rx_dropped;
+	old_stats->lastused = jiffies;
+}
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h b/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
index 4f00f95d66b6..5cb88d81afba 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
@@ -354,6 +354,21 @@ enum lan966x_target {
 #define ANA_PORT_CFG_PORTID_VAL_GET(x)\
 	FIELD_GET(ANA_PORT_CFG_PORTID_VAL, x)
 
+/*      ANA:PORT:POL_CFG */
+#define ANA_POL_CFG(g)            __REG(TARGET_ANA, 0, 1, 28672, g, 9, 128, 116, 0, 1, 4)
+
+#define ANA_POL_CFG_PORT_POL_ENA                 BIT(17)
+#define ANA_POL_CFG_PORT_POL_ENA_SET(x)\
+	FIELD_PREP(ANA_POL_CFG_PORT_POL_ENA, x)
+#define ANA_POL_CFG_PORT_POL_ENA_GET(x)\
+	FIELD_GET(ANA_POL_CFG_PORT_POL_ENA, x)
+
+#define ANA_POL_CFG_POL_ORDER                    GENMASK(8, 0)
+#define ANA_POL_CFG_POL_ORDER_SET(x)\
+	FIELD_PREP(ANA_POL_CFG_POL_ORDER, x)
+#define ANA_POL_CFG_POL_ORDER_GET(x)\
+	FIELD_GET(ANA_POL_CFG_POL_ORDER, x)
+
 /*      ANA:PFC:PFC_CFG */
 #define ANA_PFC_CFG(g)            __REG(TARGET_ANA, 0, 1, 30720, g, 8, 64, 0, 0, 1, 4)
 
@@ -408,6 +423,63 @@ enum lan966x_target {
 #define ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA_GET(x)\
 	FIELD_GET(ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA, x)
 
+/*      ANA:POL:POL_PIR_CFG */
+#define ANA_POL_PIR_CFG(g)        __REG(TARGET_ANA, 0, 1, 16384, g, 345, 32, 0, 0, 1, 4)
+
+#define ANA_POL_PIR_CFG_PIR_RATE                 GENMASK(20, 6)
+#define ANA_POL_PIR_CFG_PIR_RATE_SET(x)\
+	FIELD_PREP(ANA_POL_PIR_CFG_PIR_RATE, x)
+#define ANA_POL_PIR_CFG_PIR_RATE_GET(x)\
+	FIELD_GET(ANA_POL_PIR_CFG_PIR_RATE, x)
+
+#define ANA_POL_PIR_CFG_PIR_BURST                GENMASK(5, 0)
+#define ANA_POL_PIR_CFG_PIR_BURST_SET(x)\
+	FIELD_PREP(ANA_POL_PIR_CFG_PIR_BURST, x)
+#define ANA_POL_PIR_CFG_PIR_BURST_GET(x)\
+	FIELD_GET(ANA_POL_PIR_CFG_PIR_BURST, x)
+
+/*      ANA:POL:POL_MODE_CFG */
+#define ANA_POL_MODE(g)           __REG(TARGET_ANA, 0, 1, 16384, g, 345, 32, 8, 0, 1, 4)
+
+#define ANA_POL_MODE_DROP_ON_YELLOW_ENA          BIT(11)
+#define ANA_POL_MODE_DROP_ON_YELLOW_ENA_SET(x)\
+	FIELD_PREP(ANA_POL_MODE_DROP_ON_YELLOW_ENA, x)
+#define ANA_POL_MODE_DROP_ON_YELLOW_ENA_GET(x)\
+	FIELD_GET(ANA_POL_MODE_DROP_ON_YELLOW_ENA, x)
+
+#define ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA       BIT(10)
+#define ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA_SET(x)\
+	FIELD_PREP(ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA, x)
+#define ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA_GET(x)\
+	FIELD_GET(ANA_POL_MODE_MARK_ALL_FRMS_RED_ENA, x)
+
+#define ANA_POL_MODE_IPG_SIZE                    GENMASK(9, 5)
+#define ANA_POL_MODE_IPG_SIZE_SET(x)\
+	FIELD_PREP(ANA_POL_MODE_IPG_SIZE, x)
+#define ANA_POL_MODE_IPG_SIZE_GET(x)\
+	FIELD_GET(ANA_POL_MODE_IPG_SIZE, x)
+
+#define ANA_POL_MODE_FRM_MODE                    GENMASK(4, 3)
+#define ANA_POL_MODE_FRM_MODE_SET(x)\
+	FIELD_PREP(ANA_POL_MODE_FRM_MODE, x)
+#define ANA_POL_MODE_FRM_MODE_GET(x)\
+	FIELD_GET(ANA_POL_MODE_FRM_MODE, x)
+
+#define ANA_POL_MODE_OVERSHOOT_ENA               BIT(0)
+#define ANA_POL_MODE_OVERSHOOT_ENA_SET(x)\
+	FIELD_PREP(ANA_POL_MODE_OVERSHOOT_ENA, x)
+#define ANA_POL_MODE_OVERSHOOT_ENA_GET(x)\
+	FIELD_GET(ANA_POL_MODE_OVERSHOOT_ENA, x)
+
+/*      ANA:POL:POL_PIR_STATE */
+#define ANA_POL_PIR_STATE(g)      __REG(TARGET_ANA, 0, 1, 16384, g, 345, 32, 12, 0, 1, 4)
+
+#define ANA_POL_PIR_STATE_PIR_LVL                GENMASK(21, 0)
+#define ANA_POL_PIR_STATE_PIR_LVL_SET(x)\
+	FIELD_PREP(ANA_POL_PIR_STATE_PIR_LVL, x)
+#define ANA_POL_PIR_STATE_PIR_LVL_GET(x)\
+	FIELD_GET(ANA_POL_PIR_STATE_PIR_LVL, x)
+
 /*      CHIP_TOP:CUPHY_CFG:CUPHY_PORT_CFG */
 #define CHIP_TOP_CUPHY_PORT_CFG(r) __REG(TARGET_CHIP_TOP, 0, 1, 16, 0, 1, 20, 8, r, 2, 4)
 
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_tc.c b/drivers/net/ethernet/microchip/lan966x/lan966x_tc.c
index 336eb7ee0d60..651d5493ae55 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_tc.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_tc.c
@@ -4,6 +4,8 @@
 
 #include "lan966x_main.h"
 
+static LIST_HEAD(lan966x_tc_block_cb_list);
+
 static int lan966x_tc_setup_qdisc_mqprio(struct lan966x_port *port,
 					 struct tc_mqprio_qopt_offload *mqprio)
 {
@@ -59,6 +61,52 @@ static int lan966x_tc_setup_qdisc_ets(struct lan966x_port *port,
 	return -EOPNOTSUPP;
 }
 
+static int lan966x_tc_block_cb(enum tc_setup_type type, void *type_data,
+			       void *cb_priv, bool ingress)
+{
+	struct lan966x_port *port = cb_priv;
+
+	switch (type) {
+	case TC_SETUP_CLSMATCHALL:
+		return lan966x_tc_matchall(port, type_data, ingress);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int lan966x_tc_block_cb_ingress(enum tc_setup_type type,
+				       void *type_data, void *cb_priv)
+{
+	return lan966x_tc_block_cb(type, type_data, cb_priv, true);
+}
+
+static int lan966x_tc_block_cb_egress(enum tc_setup_type type,
+				      void *type_data, void *cb_priv)
+{
+	return lan966x_tc_block_cb(type, type_data, cb_priv, false);
+}
+
+static int lan966x_tc_setup_block(struct lan966x_port *port,
+				  struct flow_block_offload *f)
+{
+	flow_setup_cb_t *cb;
+	bool ingress;
+
+	if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) {
+		cb = lan966x_tc_block_cb_ingress;
+		port->tc.ingress_shared_block = f->block_shared;
+		ingress = true;
+	} else if (f->binder_type == FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS) {
+		cb = lan966x_tc_block_cb_egress;
+		ingress = false;
+	} else {
+		return -EOPNOTSUPP;
+	}
+
+	return flow_block_cb_setup_simple(f, &lan966x_tc_block_cb_list,
+					  cb, port, port, ingress);
+}
+
 int lan966x_tc_setup(struct net_device *dev, enum tc_setup_type type,
 		     void *type_data)
 {
@@ -75,6 +123,8 @@ int lan966x_tc_setup(struct net_device *dev, enum tc_setup_type type,
 		return lan966x_tc_setup_qdisc_cbs(port, type_data);
 	case TC_SETUP_QDISC_ETS:
 		return lan966x_tc_setup_qdisc_ets(port, type_data);
+	case TC_SETUP_BLOCK:
+		return lan966x_tc_setup_block(port, type_data);
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c b/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c
new file mode 100644
index 000000000000..dc065b556ef7
--- /dev/null
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include "lan966x_main.h"
+
+static int lan966x_tc_matchall_add(struct lan966x_port *port,
+				   struct tc_cls_matchall_offload *f,
+				   bool ingress)
+{
+	struct flow_action_entry *act;
+
+	if (!flow_offload_has_one_action(&f->rule->action)) {
+		NL_SET_ERR_MSG_MOD(f->common.extack,
+				   "Only once action per filter is supported");
+		return -EOPNOTSUPP;
+	}
+
+	act = &f->rule->action.entries[0];
+	switch (act->id) {
+	case FLOW_ACTION_POLICE:
+		return lan966x_police_port_add(port, &f->rule->action, act,
+					       f->cookie, ingress,
+					       f->common.extack);
+	default:
+		NL_SET_ERR_MSG_MOD(f->common.extack,
+				   "Unsupported action");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int lan966x_tc_matchall_del(struct lan966x_port *port,
+				   struct tc_cls_matchall_offload *f,
+				   bool ingress)
+{
+	if (f->cookie == port->tc.police_id) {
+		return lan966x_police_port_del(port, f->cookie,
+					       f->common.extack);
+	} else {
+		NL_SET_ERR_MSG_MOD(f->common.extack,
+				   "Unsupported action");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static int lan966x_tc_matchall_stats(struct lan966x_port *port,
+				     struct tc_cls_matchall_offload *f,
+				     bool ingress)
+{
+	if (f->cookie == port->tc.police_id) {
+		lan966x_police_port_stats(port, &f->stats);
+	} else {
+		NL_SET_ERR_MSG_MOD(f->common.extack,
+				   "Unsupported action");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+int lan966x_tc_matchall(struct lan966x_port *port,
+			struct tc_cls_matchall_offload *f,
+			bool ingress)
+{
+	if (!tc_cls_can_offload_and_chain0(port->dev, &f->common)) {
+		NL_SET_ERR_MSG_MOD(f->common.extack,
+				   "Only chain zero is supported");
+		return -EOPNOTSUPP;
+	}
+
+	switch (f->command) {
+	case TC_CLSMATCHALL_REPLACE:
+		return lan966x_tc_matchall_add(port, f, ingress);
+	case TC_CLSMATCHALL_DESTROY:
+		return lan966x_tc_matchall_del(port, f, ingress);
+	case TC_CLSMATCHALL_STATS:
+		return lan966x_tc_matchall_stats(port, f, ingress);
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
-- 
cgit v1.2.3


From b69e95397c3ca8c596da006fa16076b54c54d1d4 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Fri, 30 Sep 2022 10:35:40 +0200
Subject: net: lan966x: Add port mirroring support using tc-matchall

Add support for port mirroring. It is possible to mirror only one port
at a time and it is possible to have both ingress and egress mirroring.
Frames injected by the CPU don't get egress mirrored because they are
bypassing the analyzer module.

Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/microchip/lan966x/Makefile    |   2 +-
 .../net/ethernet/microchip/lan966x/lan966x_main.h  |  20 +++
 .../ethernet/microchip/lan966x/lan966x_mirror.c    | 138 +++++++++++++++++++++
 .../net/ethernet/microchip/lan966x/lan966x_regs.h  |  24 ++++
 .../microchip/lan966x/lan966x_tc_matchall.c        |  10 ++
 5 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/microchip/lan966x/lan966x_mirror.c

diff --git a/drivers/net/ethernet/microchip/lan966x/Makefile b/drivers/net/ethernet/microchip/lan966x/Makefile
index d00f7b67b6ec..962f7c5f9e7d 100644
--- a/drivers/net/ethernet/microchip/lan966x/Makefile
+++ b/drivers/net/ethernet/microchip/lan966x/Makefile
@@ -11,4 +11,4 @@ lan966x-switch-objs  := lan966x_main.o lan966x_phylink.o lan966x_port.o \
 			lan966x_ptp.o lan966x_fdma.o lan966x_lag.o \
 			lan966x_tc.o lan966x_mqprio.o lan966x_taprio.o \
 			lan966x_tbf.o lan966x_cbs.o lan966x_ets.o \
-			lan966x_tc_matchall.o lan966x_police.o
+			lan966x_tc_matchall.o lan966x_police.o lan966x_mirror.o
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
index 10ffc6a76d39..9656071b8289 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.h
@@ -264,6 +264,11 @@ struct lan966x {
 	struct lan966x_rx rx;
 	struct lan966x_tx tx;
 	struct napi_struct napi;
+
+	/* Mirror */
+	struct lan966x_port *mirror_monitor;
+	u32 mirror_mask[2];
+	u32 mirror_count;
 };
 
 struct lan966x_port_config {
@@ -279,7 +284,10 @@ struct lan966x_port_config {
 struct lan966x_port_tc {
 	bool ingress_shared_block;
 	unsigned long police_id;
+	unsigned long ingress_mirror_id;
+	unsigned long egress_mirror_id;
 	struct flow_stats police_stat;
+	struct flow_stats mirror_stat;
 };
 
 struct lan966x_port {
@@ -505,6 +513,18 @@ int lan966x_police_port_del(struct lan966x_port *port,
 void lan966x_police_port_stats(struct lan966x_port *port,
 			       struct flow_stats *stats);
 
+int lan966x_mirror_port_add(struct lan966x_port *port,
+			    struct flow_action_entry *action,
+			    unsigned long mirror_id,
+			    bool ingress,
+			    struct netlink_ext_ack *extack);
+int lan966x_mirror_port_del(struct lan966x_port *port,
+			    bool ingress,
+			    struct netlink_ext_ack *extack);
+void lan966x_mirror_port_stats(struct lan966x_port *port,
+			       struct flow_stats *stats,
+			       bool ingress);
+
 static inline void __iomem *lan_addr(void __iomem *base[],
 				     int id, int tinst, int tcnt,
 				     int gbase, int ginst,
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_mirror.c b/drivers/net/ethernet/microchip/lan966x/lan966x_mirror.c
new file mode 100644
index 000000000000..7e1ba3f40c35
--- /dev/null
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_mirror.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include "lan966x_main.h"
+
+int lan966x_mirror_port_add(struct lan966x_port *port,
+			    struct flow_action_entry *action,
+			    unsigned long mirror_id,
+			    bool ingress,
+			    struct netlink_ext_ack *extack)
+{
+	struct lan966x *lan966x = port->lan966x;
+	struct lan966x_port *monitor_port;
+
+	if (!lan966x_netdevice_check(action->dev)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Destination not an lan966x port");
+		return -EOPNOTSUPP;
+	}
+
+	monitor_port = netdev_priv(action->dev);
+
+	if (lan966x->mirror_mask[ingress] & BIT(port->chip_port)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Mirror already exists");
+		return -EEXIST;
+	}
+
+	if (lan966x->mirror_monitor &&
+	    lan966x->mirror_monitor != monitor_port) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Cannot change mirror port while in use");
+		return -EBUSY;
+	}
+
+	if (port == monitor_port) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Cannot mirror the monitor port");
+		return -EINVAL;
+	}
+
+	lan966x->mirror_mask[ingress] |= BIT(port->chip_port);
+
+	lan966x->mirror_monitor = monitor_port;
+	lan_wr(BIT(monitor_port->chip_port), lan966x, ANA_MIRRORPORTS);
+
+	if (ingress) {
+		lan_rmw(ANA_PORT_CFG_SRC_MIRROR_ENA_SET(1),
+			ANA_PORT_CFG_SRC_MIRROR_ENA,
+			lan966x, ANA_PORT_CFG(port->chip_port));
+	} else {
+		lan_wr(lan966x->mirror_mask[0], lan966x,
+		       ANA_EMIRRORPORTS);
+	}
+
+	lan966x->mirror_count++;
+
+	if (ingress)
+		port->tc.ingress_mirror_id = mirror_id;
+	else
+		port->tc.egress_mirror_id = mirror_id;
+
+	return 0;
+}
+
+int lan966x_mirror_port_del(struct lan966x_port *port,
+			    bool ingress,
+			    struct netlink_ext_ack *extack)
+{
+	struct lan966x *lan966x = port->lan966x;
+
+	if (!(lan966x->mirror_mask[ingress] & BIT(port->chip_port))) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "There is no mirroring for this port");
+		return -ENOENT;
+	}
+
+	lan966x->mirror_mask[ingress] &= ~BIT(port->chip_port);
+
+	if (ingress) {
+		lan_rmw(ANA_PORT_CFG_SRC_MIRROR_ENA_SET(0),
+			ANA_PORT_CFG_SRC_MIRROR_ENA,
+			lan966x, ANA_PORT_CFG(port->chip_port));
+	} else {
+		lan_wr(lan966x->mirror_mask[0], lan966x,
+		       ANA_EMIRRORPORTS);
+	}
+
+	lan966x->mirror_count--;
+
+	if (lan966x->mirror_count == 0) {
+		lan966x->mirror_monitor = NULL;
+		lan_wr(0, lan966x, ANA_MIRRORPORTS);
+	}
+
+	if (ingress)
+		port->tc.ingress_mirror_id = 0;
+	else
+		port->tc.egress_mirror_id = 0;
+
+	return 0;
+}
+
+void lan966x_mirror_port_stats(struct lan966x_port *port,
+			       struct flow_stats *stats,
+			       bool ingress)
+{
+	struct rtnl_link_stats64 new_stats;
+	struct flow_stats *old_stats;
+
+	old_stats = &port->tc.mirror_stat;
+	lan966x_stats_get(port->dev, &new_stats);
+
+	if (ingress) {
+		flow_stats_update(stats,
+				  new_stats.rx_bytes - old_stats->bytes,
+				  new_stats.rx_packets - old_stats->pkts,
+				  new_stats.rx_dropped - old_stats->drops,
+				  old_stats->lastused,
+				  FLOW_ACTION_HW_STATS_IMMEDIATE);
+
+		old_stats->bytes = new_stats.rx_bytes;
+		old_stats->pkts = new_stats.rx_packets;
+		old_stats->drops = new_stats.rx_dropped;
+		old_stats->lastused = jiffies;
+	} else {
+		flow_stats_update(stats,
+				  new_stats.tx_bytes - old_stats->bytes,
+				  new_stats.tx_packets - old_stats->pkts,
+				  new_stats.tx_dropped - old_stats->drops,
+				  old_stats->lastused,
+				  FLOW_ACTION_HW_STATS_IMMEDIATE);
+
+		old_stats->bytes = new_stats.tx_bytes;
+		old_stats->pkts = new_stats.tx_packets;
+		old_stats->drops = new_stats.tx_dropped;
+		old_stats->lastused = jiffies;
+	}
+}
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h b/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
index 5cb88d81afba..1d90b93dd417 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_regs.h
@@ -90,6 +90,24 @@ enum lan966x_target {
 #define ANA_AUTOAGE_AGE_PERIOD_GET(x)\
 	FIELD_GET(ANA_AUTOAGE_AGE_PERIOD, x)
 
+/*      ANA:ANA:MIRRORPORTS */
+#define ANA_MIRRORPORTS           __REG(TARGET_ANA, 0, 1, 29824, 0, 1, 244, 60, 0, 1, 4)
+
+#define ANA_MIRRORPORTS_MIRRORPORTS              GENMASK(8, 0)
+#define ANA_MIRRORPORTS_MIRRORPORTS_SET(x)\
+	FIELD_PREP(ANA_MIRRORPORTS_MIRRORPORTS, x)
+#define ANA_MIRRORPORTS_MIRRORPORTS_GET(x)\
+	FIELD_GET(ANA_MIRRORPORTS_MIRRORPORTS, x)
+
+/*      ANA:ANA:EMIRRORPORTS */
+#define ANA_EMIRRORPORTS          __REG(TARGET_ANA, 0, 1, 29824, 0, 1, 244, 64, 0, 1, 4)
+
+#define ANA_EMIRRORPORTS_EMIRRORPORTS            GENMASK(8, 0)
+#define ANA_EMIRRORPORTS_EMIRRORPORTS_SET(x)\
+	FIELD_PREP(ANA_EMIRRORPORTS_EMIRRORPORTS, x)
+#define ANA_EMIRRORPORTS_EMIRRORPORTS_GET(x)\
+	FIELD_GET(ANA_EMIRRORPORTS_EMIRRORPORTS, x)
+
 /*      ANA:ANA:FLOODING */
 #define ANA_FLOODING(r)           __REG(TARGET_ANA, 0, 1, 29824, 0, 1, 244, 68, r, 8, 4)
 
@@ -330,6 +348,12 @@ enum lan966x_target {
 /*      ANA:PORT:PORT_CFG */
 #define ANA_PORT_CFG(g)           __REG(TARGET_ANA, 0, 1, 28672, g, 9, 128, 112, 0, 1, 4)
 
+#define ANA_PORT_CFG_SRC_MIRROR_ENA              BIT(13)
+#define ANA_PORT_CFG_SRC_MIRROR_ENA_SET(x)\
+	FIELD_PREP(ANA_PORT_CFG_SRC_MIRROR_ENA, x)
+#define ANA_PORT_CFG_SRC_MIRROR_ENA_GET(x)\
+	FIELD_GET(ANA_PORT_CFG_SRC_MIRROR_ENA, x)
+
 #define ANA_PORT_CFG_LEARNAUTO                   BIT(6)
 #define ANA_PORT_CFG_LEARNAUTO_SET(x)\
 	FIELD_PREP(ANA_PORT_CFG_LEARNAUTO, x)
diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c b/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c
index dc065b556ef7..7368433b9277 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_tc_matchall.c
@@ -20,6 +20,9 @@ static int lan966x_tc_matchall_add(struct lan966x_port *port,
 		return lan966x_police_port_add(port, &f->rule->action, act,
 					       f->cookie, ingress,
 					       f->common.extack);
+	case FLOW_ACTION_MIRRED:
+		return lan966x_mirror_port_add(port, act, f->cookie,
+					       ingress, f->common.extack);
 	default:
 		NL_SET_ERR_MSG_MOD(f->common.extack,
 				   "Unsupported action");
@@ -36,6 +39,10 @@ static int lan966x_tc_matchall_del(struct lan966x_port *port,
 	if (f->cookie == port->tc.police_id) {
 		return lan966x_police_port_del(port, f->cookie,
 					       f->common.extack);
+	} else if (f->cookie == port->tc.ingress_mirror_id ||
+		   f->cookie == port->tc.egress_mirror_id) {
+		return lan966x_mirror_port_del(port, ingress,
+					       f->common.extack);
 	} else {
 		NL_SET_ERR_MSG_MOD(f->common.extack,
 				   "Unsupported action");
@@ -51,6 +58,9 @@ static int lan966x_tc_matchall_stats(struct lan966x_port *port,
 {
 	if (f->cookie == port->tc.police_id) {
 		lan966x_police_port_stats(port, &f->stats);
+	} else if (f->cookie == port->tc.ingress_mirror_id ||
+		   f->cookie == port->tc.egress_mirror_id) {
+		lan966x_mirror_port_stats(port, &f->stats, ingress);
 	} else {
 		NL_SET_ERR_MSG_MOD(f->common.extack,
 				   "Unsupported action");
-- 
cgit v1.2.3


From 12aece8b01507a2d357a1861f470e83621fbb6f2 Mon Sep 17 00:00:00 2001
From: Zheng Wang <zyytlz.wz@163.com>
Date: Sat, 1 Oct 2022 01:57:25 +0800
Subject: eth: sp7021: fix use after free bug in spl2sw_nvmem_get_mac_address

This frees "mac" and tries to display its address as part of the error
message on the next line.  Swap the order.

Fixes: fd3040b9394c ("net: ethernet: Add driver for Sunplus SP7021")
Signed-off-by: Zheng Wang <zyytlz.wz@163.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sunplus/spl2sw_driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sunplus/spl2sw_driver.c b/drivers/net/ethernet/sunplus/spl2sw_driver.c
index 546206640492..61d1d07dc070 100644
--- a/drivers/net/ethernet/sunplus/spl2sw_driver.c
+++ b/drivers/net/ethernet/sunplus/spl2sw_driver.c
@@ -248,8 +248,8 @@ static int spl2sw_nvmem_get_mac_address(struct device *dev, struct device_node *
 
 	/* Check if mac address is valid */
 	if (!is_valid_ether_addr(mac)) {
-		kfree(mac);
 		dev_info(dev, "Invalid mac address in nvmem (%pM)!\n", mac);
+		kfree(mac);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From ca7f49ff884677f97858c3934806e0e666425af0 Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:42 +0530
Subject: octeontx2-af: cn10k: Introduce driver for macsec block.

CN10K-B and CNF10K-B has macsec block(MCS) to encrypt and
decrypt packets at MAC level. This block is a global resource
with hardware resources like SecYs, SCs and SAs and is in
between NIX block and RPM LMAC. CN10K-B silicon has only one MCS
block which receives packets from all LMACS whereas CNF10K-B has
seven MCS blocks for seven LMACs. Both MCS blocks are
similar in operation except for few register offsets and some
configurations require writing to different registers. Those
differences between IPs are handled using separate ops.
This patch adds basic driver and does the initial hardware
calibration and parser configuration.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/Makefile |   2 +-
 drivers/net/ethernet/marvell/octeontx2/af/mcs.c    | 368 +++++++++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.h    |  92 ++++++
 .../ethernet/marvell/octeontx2/af/mcs_cnf10kb.c    |  65 ++++
 .../net/ethernet/marvell/octeontx2/af/mcs_reg.h    |  78 +++++
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c |  52 +++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c    |  14 +
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |   8 +
 8 files changed, 678 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mcs.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mcs.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/Makefile b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
index 40203560b291..3cf4c8285c90 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/af/Makefile
@@ -11,4 +11,4 @@ rvu_mbox-y := mbox.o rvu_trace.o
 rvu_af-y := cgx.o rvu.o rvu_cgx.o rvu_npa.o rvu_nix.o \
 		  rvu_reg.o rvu_npc.o rvu_debugfs.o ptp.o rvu_npc_fs.o \
 		  rvu_cpt.o rvu_devlink.o rpm.o rvu_cn10k.o rvu_switch.o \
-		  rvu_sdp.o rvu_npc_hash.o
+		  rvu_sdp.o rvu_npc_hash.o mcs.o mcs_rvu_if.o mcs_cnf10kb.o
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
new file mode 100644
index 000000000000..259ccb42af6b
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
@@ -0,0 +1,368 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell MCS driver
+ *
+ * Copyright (C) 2022 Marvell.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "mcs.h"
+#include "mcs_reg.h"
+
+#define DRV_NAME	"Marvell MCS Driver"
+
+#define PCI_CFG_REG_BAR_NUM	0
+
+static const struct pci_device_id mcs_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_CAVIUM, PCI_DEVID_CN10K_MCS) },
+	{ 0, }  /* end of table */
+};
+
+static LIST_HEAD(mcs_list);
+
+static void *alloc_mem(struct mcs *mcs, int n)
+{
+	return devm_kcalloc(mcs->dev, n, sizeof(u16), GFP_KERNEL);
+}
+
+static int mcs_alloc_struct_mem(struct mcs *mcs, struct mcs_rsrc_map *res)
+{
+	struct hwinfo *hw = mcs->hw;
+	int err;
+
+	res->flowid2pf_map = alloc_mem(mcs, hw->tcam_entries);
+	if (!res->flowid2pf_map)
+		return -ENOMEM;
+
+	res->secy2pf_map = alloc_mem(mcs, hw->secy_entries);
+	if (!res->secy2pf_map)
+		return -ENOMEM;
+
+	res->sc2pf_map = alloc_mem(mcs, hw->sc_entries);
+	if (!res->sc2pf_map)
+		return -ENOMEM;
+
+	res->sa2pf_map = alloc_mem(mcs, hw->sa_entries);
+	if (!res->sa2pf_map)
+		return -ENOMEM;
+
+	res->flowid2secy_map = alloc_mem(mcs, hw->tcam_entries);
+	if (!res->flowid2secy_map)
+		return -ENOMEM;
+
+	res->flow_ids.max = hw->tcam_entries - MCS_RSRC_RSVD_CNT;
+	err = rvu_alloc_bitmap(&res->flow_ids);
+	if (err)
+		return err;
+
+	res->secy.max = hw->secy_entries - MCS_RSRC_RSVD_CNT;
+	err = rvu_alloc_bitmap(&res->secy);
+	if (err)
+		return err;
+
+	res->sc.max = hw->sc_entries;
+	err = rvu_alloc_bitmap(&res->sc);
+	if (err)
+		return err;
+
+	res->sa.max = hw->sa_entries;
+	err = rvu_alloc_bitmap(&res->sa);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+int mcs_get_blkcnt(void)
+{
+	struct mcs *mcs;
+	int idmax = -ENODEV;
+
+	/* Check MCS block is present in hardware */
+	if (!pci_dev_present(mcs_id_table))
+		return 0;
+
+	list_for_each_entry(mcs, &mcs_list, mcs_list)
+		if (mcs->mcs_id > idmax)
+			idmax = mcs->mcs_id;
+
+	if (idmax < 0)
+		return 0;
+
+	return idmax + 1;
+}
+
+struct mcs *mcs_get_pdata(int mcs_id)
+{
+	struct mcs *mcs_dev;
+
+	list_for_each_entry(mcs_dev, &mcs_list, mcs_list) {
+		if (mcs_dev->mcs_id == mcs_id)
+			return mcs_dev;
+	}
+	return NULL;
+}
+
+/* Set lmac to bypass/operational mode */
+void mcs_set_lmac_mode(struct mcs *mcs, int lmac_id, u8 mode)
+{
+	u64 reg;
+
+	reg = MCSX_MCS_TOP_SLAVE_CHANNEL_CFG(lmac_id * 2);
+	mcs_reg_write(mcs, reg, (u64)mode);
+}
+
+void cn10kb_mcs_parser_cfg(struct mcs *mcs)
+{
+	u64 reg, val;
+
+	/* VLAN CTag */
+	val = BIT_ULL(0) | (0x8100ull & 0xFFFF) << 1 | BIT_ULL(17);
+	/* RX */
+	reg = MCSX_PEX_RX_SLAVE_VLAN_CFGX(0);
+	mcs_reg_write(mcs, reg, val);
+
+	/* TX */
+	reg = MCSX_PEX_TX_SLAVE_VLAN_CFGX(0);
+	mcs_reg_write(mcs, reg, val);
+
+	/* VLAN STag */
+	val = BIT_ULL(0) | (0x88a8ull & 0xFFFF) << 1 | BIT_ULL(18);
+	/* RX */
+	reg = MCSX_PEX_RX_SLAVE_VLAN_CFGX(1);
+	mcs_reg_write(mcs, reg, val);
+
+	/* TX */
+	reg = MCSX_PEX_TX_SLAVE_VLAN_CFGX(1);
+	mcs_reg_write(mcs, reg, val);
+}
+
+static void mcs_lmac_init(struct mcs *mcs, int lmac_id)
+{
+	u64 reg;
+
+	/* Port mode 25GB */
+	reg = MCSX_PAB_RX_SLAVE_PORT_CFGX(lmac_id);
+	mcs_reg_write(mcs, reg, 0);
+
+	if (mcs->hw->mcs_blks > 1) {
+		reg = MCSX_PAB_RX_SLAVE_FIFO_SKID_CFGX(lmac_id);
+		mcs_reg_write(mcs, reg, 0xe000e);
+		return;
+	}
+
+	reg = MCSX_PAB_TX_SLAVE_PORT_CFGX(lmac_id);
+	mcs_reg_write(mcs, reg, 0);
+}
+
+int mcs_set_lmac_channels(int mcs_id, u16 base)
+{
+	struct mcs *mcs;
+	int lmac;
+	u64 cfg;
+
+	mcs = mcs_get_pdata(mcs_id);
+	if (!mcs)
+		return -ENODEV;
+	for (lmac = 0; lmac < mcs->hw->lmac_cnt; lmac++) {
+		cfg = mcs_reg_read(mcs, MCSX_LINK_LMACX_CFG(lmac));
+		cfg &= ~(MCSX_LINK_LMAC_BASE_MASK | MCSX_LINK_LMAC_RANGE_MASK);
+		cfg |=	FIELD_PREP(MCSX_LINK_LMAC_RANGE_MASK, ilog2(16));
+		cfg |=	FIELD_PREP(MCSX_LINK_LMAC_BASE_MASK, base);
+		mcs_reg_write(mcs, MCSX_LINK_LMACX_CFG(lmac), cfg);
+		base += 16;
+	}
+	return 0;
+}
+
+static int mcs_x2p_calibration(struct mcs *mcs)
+{
+	unsigned long timeout = jiffies + usecs_to_jiffies(20000);
+	int i, err = 0;
+	u64 val;
+
+	/* set X2P calibration */
+	val = mcs_reg_read(mcs, MCSX_MIL_GLOBAL);
+	val |= BIT_ULL(5);
+	mcs_reg_write(mcs, MCSX_MIL_GLOBAL, val);
+
+	/* Wait for calibration to complete */
+	while (!(mcs_reg_read(mcs, MCSX_MIL_RX_GBL_STATUS) & BIT_ULL(0))) {
+		if (time_before(jiffies, timeout)) {
+			usleep_range(80, 100);
+			continue;
+		} else {
+			err = -EBUSY;
+			dev_err(mcs->dev, "MCS X2P calibration failed..ignoring\n");
+			return err;
+		}
+	}
+
+	val = mcs_reg_read(mcs, MCSX_MIL_RX_GBL_STATUS);
+	for (i = 0; i < mcs->hw->mcs_x2p_intf; i++) {
+		if (val & BIT_ULL(1 + i))
+			continue;
+		err = -EBUSY;
+		dev_err(mcs->dev, "MCS:%d didn't respond to X2P calibration\n", i);
+	}
+	/* Clear X2P calibrate */
+	mcs_reg_write(mcs, MCSX_MIL_GLOBAL, mcs_reg_read(mcs, MCSX_MIL_GLOBAL) & ~BIT_ULL(5));
+
+	return err;
+}
+
+static void mcs_set_external_bypass(struct mcs *mcs, u8 bypass)
+{
+	u64 val;
+
+	/* Set MCS to external bypass */
+	val = mcs_reg_read(mcs, MCSX_MIL_GLOBAL);
+	if (bypass)
+		val |= BIT_ULL(6);
+	else
+		val &= ~BIT_ULL(6);
+	mcs_reg_write(mcs, MCSX_MIL_GLOBAL, val);
+}
+
+static void mcs_global_cfg(struct mcs *mcs)
+{
+	/* Disable external bypass */
+	mcs_set_external_bypass(mcs, false);
+
+	/* Set MCS to perform standard IEEE802.1AE macsec processing */
+	if (mcs->hw->mcs_blks == 1) {
+		mcs_reg_write(mcs, MCSX_IP_MODE, BIT_ULL(3));
+		return;
+	}
+
+	mcs_reg_write(mcs, MCSX_BBE_RX_SLAVE_CAL_ENTRY, 0xe4);
+	mcs_reg_write(mcs, MCSX_BBE_RX_SLAVE_CAL_LEN, 4);
+}
+
+void cn10kb_mcs_set_hw_capabilities(struct mcs *mcs)
+{
+	struct hwinfo *hw = mcs->hw;
+
+	hw->tcam_entries = 128;		/* TCAM entries */
+	hw->secy_entries  = 128;	/* SecY entries */
+	hw->sc_entries = 128;		/* SC CAM entries */
+	hw->sa_entries = 256;		/* SA entries */
+	hw->lmac_cnt = 20;		/* lmacs/ports per mcs block */
+	hw->mcs_x2p_intf = 5;		/* x2p clabration intf */
+	hw->mcs_blks = 1;		/* MCS blocks */
+}
+
+static struct mcs_ops cn10kb_mcs_ops = {
+	.mcs_set_hw_capabilities	= cn10kb_mcs_set_hw_capabilities,
+	.mcs_parser_cfg			= cn10kb_mcs_parser_cfg,
+};
+
+static int mcs_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+	int lmac, err = 0;
+	struct mcs *mcs;
+
+	mcs = devm_kzalloc(dev, sizeof(*mcs), GFP_KERNEL);
+	if (!mcs)
+		return -ENOMEM;
+
+	mcs->hw = devm_kzalloc(dev, sizeof(struct hwinfo), GFP_KERNEL);
+	if (!mcs->hw)
+		return -ENOMEM;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(dev, "Failed to enable PCI device\n");
+		pci_set_drvdata(pdev, NULL);
+		return err;
+	}
+
+	err = pci_request_regions(pdev, DRV_NAME);
+	if (err) {
+		dev_err(dev, "PCI request regions failed 0x%x\n", err);
+		goto exit;
+	}
+
+	mcs->reg_base = pcim_iomap(pdev, PCI_CFG_REG_BAR_NUM, 0);
+	if (!mcs->reg_base) {
+		dev_err(dev, "mcs: Cannot map CSR memory space, aborting\n");
+		err = -ENOMEM;
+		goto exit;
+	}
+
+	pci_set_drvdata(pdev, mcs);
+	mcs->pdev = pdev;
+	mcs->dev = &pdev->dev;
+
+	if (pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_B)
+		mcs->mcs_ops = &cn10kb_mcs_ops;
+	else
+		mcs->mcs_ops = cnf10kb_get_mac_ops();
+
+	/* Set hardware capabilities */
+	mcs->mcs_ops->mcs_set_hw_capabilities(mcs);
+
+	mcs_global_cfg(mcs);
+
+	/* Perform X2P clibration */
+	err = mcs_x2p_calibration(mcs);
+	if (err)
+		goto err_x2p;
+
+	mcs->mcs_id = (pci_resource_start(pdev, PCI_CFG_REG_BAR_NUM) >> 24)
+			& MCS_ID_MASK;
+
+	/* Set mcs tx side resources */
+	err = mcs_alloc_struct_mem(mcs, &mcs->tx);
+	if (err)
+		goto err_x2p;
+
+	/* Set mcs rx side resources */
+	err = mcs_alloc_struct_mem(mcs, &mcs->rx);
+	if (err)
+		goto err_x2p;
+
+	/* per port config */
+	for (lmac = 0; lmac < mcs->hw->lmac_cnt; lmac++)
+		mcs_lmac_init(mcs, lmac);
+
+	/* Parser configuration */
+	mcs->mcs_ops->mcs_parser_cfg(mcs);
+
+	list_add(&mcs->mcs_list, &mcs_list);
+
+	return 0;
+
+err_x2p:
+	/* Enable external bypass */
+	mcs_set_external_bypass(mcs, true);
+exit:
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+	return err;
+}
+
+static void mcs_remove(struct pci_dev *pdev)
+{
+	struct mcs *mcs = pci_get_drvdata(pdev);
+
+	/* Set MCS to external bypass */
+	mcs_set_external_bypass(mcs, true);
+	pci_free_irq_vectors(pdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+struct pci_driver mcs_driver = {
+	.name = DRV_NAME,
+	.id_table = mcs_id_table,
+	.probe = mcs_probe,
+	.remove = mcs_remove,
+};
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
new file mode 100644
index 000000000000..002fee898e7b
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Marvell CN10K MCS driver
+ *
+ * Copyright (C) 2022 Marvell.
+ */
+
+#ifndef MCS_H
+#define MCS_H
+
+#include <linux/bits.h>
+#include "rvu.h"
+
+#define PCI_DEVID_CN10K_MCS		0xA096
+
+#define MCSX_LINK_LMAC_RANGE_MASK	GENMASK_ULL(19, 16)
+#define MCSX_LINK_LMAC_BASE_MASK	GENMASK_ULL(11, 0)
+
+#define MCS_ID_MASK			0x7
+
+/* Reserved resources for default bypass entry */
+#define MCS_RSRC_RSVD_CNT		1
+
+struct mcs_rsrc_map {
+	u16 *flowid2pf_map;
+	u16 *secy2pf_map;
+	u16 *sc2pf_map;
+	u16 *sa2pf_map;
+	u16 *flowid2secy_map;	/* bitmap flowid mapped to secy*/
+	struct rsrc_bmap	flow_ids;
+	struct rsrc_bmap	secy;
+	struct rsrc_bmap	sc;
+	struct rsrc_bmap	sa;
+};
+
+struct hwinfo {
+	u8 tcam_entries;
+	u8 secy_entries;
+	u8 sc_entries;
+	u16 sa_entries;
+	u8 mcs_x2p_intf;
+	u8 lmac_cnt;
+	u8 mcs_blks;
+	unsigned long	lmac_bmap; /* bitmap of enabled mcs lmac */
+};
+
+struct mcs {
+	void __iomem		*reg_base;
+	struct pci_dev		*pdev;
+	struct device		*dev;
+	struct hwinfo		*hw;
+	struct mcs_rsrc_map	tx;
+	struct mcs_rsrc_map	rx;
+	u8			mcs_id;
+	struct mcs_ops		*mcs_ops;
+	struct list_head	mcs_list;
+};
+
+struct mcs_ops {
+	void	(*mcs_set_hw_capabilities)(struct mcs *mcs);
+	void	(*mcs_parser_cfg)(struct mcs *mcs);
+};
+
+extern struct pci_driver mcs_driver;
+
+static inline void mcs_reg_write(struct mcs *mcs, u64 offset, u64 val)
+{
+	writeq(val, mcs->reg_base + offset);
+}
+
+static inline u64 mcs_reg_read(struct mcs *mcs, u64 offset)
+{
+	return readq(mcs->reg_base + offset);
+}
+
+/* MCS APIs */
+struct mcs *mcs_get_pdata(int mcs_id);
+int mcs_get_blkcnt(void);
+int mcs_set_lmac_channels(int mcs_id, u16 base);
+
+int mcs_install_flowid_bypass_entry(struct mcs *mcs);
+void mcs_set_lmac_mode(struct mcs *mcs, int lmac_id, u8 mode);
+
+/* CN10K-B APIs */
+void cn10kb_mcs_set_hw_capabilities(struct mcs *mcs);
+void cn10kb_mcs_parser_cfg(struct mcs *mcs);
+
+/* CNF10K-B APIs */
+struct mcs_ops *cnf10kb_get_mac_ops(void);
+void cnf10kb_mcs_set_hw_capabilities(struct mcs *mcs);
+void cnf10kb_mcs_parser_cfg(struct mcs *mcs);
+
+#endif /* MCS_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
new file mode 100644
index 000000000000..62c83a3eb1c8
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell MCS driver
+ *
+ * Copyright (C) 2022 Marvell.
+ */
+
+#include "mcs.h"
+#include "mcs_reg.h"
+
+static struct mcs_ops cnf10kb_mcs_ops   = {
+	.mcs_set_hw_capabilities	= cnf10kb_mcs_set_hw_capabilities,
+	.mcs_parser_cfg			= cnf10kb_mcs_parser_cfg,
+};
+
+struct mcs_ops *cnf10kb_get_mac_ops(void)
+{
+	return &cnf10kb_mcs_ops;
+}
+
+void cnf10kb_mcs_set_hw_capabilities(struct mcs *mcs)
+{
+	struct hwinfo *hw = mcs->hw;
+
+	hw->tcam_entries = 64;		/* TCAM entries */
+	hw->secy_entries  = 64;		/* SecY entries */
+	hw->sc_entries = 64;		/* SC CAM entries */
+	hw->sa_entries = 128;		/* SA entries */
+	hw->lmac_cnt = 4;		/* lmacs/ports per mcs block */
+	hw->mcs_x2p_intf = 1;		/* x2p clabration intf */
+	hw->mcs_blks = 7;		/* MCS blocks */
+}
+
+void cnf10kb_mcs_parser_cfg(struct mcs *mcs)
+{
+	u64 reg, val;
+
+	/* VLAN Ctag */
+	val = (0x8100ull & 0xFFFF) | BIT_ULL(20) | BIT_ULL(22);
+
+	reg = MCSX_PEX_RX_SLAVE_CUSTOM_TAGX(0);
+	mcs_reg_write(mcs, reg, val);
+
+	reg = MCSX_PEX_TX_SLAVE_CUSTOM_TAGX(0);
+	mcs_reg_write(mcs, reg, val);
+
+	/* VLAN STag */
+	val = (0x88a8ull & 0xFFFF) | BIT_ULL(20) | BIT_ULL(23);
+
+	/* RX */
+	reg = MCSX_PEX_RX_SLAVE_CUSTOM_TAGX(1);
+	mcs_reg_write(mcs, reg, val);
+
+	/* TX */
+	reg = MCSX_PEX_TX_SLAVE_CUSTOM_TAGX(1);
+	mcs_reg_write(mcs, reg, val);
+
+	/* Enable custom tage 0 and 1 and sectag */
+	val = BIT_ULL(0) | BIT_ULL(1) | BIT_ULL(12);
+
+	reg = MCSX_PEX_RX_SLAVE_ETYPE_ENABLE;
+	mcs_reg_write(mcs, reg, val);
+
+	reg = MCSX_PEX_TX_SLAVE_ETYPE_ENABLE;
+	mcs_reg_write(mcs, reg, val);
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
new file mode 100644
index 000000000000..61bf8ab257f9
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Marvell MCS driver
+ *
+ * Copyright (C) 2022 Marvell.
+ */
+
+#ifndef MCS_REG_H
+#define MCS_REG_H
+
+#include <linux/bits.h>
+
+/* Registers */
+#define MCSX_IP_MODE					0x900c8ull
+
+#define MCSX_MCS_TOP_SLAVE_CHANNEL_CFG(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x808ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa68ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_MIL_GLOBAL	({				\
+	u64 offset;					\
+							\
+	offset = 0x80000ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x60000ull;			\
+	offset; })
+
+#define MCSX_LINK_LMACX_CFG(a) ({			\
+	u64 offset;					\
+							\
+	offset = 0x90000ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x70000ull;			\
+	offset += (a) * 0x800ull;			\
+	offset; })
+
+#define MCSX_MIL_RX_GBL_STATUS ({			\
+	u64 offset;					\
+							\
+	offset = 0x800c8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x600c8ull;			\
+	offset; })
+
+/* PAB */
+#define MCSX_PAB_RX_SLAVE_PORT_CFGX(a) ({	\
+	u64 offset;				\
+						\
+	offset = 0x1718ull;			\
+	if (mcs->hw->mcs_blks > 1)		\
+		offset = 0x280ull;		\
+	offset += (a) * 0x40ull;		\
+	offset; })
+
+#define MCSX_PAB_TX_SLAVE_PORT_CFGX(a)			(0x2930ull + (a) * 0x40ull)
+
+/* PEX registers */
+#define MCSX_PEX_RX_SLAVE_VLAN_CFGX(a)          (0x3b58ull + (a) * 0x8ull)
+#define MCSX_PEX_TX_SLAVE_VLAN_CFGX(a)          (0x46f8ull + (a) * 0x8ull)
+
+/* CNF10K-B */
+#define MCSX_PEX_RX_SLAVE_CUSTOM_TAGX(a)        (0x4c8ull + (a) * 0x8ull)
+#define MCSX_PEX_TX_SLAVE_CUSTOM_TAGX(a)        (0x748ull + (a) * 0x8ull)
+#define MCSX_PEX_RX_SLAVE_ETYPE_ENABLE          0x6e8ull
+#define MCSX_PEX_TX_SLAVE_ETYPE_ENABLE          0x968ull
+
+/* BEE */
+#define MCSX_BBE_RX_SLAVE_PADDING_CTL			0xe08ull
+#define MCSX_BBE_TX_SLAVE_PADDING_CTL			0x12f8ull
+#define MCSX_BBE_RX_SLAVE_CAL_ENTRY			0x180ull
+#define MCSX_BBE_RX_SLAVE_CAL_LEN			0x188ull
+#define MCSX_PAB_RX_SLAVE_FIFO_SKID_CFGX(a)		(0x290ull + (a) * 0x40ull)
+
+#endif
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
new file mode 100644
index 000000000000..c3f5b39bf17c
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell CN10K MCS driver
+ *
+ * Copyright (C) 2022 Marvell.
+ */
+
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "mcs.h"
+#include "rvu.h"
+#include "lmac_common.h"
+
+static void rvu_mcs_set_lmac_bmap(struct rvu *rvu)
+{
+	struct mcs *mcs = mcs_get_pdata(0);
+	unsigned long lmac_bmap;
+	int cgx, lmac, port;
+
+	for (port = 0; port < mcs->hw->lmac_cnt; port++) {
+		cgx = port / rvu->hw->lmac_per_cgx;
+		lmac = port % rvu->hw->lmac_per_cgx;
+		if (!is_lmac_valid(rvu_cgx_pdata(cgx, rvu), lmac))
+			continue;
+		set_bit(port, &lmac_bmap);
+	}
+	mcs->hw->lmac_bmap = lmac_bmap;
+}
+
+int rvu_mcs_init(struct rvu *rvu)
+{
+	struct rvu_hwinfo *hw = rvu->hw;
+	int err = 0;
+
+	rvu->mcs_blk_cnt = mcs_get_blkcnt();
+
+	if (!rvu->mcs_blk_cnt)
+		return 0;
+
+	/* Needed only for CN10K-B */
+	if (rvu->mcs_blk_cnt == 1) {
+		err = mcs_set_lmac_channels(0, hw->cgx_chan_base);
+		if (err)
+			return err;
+		/* Set active lmacs */
+		rvu_mcs_set_lmac_bmap(rvu);
+	}
+
+	return err;
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 7282a826d81e..5d7464101dae 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -16,6 +16,7 @@
 #include "rvu.h"
 #include "rvu_reg.h"
 #include "ptp.h"
+#include "mcs.h"
 
 #include "rvu_trace.h"
 #include "rvu_npc_hash.h"
@@ -1159,6 +1160,12 @@ cpt:
 
 	rvu_program_channels(rvu);
 
+	err = rvu_mcs_init(rvu);
+	if (err) {
+		dev_err(rvu->dev, "%s: Failed to initialize mcs\n", __func__);
+		goto nix_err;
+	}
+
 	return 0;
 
 nix_err:
@@ -3354,12 +3361,18 @@ static int __init rvu_init_module(void)
 	if (err < 0)
 		goto ptp_err;
 
+	err = pci_register_driver(&mcs_driver);
+	if (err < 0)
+		goto mcs_err;
+
 	err =  pci_register_driver(&rvu_driver);
 	if (err < 0)
 		goto rvu_err;
 
 	return 0;
 rvu_err:
+	pci_unregister_driver(&mcs_driver);
+mcs_err:
 	pci_unregister_driver(&ptp_driver);
 ptp_err:
 	pci_unregister_driver(&cgx_driver);
@@ -3370,6 +3383,7 @@ ptp_err:
 static void __exit rvu_cleanup_module(void)
 {
 	pci_unregister_driver(&rvu_driver);
+	pci_unregister_driver(&mcs_driver);
 	pci_unregister_driver(&ptp_driver);
 	pci_unregister_driver(&cgx_driver);
 }
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index d15bc443335d..9a150dacb012 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -25,6 +25,8 @@
 /* Subsystem Device ID */
 #define PCI_SUBSYS_DEVID_96XX                  0xB200
 #define PCI_SUBSYS_DEVID_CN10K_A	       0xB900
+#define PCI_SUBSYS_DEVID_CNF10K_B              0xBC00
+#define PCI_SUBSYS_DEVID_CN10K_B               0xBD00
 
 /* PCI BAR nos */
 #define	PCI_AF_REG_BAR_NUM			0
@@ -497,6 +499,8 @@ struct rvu {
 
 	struct ptp		*ptp;
 
+	int			mcs_blk_cnt;
+
 #ifdef CONFIG_DEBUG_FS
 	struct rvu_debugfs	rvu_dbg;
 #endif
@@ -868,4 +872,8 @@ void rvu_switch_update_rules(struct rvu *rvu, u16 pcifunc);
 int rvu_npc_set_parse_mode(struct rvu *rvu, u16 pcifunc, u64 mode, u8 dir,
 			   u64 pkind, u8 var_len_off, u8 var_len_off_mask,
 			   u8 shift_dir);
+
+/* CN10K MCS */
+int rvu_mcs_init(struct rvu *rvu);
+
 #endif /* RVU_H */
-- 
cgit v1.2.3


From 080bbd19c9dd386fa5be2cdec6baaf01cf00db5e Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:43 +0530
Subject: octeontx2-af: cn10k: mcs: Add mailboxes for port related operations

There are set of configurations to be done at MCS port level like
bringing port out of reset, making port as operational or bypass.
This patch adds all the port related mailbox message handlers
so that AF consumers can use them.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 111 +++++++++++++++++-
 drivers/net/ethernet/marvell/octeontx2/af/mcs.c    |  94 +++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.h    |  10 ++
 .../net/ethernet/marvell/octeontx2/af/mcs_reg.h    |  36 ++++++
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 129 ++++++++++++++++++++-
 5 files changed, 376 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index e26c3b0c4dcb..207cd4fa6980 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -293,9 +293,21 @@ M(NIX_BANDPROF_ALLOC,	0x801d, nix_bandprof_alloc, nix_bandprof_alloc_req, \
 M(NIX_BANDPROF_FREE,	0x801e, nix_bandprof_free, nix_bandprof_free_req,   \
 				msg_rsp)				    \
 M(NIX_BANDPROF_GET_HWINFO, 0x801f, nix_bandprof_get_hwinfo, msg_req,		\
-				nix_bandprof_get_hwinfo_rsp)
-
-/* Messages initiated by AF (range 0xC00 - 0xDFF) */
+				nix_bandprof_get_hwinfo_rsp)		    \
+/* MCS mbox IDs (range 0xA000 - 0xBFFF) */					\
+M(MCS_SET_ACTIVE_LMAC,	0xa00a,	mcs_set_active_lmac, mcs_set_active_lmac,	\
+				msg_rsp)					\
+M(MCS_GET_HW_INFO,	0xa00b,	mcs_get_hw_info, msg_req, mcs_hw_info)		\
+M(MCS_SET_LMAC_MODE,	0xa013, mcs_set_lmac_mode, mcs_set_lmac_mode, msg_rsp)	\
+M(MCS_PORT_RESET,	0xa018, mcs_port_reset, mcs_port_reset_req, msg_rsp)	\
+M(MCS_PORT_CFG_SET,	0xa019, mcs_port_cfg_set, mcs_port_cfg_set_req, msg_rsp)\
+M(MCS_PORT_CFG_GET,	0xa020, mcs_port_cfg_get, mcs_port_cfg_get_req,		\
+				mcs_port_cfg_get_rsp)				\
+M(MCS_CUSTOM_TAG_CFG_GET, 0xa021, mcs_custom_tag_cfg_get,			\
+				  mcs_custom_tag_cfg_get_req,			\
+				  mcs_custom_tag_cfg_get_rsp)
+
+/* Messages initiated by AF (range 0xC00 - 0xEFF) */
 #define MBOX_UP_CGX_MESSAGES						\
 M(CGX_LINK_EVENT,	0xC00, cgx_link_event, cgx_link_info_msg, msg_rsp)
 
@@ -1657,4 +1669,97 @@ enum cgx_af_status {
 	LMAC_AF_ERR_EXACT_MATCH_TBL_LOOK_UP_FAILED = -1110,
 };
 
+enum mcs_direction {
+	MCS_RX,
+	MCS_TX,
+};
+
+struct mcs_hw_info {
+	struct mbox_msghdr hdr;
+	u8 num_mcs_blks;	/* Number of MCS blocks */
+	u8 tcam_entries;	/* RX/TX Tcam entries per mcs block */
+	u8 secy_entries;	/* RX/TX SECY entries per mcs block */
+	u8 sc_entries;		/* RX/TX SC CAM entries per mcs block */
+	u8 sa_entries;		/* PN table entries = SA entries */
+	u64 rsvd[16];
+};
+
+struct mcs_set_active_lmac {
+	struct mbox_msghdr hdr;
+	u32 lmac_bmap;	/* bitmap of active lmac per mcs block */
+	u8 mcs_id;
+	u16 chan_base; /* MCS channel base */
+	u64 rsvd;
+};
+
+struct mcs_set_lmac_mode {
+	struct mbox_msghdr hdr;
+	u8 mode;	/* 1:Bypass 0:Operational */
+	u8 lmac_id;
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_port_reset_req {
+	struct mbox_msghdr hdr;
+	u8 reset;
+	u8 mcs_id;
+	u8 port_id;
+	u64 rsvd;
+};
+
+struct mcs_port_cfg_set_req {
+	struct mbox_msghdr hdr;
+	u8 cstm_tag_rel_mode_sel;
+	u8 custom_hdr_enb;
+	u8 fifo_skid;
+	u8 port_mode;
+	u8 port_id;
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_port_cfg_get_req {
+	struct mbox_msghdr hdr;
+	u8 port_id;
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_port_cfg_get_rsp {
+	struct mbox_msghdr hdr;
+	u8 cstm_tag_rel_mode_sel;
+	u8 custom_hdr_enb;
+	u8 fifo_skid;
+	u8 port_mode;
+	u8 port_id;
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_custom_tag_cfg_get_req {
+	struct mbox_msghdr hdr;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+struct mcs_custom_tag_cfg_get_rsp {
+	struct mbox_msghdr hdr;
+	u16 cstm_etype[8];
+	u8 cstm_indx[8];
+	u8 cstm_etype_en;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+/* MCS mailbox error codes
+ * Range 1201 - 1300.
+ */
+enum mcs_af_status {
+	MCS_AF_ERR_INVALID_MCSID        = -1201,
+	MCS_AF_ERR_NOT_MAPPED           = -1202,
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
index 259ccb42af6b..555f3b220d20 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
@@ -107,6 +107,100 @@ struct mcs *mcs_get_pdata(int mcs_id)
 	return NULL;
 }
 
+void mcs_set_port_cfg(struct mcs *mcs, struct mcs_port_cfg_set_req *req)
+{
+	u64 val = 0;
+
+	mcs_reg_write(mcs, MCSX_PAB_RX_SLAVE_PORT_CFGX(req->port_id),
+		      req->port_mode & MCS_PORT_MODE_MASK);
+
+	req->cstm_tag_rel_mode_sel &= 0x3;
+
+	if (mcs->hw->mcs_blks > 1) {
+		req->fifo_skid &= MCS_PORT_FIFO_SKID_MASK;
+		val = (u32)req->fifo_skid << 0x10;
+		val |= req->fifo_skid;
+		mcs_reg_write(mcs, MCSX_PAB_RX_SLAVE_FIFO_SKID_CFGX(req->port_id), val);
+		mcs_reg_write(mcs, MCSX_PEX_TX_SLAVE_CUSTOM_TAG_REL_MODE_SEL(req->port_id),
+			      req->cstm_tag_rel_mode_sel);
+		val = mcs_reg_read(mcs, MCSX_PEX_RX_SLAVE_PEX_CONFIGURATION);
+
+		if (req->custom_hdr_enb)
+			val |= BIT_ULL(req->port_id);
+		else
+			val &= ~BIT_ULL(req->port_id);
+
+		mcs_reg_write(mcs, MCSX_PEX_RX_SLAVE_PEX_CONFIGURATION, val);
+	} else {
+		val = mcs_reg_read(mcs, MCSX_PEX_TX_SLAVE_PORT_CONFIG(req->port_id));
+		val |= (req->cstm_tag_rel_mode_sel << 2);
+		mcs_reg_write(mcs, MCSX_PEX_TX_SLAVE_PORT_CONFIG(req->port_id), val);
+	}
+}
+
+void mcs_get_port_cfg(struct mcs *mcs, struct mcs_port_cfg_get_req *req,
+		      struct mcs_port_cfg_get_rsp *rsp)
+{
+	u64 reg = 0;
+
+	rsp->port_mode = mcs_reg_read(mcs, MCSX_PAB_RX_SLAVE_PORT_CFGX(req->port_id)) &
+			 MCS_PORT_MODE_MASK;
+
+	if (mcs->hw->mcs_blks > 1) {
+		reg = MCSX_PAB_RX_SLAVE_FIFO_SKID_CFGX(req->port_id);
+		rsp->fifo_skid = mcs_reg_read(mcs, reg) & MCS_PORT_FIFO_SKID_MASK;
+		reg = MCSX_PEX_TX_SLAVE_CUSTOM_TAG_REL_MODE_SEL(req->port_id);
+		rsp->cstm_tag_rel_mode_sel = mcs_reg_read(mcs, reg) & 0x3;
+		if (mcs_reg_read(mcs, MCSX_PEX_RX_SLAVE_PEX_CONFIGURATION) & BIT_ULL(req->port_id))
+			rsp->custom_hdr_enb = 1;
+	} else {
+		reg = MCSX_PEX_TX_SLAVE_PORT_CONFIG(req->port_id);
+		rsp->cstm_tag_rel_mode_sel = mcs_reg_read(mcs, reg) >> 2;
+	}
+
+	rsp->port_id = req->port_id;
+	rsp->mcs_id = req->mcs_id;
+}
+
+void mcs_get_custom_tag_cfg(struct mcs *mcs, struct mcs_custom_tag_cfg_get_req *req,
+			    struct mcs_custom_tag_cfg_get_rsp *rsp)
+{
+	u64 reg = 0, val = 0;
+	u8 idx;
+
+	for (idx = 0; idx < MCS_MAX_CUSTOM_TAGS; idx++) {
+		if (mcs->hw->mcs_blks > 1)
+			reg  = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_CUSTOM_TAGX(idx) :
+				MCSX_PEX_TX_SLAVE_CUSTOM_TAGX(idx);
+		else
+			reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_VLAN_CFGX(idx) :
+				MCSX_PEX_TX_SLAVE_VLAN_CFGX(idx);
+
+		val = mcs_reg_read(mcs, reg);
+		if (mcs->hw->mcs_blks > 1) {
+			rsp->cstm_etype[idx] = val & GENMASK(15, 0);
+			rsp->cstm_indx[idx] = (val >> 0x16) & 0x3;
+			reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_ETYPE_ENABLE :
+				MCSX_PEX_TX_SLAVE_ETYPE_ENABLE;
+			rsp->cstm_etype_en = mcs_reg_read(mcs, reg) & 0xFF;
+		} else {
+			rsp->cstm_etype[idx] = (val >> 0x1) & GENMASK(15, 0);
+			rsp->cstm_indx[idx] = (val >> 0x11) & 0x3;
+			rsp->cstm_etype_en |= (val & 0x1) << idx;
+		}
+	}
+
+	rsp->mcs_id = req->mcs_id;
+	rsp->dir = req->dir;
+}
+
+void mcs_reset_port(struct mcs *mcs, u8 port_id, u8 reset)
+{
+	u64 reg = MCSX_MCS_TOP_SLAVE_PORT_RESET(port_id);
+
+	mcs_reg_write(mcs, reg, reset & 0x1);
+}
+
 /* Set lmac to bypass/operational mode */
 void mcs_set_lmac_mode(struct mcs *mcs, int lmac_id, u8 mode)
 {
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
index 002fee898e7b..c11d507a77e3 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
@@ -17,6 +17,10 @@
 
 #define MCS_ID_MASK			0x7
 
+#define MCS_PORT_MODE_MASK		0x3
+#define MCS_PORT_FIFO_SKID_MASK		0x3F
+#define MCS_MAX_CUSTOM_TAGS		0x8
+
 /* Reserved resources for default bypass entry */
 #define MCS_RSRC_RSVD_CNT		1
 
@@ -79,6 +83,12 @@ int mcs_set_lmac_channels(int mcs_id, u16 base);
 
 int mcs_install_flowid_bypass_entry(struct mcs *mcs);
 void mcs_set_lmac_mode(struct mcs *mcs, int lmac_id, u8 mode);
+void mcs_reset_port(struct mcs *mcs, u8 port_id, u8 reset);
+void mcs_set_port_cfg(struct mcs *mcs, struct mcs_port_cfg_set_req *req);
+void mcs_get_port_cfg(struct mcs *mcs, struct mcs_port_cfg_get_req *req,
+		      struct mcs_port_cfg_get_rsp *rsp);
+void mcs_get_custom_tag_cfg(struct mcs *mcs, struct mcs_custom_tag_cfg_get_req *req,
+			    struct mcs_custom_tag_cfg_get_rsp *rsp);
 
 /* CN10K-B APIs */
 void cn10kb_mcs_set_hw_capabilities(struct mcs *mcs);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
index 61bf8ab257f9..1ce3442a96d9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
@@ -11,6 +11,15 @@
 
 /* Registers */
 #define MCSX_IP_MODE					0x900c8ull
+#define MCSX_MCS_TOP_SLAVE_PORT_RESET(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x408ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa28ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
 
 #define MCSX_MCS_TOP_SLAVE_CHANNEL_CFG(a) ({		\
 	u64 offset;					\
@@ -29,6 +38,23 @@
 		offset = 0x60000ull;			\
 	offset; })
 
+#define MCSX_MIL_RX_LMACX_CFG(a) ({			\
+	u64 offset;					\
+							\
+	offset = 0x900a8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x700a8ull;			\
+	offset += (a) * 0x800ull;			\
+	offset; })
+
+#define MCSX_HIL_GLOBAL ({				\
+	u64 offset;					\
+							\
+	offset = 0xc0000ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa0000ull;			\
+	offset; })
+
 #define MCSX_LINK_LMACX_CFG(a) ({			\
 	u64 offset;					\
 							\
@@ -61,6 +87,16 @@
 /* PEX registers */
 #define MCSX_PEX_RX_SLAVE_VLAN_CFGX(a)          (0x3b58ull + (a) * 0x8ull)
 #define MCSX_PEX_TX_SLAVE_VLAN_CFGX(a)          (0x46f8ull + (a) * 0x8ull)
+#define MCSX_PEX_TX_SLAVE_CUSTOM_TAG_REL_MODE_SEL(a)	(0x788ull + (a) * 0x8ull)
+#define MCSX_PEX_TX_SLAVE_PORT_CONFIG(a)		(0x4738ull + (a) * 0x8ull)
+
+#define MCSX_PEX_RX_SLAVE_PEX_CONFIGURATION ({		\
+	u64 offset;					\
+							\
+	offset = 0x3b50ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x4c0ull;			\
+	offset; })
 
 /* CNF10K-B */
 #define MCSX_PEX_RX_SLAVE_CUSTOM_TAGX(a)        (0x4c8ull + (a) * 0x8ull)
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
index c3f5b39bf17c..9eaa8ee452d5 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -13,6 +13,126 @@
 #include "rvu.h"
 #include "lmac_common.h"
 
+int rvu_mbox_handler_mcs_set_lmac_mode(struct rvu *rvu,
+				       struct mcs_set_lmac_mode *req,
+				       struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (BIT_ULL(req->lmac_id) & mcs->hw->lmac_bmap)
+		mcs_set_lmac_mode(mcs, req->lmac_id, req->mode);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_get_hw_info(struct rvu *rvu,
+				     struct msg_req *req,
+				     struct mcs_hw_info *rsp)
+{
+	struct mcs *mcs;
+
+	if (!rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_NOT_MAPPED;
+
+	/* MCS resources are same across all blocks */
+	mcs = mcs_get_pdata(0);
+	rsp->num_mcs_blks = rvu->mcs_blk_cnt;
+	rsp->tcam_entries = mcs->hw->tcam_entries;
+	rsp->secy_entries = mcs->hw->secy_entries;
+	rsp->sc_entries = mcs->hw->sc_entries;
+	rsp->sa_entries = mcs->hw->sa_entries;
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_port_reset(struct rvu *rvu, struct mcs_port_reset_req *req,
+				    struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	mcs_reset_port(mcs, req->port_id, req->reset);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_set_active_lmac(struct rvu *rvu,
+					 struct mcs_set_active_lmac *req,
+					 struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+	if (!mcs)
+		return MCS_AF_ERR_NOT_MAPPED;
+
+	mcs->hw->lmac_bmap = req->lmac_bmap;
+	mcs_set_lmac_channels(req->mcs_id, req->chan_base);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_port_cfg_set(struct rvu *rvu, struct mcs_port_cfg_set_req *req,
+				      struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (mcs->hw->lmac_cnt <= req->port_id || !(mcs->hw->lmac_bmap & BIT_ULL(req->port_id)))
+		return -EINVAL;
+
+	mcs_set_port_cfg(mcs, req);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_port_cfg_get(struct rvu *rvu, struct mcs_port_cfg_get_req *req,
+				      struct mcs_port_cfg_get_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (mcs->hw->lmac_cnt <= req->port_id || !(mcs->hw->lmac_bmap & BIT_ULL(req->port_id)))
+		return -EINVAL;
+
+	mcs_get_port_cfg(mcs, req, rsp);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_custom_tag_cfg_get(struct rvu *rvu, struct mcs_custom_tag_cfg_get_req *req,
+					    struct mcs_custom_tag_cfg_get_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	mcs_get_custom_tag_cfg(mcs, req, rsp);
+
+	return 0;
+}
+
 static void rvu_mcs_set_lmac_bmap(struct rvu *rvu)
 {
 	struct mcs *mcs = mcs_get_pdata(0);
@@ -32,7 +152,8 @@ static void rvu_mcs_set_lmac_bmap(struct rvu *rvu)
 int rvu_mcs_init(struct rvu *rvu)
 {
 	struct rvu_hwinfo *hw = rvu->hw;
-	int err = 0;
+	int lmac, err = 0, mcs_id;
+	struct mcs *mcs;
 
 	rvu->mcs_blk_cnt = mcs_get_blkcnt();
 
@@ -48,5 +169,11 @@ int rvu_mcs_init(struct rvu *rvu)
 		rvu_mcs_set_lmac_bmap(rvu);
 	}
 
+	for (mcs_id = 0; mcs_id < rvu->mcs_blk_cnt; mcs_id++) {
+		mcs = mcs_get_pdata(mcs_id);
+		for (lmac = 0; lmac < mcs->hw->lmac_cnt; lmac++)
+			mcs_set_lmac_mode(mcs, lmac, 0);
+	}
+
 	return err;
 }
-- 
cgit v1.2.3


From cfc14181d497cd7241d2aca7dcbca9039bf8d7dd Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:44 +0530
Subject: octeontx2-af: cn10k: mcs: Manage the MCS block hardware resources

To establish a macsec connection association netdev driver
needs hardware resources like SecY, TCAM flows, SCs and SAs.
This patch manages allocating, freeing and configuring those
resources. AF consumers can request resources and configure them
via these mailbox messages. AF can allocate until it runs out of
hardware resources.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 211 ++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.c    | 447 +++++++++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.h    |  59 ++-
 .../ethernet/marvell/octeontx2/af/mcs_cnf10kb.c    |  55 +++
 .../net/ethernet/marvell/octeontx2/af/mcs_reg.h    | 385 ++++++++++++++++++
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 374 +++++++++++++++++
 6 files changed, 1530 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 207cd4fa6980..3213b1512263 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -295,10 +295,38 @@ M(NIX_BANDPROF_FREE,	0x801e, nix_bandprof_free, nix_bandprof_free_req,   \
 M(NIX_BANDPROF_GET_HWINFO, 0x801f, nix_bandprof_get_hwinfo, msg_req,		\
 				nix_bandprof_get_hwinfo_rsp)		    \
 /* MCS mbox IDs (range 0xA000 - 0xBFFF) */					\
+M(MCS_ALLOC_RESOURCES,	0xa000, mcs_alloc_resources, mcs_alloc_rsrc_req,	\
+				mcs_alloc_rsrc_rsp)				\
+M(MCS_FREE_RESOURCES,	0xa001, mcs_free_resources, mcs_free_rsrc_req, msg_rsp) \
+M(MCS_FLOWID_ENTRY_WRITE, 0xa002, mcs_flowid_entry_write, mcs_flowid_entry_write_req,	\
+				msg_rsp)					\
+M(MCS_SECY_PLCY_WRITE,	0xa003, mcs_secy_plcy_write, mcs_secy_plcy_write_req,	\
+				msg_rsp)					\
+M(MCS_RX_SC_CAM_WRITE,	0xa004, mcs_rx_sc_cam_write, mcs_rx_sc_cam_write_req,	\
+				msg_rsp)					\
+M(MCS_SA_PLCY_WRITE,	0xa005, mcs_sa_plcy_write, mcs_sa_plcy_write_req,	\
+				msg_rsp)					\
+M(MCS_TX_SC_SA_MAP_WRITE, 0xa006, mcs_tx_sc_sa_map_write, mcs_tx_sc_sa_map,	\
+				  msg_rsp)					\
+M(MCS_RX_SC_SA_MAP_WRITE, 0xa007, mcs_rx_sc_sa_map_write, mcs_rx_sc_sa_map,	\
+				  msg_rsp)					\
+M(MCS_FLOWID_ENA_ENTRY,	0xa008, mcs_flowid_ena_entry, mcs_flowid_ena_dis_entry,	\
+				msg_rsp)					\
+M(MCS_PN_TABLE_WRITE,	0xa009, mcs_pn_table_write, mcs_pn_table_write_req,	\
+				msg_rsp)					\
 M(MCS_SET_ACTIVE_LMAC,	0xa00a,	mcs_set_active_lmac, mcs_set_active_lmac,	\
 				msg_rsp)					\
 M(MCS_GET_HW_INFO,	0xa00b,	mcs_get_hw_info, msg_req, mcs_hw_info)		\
 M(MCS_SET_LMAC_MODE,	0xa013, mcs_set_lmac_mode, mcs_set_lmac_mode, msg_rsp)	\
+M(MCS_SET_PN_THRESHOLD, 0xa014, mcs_set_pn_threshold, mcs_set_pn_threshold,	\
+				msg_rsp)					\
+M(MCS_ALLOC_CTRL_PKT_RULE, 0xa015, mcs_alloc_ctrl_pkt_rule,			\
+				   mcs_alloc_ctrl_pkt_rule_req,			\
+				   mcs_alloc_ctrl_pkt_rule_rsp)			\
+M(MCS_FREE_CTRL_PKT_RULE, 0xa016, mcs_free_ctrl_pkt_rule,			\
+				  mcs_free_ctrl_pkt_rule_req, msg_rsp)		\
+M(MCS_CTRL_PKT_RULE_WRITE, 0xa017, mcs_ctrl_pkt_rule_write,			\
+				   mcs_ctrl_pkt_rule_write_req, msg_rsp)	\
 M(MCS_PORT_RESET,	0xa018, mcs_port_reset, mcs_port_reset_req, msg_rsp)	\
 M(MCS_PORT_CFG_SET,	0xa019, mcs_port_cfg_set, mcs_port_cfg_set_req, msg_rsp)\
 M(MCS_PORT_CFG_GET,	0xa020, mcs_port_cfg_get, mcs_port_cfg_get_req,		\
@@ -1674,6 +1702,133 @@ enum mcs_direction {
 	MCS_TX,
 };
 
+enum mcs_rsrc_type {
+	MCS_RSRC_TYPE_FLOWID,
+	MCS_RSRC_TYPE_SECY,
+	MCS_RSRC_TYPE_SC,
+	MCS_RSRC_TYPE_SA,
+};
+
+struct mcs_alloc_rsrc_req {
+	struct mbox_msghdr hdr;
+	u8 rsrc_type;
+	u8 rsrc_cnt;	/* Resources count */
+	u8 mcs_id;	/* MCS block ID	*/
+	u8 dir;		/* Macsec ingress or egress side */
+	u8 all;		/* Allocate all resource type one each */
+	u64 rsvd;
+};
+
+struct mcs_alloc_rsrc_rsp {
+	struct mbox_msghdr hdr;
+	u8 flow_ids[128];	/* Index of reserved entries */
+	u8 secy_ids[128];
+	u8 sc_ids[128];
+	u8 sa_ids[256];
+	u8 rsrc_type;
+	u8 rsrc_cnt;		/* No of entries reserved */
+	u8 mcs_id;
+	u8 dir;
+	u8 all;
+	u8 rsvd[256];		/* reserved fields for future expansion */
+};
+
+struct mcs_free_rsrc_req {
+	struct mbox_msghdr hdr;
+	u8 rsrc_id;		/* Index of the entry to be freed */
+	u8 rsrc_type;
+	u8 mcs_id;
+	u8 dir;
+	u8 all;			/* Free all the cam resources */
+	u64 rsvd;
+};
+
+struct mcs_flowid_entry_write_req {
+	struct mbox_msghdr hdr;
+	u64 data[4];
+	u64 mask[4];
+	u64 sci;	/* CNF10K-B for tx_secy_mem_map */
+	u8 flow_id;
+	u8 secy_id;	/* secyid for which flowid is mapped */
+	u8 sc_id;	/* Valid if dir = MCS_TX, SC_CAM id mapped to flowid */
+	u8 ena;		/* Enable tcam entry */
+	u8 ctrl_pkt;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+struct mcs_secy_plcy_write_req {
+	struct mbox_msghdr hdr;
+	u64 plcy;
+	u8 secy_id;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+/* RX SC_CAM mapping */
+struct mcs_rx_sc_cam_write_req {
+	struct mbox_msghdr hdr;
+	u64 sci;	/* SCI */
+	u64 secy_id;	/* secy index mapped to SC */
+	u8 sc_id;	/* SC CAM entry index */
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_sa_plcy_write_req {
+	struct mbox_msghdr hdr;
+	u64 plcy[2][9];		/* Support 2 SA policy */
+	u8 sa_index[2];
+	u8 sa_cnt;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+struct mcs_tx_sc_sa_map {
+	struct mbox_msghdr hdr;
+	u8 sa_index0;
+	u8 sa_index1;
+	u8 rekey_ena;
+	u8 sa_index0_vld;
+	u8 sa_index1_vld;
+	u8 tx_sa_active;
+	u64 sectag_sci;
+	u8 sc_id;	/* used as index for SA_MEM_MAP */
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_rx_sc_sa_map {
+	struct mbox_msghdr hdr;
+	u8 sa_index;
+	u8 sa_in_use;
+	u8 sc_id;
+	u8 an;		/* value range 0-3, sc_id + an used as index SA_MEM_MAP */
+	u8 mcs_id;
+	u64 rsvd;
+};
+
+struct mcs_flowid_ena_dis_entry {
+	struct mbox_msghdr hdr;
+	u8 flow_id;
+	u8 ena;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+struct mcs_pn_table_write_req {
+	struct mbox_msghdr hdr;
+	u64 next_pn;
+	u8 pn_id;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
 struct mcs_hw_info {
 	struct mbox_msghdr hdr;
 	u8 num_mcs_blks;	/* Number of MCS blocks */
@@ -1762,4 +1917,60 @@ enum mcs_af_status {
 	MCS_AF_ERR_NOT_MAPPED           = -1202,
 };
 
+struct mcs_set_pn_threshold {
+	struct mbox_msghdr hdr;
+	u64 threshold;
+	u8 xpn; /* '1' for setting xpn threshold */
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+enum mcs_ctrl_pkt_rulew_type {
+	MCS_CTRL_PKT_RULE_TYPE_ETH,
+	MCS_CTRL_PKT_RULE_TYPE_DA,
+	MCS_CTRL_PKT_RULE_TYPE_RANGE,
+	MCS_CTRL_PKT_RULE_TYPE_COMBO,
+	MCS_CTRL_PKT_RULE_TYPE_MAC,
+};
+
+struct mcs_alloc_ctrl_pkt_rule_req {
+	struct mbox_msghdr hdr;
+	u8 rule_type;
+	u8 mcs_id;	/* MCS block ID	*/
+	u8 dir;		/* Macsec ingress or egress side */
+	u64 rsvd;
+};
+
+struct mcs_alloc_ctrl_pkt_rule_rsp {
+	struct mbox_msghdr hdr;
+	u8 rule_idx;
+	u8 rule_type;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+struct mcs_free_ctrl_pkt_rule_req {
+	struct mbox_msghdr hdr;
+	u8 rule_idx;
+	u8 rule_type;
+	u8 mcs_id;
+	u8 dir;
+	u8 all;
+	u64 rsvd;
+};
+
+struct mcs_ctrl_pkt_rule_write_req {
+	struct mbox_msghdr hdr;
+	u64 data0;
+	u64 data1;
+	u64 data2;
+	u8 rule_idx;
+	u8 rule_type;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
index 555f3b220d20..2f48fb98572e 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
@@ -24,6 +24,429 @@ static const struct pci_device_id mcs_id_table[] = {
 
 static LIST_HEAD(mcs_list);
 
+void mcs_pn_table_write(struct mcs *mcs, u8 pn_id, u64 next_pn, u8 dir)
+{
+	u64 reg;
+
+	if (dir == MCS_RX)
+		reg = MCSX_CPM_RX_SLAVE_SA_PN_TABLE_MEMX(pn_id);
+	else
+		reg = MCSX_CPM_TX_SLAVE_SA_PN_TABLE_MEMX(pn_id);
+	mcs_reg_write(mcs, reg, next_pn);
+}
+
+void cn10kb_mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *map)
+{
+	u64 reg, val;
+
+	val = (map->sa_index0 & 0xFF) |
+	      (map->sa_index1 & 0xFF) << 9 |
+	      (map->rekey_ena & 0x1) << 18 |
+	      (map->sa_index0_vld & 0x1) << 19 |
+	      (map->sa_index1_vld & 0x1) << 20 |
+	      (map->tx_sa_active & 0x1) << 21 |
+	      map->sectag_sci << 22;
+	reg = MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(map->sc_id);
+	mcs_reg_write(mcs, reg, val);
+
+	val = map->sectag_sci >> 42;
+	reg = MCSX_CPM_TX_SLAVE_SA_MAP_MEM_1X(map->sc_id);
+	mcs_reg_write(mcs, reg, val);
+}
+
+void cn10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map)
+{
+	u64 val, reg;
+
+	val = (map->sa_index & 0xFF) | map->sa_in_use << 9;
+
+	reg = MCSX_CPM_RX_SLAVE_SA_MAP_MEMX((4 * map->sc_id) + map->an);
+	mcs_reg_write(mcs, reg, val);
+}
+
+void mcs_sa_plcy_write(struct mcs *mcs, u64 *plcy, int sa_id, int dir)
+{
+	int reg_id;
+	u64 reg;
+
+	if (dir == MCS_RX) {
+		for (reg_id = 0; reg_id < 8; reg_id++) {
+			reg =  MCSX_CPM_RX_SLAVE_SA_PLCY_MEMX(reg_id, sa_id);
+			mcs_reg_write(mcs, reg, plcy[reg_id]);
+		}
+	} else {
+		for (reg_id = 0; reg_id < 9; reg_id++) {
+			reg =  MCSX_CPM_TX_SLAVE_SA_PLCY_MEMX(reg_id, sa_id);
+			mcs_reg_write(mcs, reg, plcy[reg_id]);
+		}
+	}
+}
+
+void mcs_ena_dis_sc_cam_entry(struct mcs *mcs, int sc_id, int ena)
+{
+	u64 reg, val;
+
+	reg = MCSX_CPM_RX_SLAVE_SC_CAM_ENA(0);
+	if (sc_id > 63)
+		reg = MCSX_CPM_RX_SLAVE_SC_CAM_ENA(1);
+
+	if (ena)
+		val = mcs_reg_read(mcs, reg) | BIT_ULL(sc_id);
+	else
+		val = mcs_reg_read(mcs, reg) & ~BIT_ULL(sc_id);
+
+	mcs_reg_write(mcs, reg, val);
+}
+
+void mcs_rx_sc_cam_write(struct mcs *mcs, u64 sci, u64 secy, int sc_id)
+{
+	mcs_reg_write(mcs, MCSX_CPM_RX_SLAVE_SC_CAMX(0, sc_id), sci);
+	mcs_reg_write(mcs, MCSX_CPM_RX_SLAVE_SC_CAMX(1, sc_id), secy);
+	/* Enable SC CAM */
+	mcs_ena_dis_sc_cam_entry(mcs, sc_id, true);
+}
+
+void mcs_secy_plcy_write(struct mcs *mcs, u64 plcy, int secy_id, int dir)
+{
+	u64 reg;
+
+	if (dir == MCS_RX)
+		reg = MCSX_CPM_RX_SLAVE_SECY_PLCY_MEM_0X(secy_id);
+	else
+		reg = MCSX_CPM_TX_SLAVE_SECY_PLCY_MEMX(secy_id);
+
+	mcs_reg_write(mcs, reg, plcy);
+
+	if (mcs->hw->mcs_blks == 1 && dir == MCS_RX)
+		mcs_reg_write(mcs, MCSX_CPM_RX_SLAVE_SECY_PLCY_MEM_1X(secy_id), 0x0ull);
+}
+
+void cn10kb_mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int dir)
+{
+	u64 reg, val;
+
+	val = (map->secy & 0x7F) | (map->ctrl_pkt & 0x1) << 8;
+	if (dir == MCS_RX) {
+		reg = MCSX_CPM_RX_SLAVE_SECY_MAP_MEMX(map->flow_id);
+	} else {
+		val |= (map->sc & 0x7F) << 9;
+		reg = MCSX_CPM_TX_SLAVE_SECY_MAP_MEM_0X(map->flow_id);
+	}
+
+	mcs_reg_write(mcs, reg, val);
+}
+
+void mcs_ena_dis_flowid_entry(struct mcs *mcs, int flow_id, int dir, int ena)
+{
+	u64 reg, val;
+
+	if (dir == MCS_RX) {
+		reg = MCSX_CPM_RX_SLAVE_FLOWID_TCAM_ENA_0;
+		if (flow_id > 63)
+			reg = MCSX_CPM_RX_SLAVE_FLOWID_TCAM_ENA_1;
+	} else {
+		reg = MCSX_CPM_TX_SLAVE_FLOWID_TCAM_ENA_0;
+		if (flow_id > 63)
+			reg = MCSX_CPM_TX_SLAVE_FLOWID_TCAM_ENA_1;
+	}
+
+	/* Enable/Disable the tcam entry */
+	if (ena)
+		val = mcs_reg_read(mcs, reg) | BIT_ULL(flow_id);
+	else
+		val = mcs_reg_read(mcs, reg) & ~BIT_ULL(flow_id);
+
+	mcs_reg_write(mcs, reg, val);
+}
+
+void mcs_flowid_entry_write(struct mcs *mcs, u64 *data, u64 *mask, int flow_id, int dir)
+{
+	int reg_id;
+	u64 reg;
+
+	if (dir == MCS_RX) {
+		for (reg_id = 0; reg_id < 4; reg_id++) {
+			reg = MCSX_CPM_RX_SLAVE_FLOWID_TCAM_DATAX(reg_id, flow_id);
+			mcs_reg_write(mcs, reg, data[reg_id]);
+			reg = MCSX_CPM_RX_SLAVE_FLOWID_TCAM_MASKX(reg_id, flow_id);
+			mcs_reg_write(mcs, reg, mask[reg_id]);
+		}
+	} else {
+		for (reg_id = 0; reg_id < 4; reg_id++) {
+			reg = MCSX_CPM_TX_SLAVE_FLOWID_TCAM_DATAX(reg_id, flow_id);
+			mcs_reg_write(mcs, reg, data[reg_id]);
+			reg = MCSX_CPM_TX_SLAVE_FLOWID_TCAM_MASKX(reg_id, flow_id);
+			mcs_reg_write(mcs, reg, mask[reg_id]);
+		}
+	}
+}
+
+void mcs_clear_secy_plcy(struct mcs *mcs, int secy_id, int dir)
+{
+	struct mcs_rsrc_map *map;
+	int flow_id;
+
+	if (dir == MCS_RX)
+		map = &mcs->rx;
+	else
+		map = &mcs->tx;
+
+	/* Clear secy memory to zero */
+	mcs_secy_plcy_write(mcs, 0, secy_id, dir);
+
+	/* Disable the tcam entry using this secy */
+	for (flow_id = 0; flow_id < map->flow_ids.max; flow_id++) {
+		if (map->flowid2secy_map[flow_id] != secy_id)
+			continue;
+		mcs_ena_dis_flowid_entry(mcs, flow_id, dir, false);
+	}
+}
+
+int mcs_alloc_ctrlpktrule(struct rsrc_bmap *rsrc, u16 *pf_map, u16 offset, u16 pcifunc)
+{
+	int rsrc_id;
+
+	if (!rsrc->bmap)
+		return -EINVAL;
+
+	rsrc_id = bitmap_find_next_zero_area(rsrc->bmap, rsrc->max, offset, 1, 0);
+	if (rsrc_id >= rsrc->max)
+		return -ENOSPC;
+
+	bitmap_set(rsrc->bmap, rsrc_id, 1);
+	pf_map[rsrc_id] = pcifunc;
+
+	return rsrc_id;
+}
+
+int mcs_free_ctrlpktrule(struct mcs *mcs, struct mcs_free_ctrl_pkt_rule_req *req)
+{
+	u16 pcifunc = req->hdr.pcifunc;
+	struct mcs_rsrc_map *map;
+	u64 dis, reg;
+	int id, rc;
+
+	reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_RULE_ENABLE : MCSX_PEX_TX_SLAVE_RULE_ENABLE;
+	map = (req->dir == MCS_RX) ? &mcs->rx : &mcs->tx;
+
+	if (req->all) {
+		for (id = 0; id < map->ctrlpktrule.max; id++) {
+			if (map->ctrlpktrule2pf_map[id] != pcifunc)
+				continue;
+			mcs_free_rsrc(&map->ctrlpktrule, map->ctrlpktrule2pf_map, id, pcifunc);
+			dis = mcs_reg_read(mcs, reg);
+			dis &= ~BIT_ULL(id);
+			mcs_reg_write(mcs, reg, dis);
+		}
+		return 0;
+	}
+
+	rc = mcs_free_rsrc(&map->ctrlpktrule, map->ctrlpktrule2pf_map, req->rule_idx, pcifunc);
+	dis = mcs_reg_read(mcs, reg);
+	dis &= ~BIT_ULL(req->rule_idx);
+	mcs_reg_write(mcs, reg, dis);
+
+	return rc;
+}
+
+int mcs_ctrlpktrule_write(struct mcs *mcs, struct mcs_ctrl_pkt_rule_write_req *req)
+{
+	u64 reg, enb;
+	u64 idx;
+
+	switch (req->rule_type) {
+	case MCS_CTRL_PKT_RULE_TYPE_ETH:
+		req->data0 &= GENMASK(15, 0);
+		if (req->data0 != ETH_P_PAE)
+			return -EINVAL;
+
+		idx = req->rule_idx - MCS_CTRLPKT_ETYPE_RULE_OFFSET;
+		reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_RULE_ETYPE_CFGX(idx) :
+		      MCSX_PEX_TX_SLAVE_RULE_ETYPE_CFGX(idx);
+
+		mcs_reg_write(mcs, reg, req->data0);
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_DA:
+		if (!(req->data0 & BIT_ULL(40)))
+			return -EINVAL;
+
+		idx = req->rule_idx - MCS_CTRLPKT_DA_RULE_OFFSET;
+		reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_RULE_DAX(idx) :
+		      MCSX_PEX_TX_SLAVE_RULE_DAX(idx);
+
+		mcs_reg_write(mcs, reg, req->data0 & GENMASK_ULL(47, 0));
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_RANGE:
+		if (!(req->data0 & BIT_ULL(40)) || !(req->data1 & BIT_ULL(40)))
+			return -EINVAL;
+
+		idx = req->rule_idx - MCS_CTRLPKT_DA_RANGE_RULE_OFFSET;
+		if (req->dir == MCS_RX) {
+			reg = MCSX_PEX_RX_SLAVE_RULE_DA_RANGE_MINX(idx);
+			mcs_reg_write(mcs, reg, req->data0 & GENMASK_ULL(47, 0));
+			reg = MCSX_PEX_RX_SLAVE_RULE_DA_RANGE_MAXX(idx);
+			mcs_reg_write(mcs, reg, req->data1 & GENMASK_ULL(47, 0));
+		} else {
+			reg = MCSX_PEX_TX_SLAVE_RULE_DA_RANGE_MINX(idx);
+			mcs_reg_write(mcs, reg, req->data0 & GENMASK_ULL(47, 0));
+			reg = MCSX_PEX_TX_SLAVE_RULE_DA_RANGE_MAXX(idx);
+			mcs_reg_write(mcs, reg, req->data1 & GENMASK_ULL(47, 0));
+		}
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_COMBO:
+		req->data2 &= GENMASK(15, 0);
+		if (req->data2 != ETH_P_PAE || !(req->data0 & BIT_ULL(40)) ||
+		    !(req->data1 & BIT_ULL(40)))
+			return -EINVAL;
+
+		idx = req->rule_idx - MCS_CTRLPKT_COMBO_RULE_OFFSET;
+		if (req->dir == MCS_RX) {
+			reg = MCSX_PEX_RX_SLAVE_RULE_COMBO_MINX(idx);
+			mcs_reg_write(mcs, reg, req->data0 & GENMASK_ULL(47, 0));
+			reg = MCSX_PEX_RX_SLAVE_RULE_COMBO_MAXX(idx);
+			mcs_reg_write(mcs, reg, req->data1 & GENMASK_ULL(47, 0));
+			reg = MCSX_PEX_RX_SLAVE_RULE_COMBO_ETX(idx);
+			mcs_reg_write(mcs, reg, req->data2);
+		} else {
+			reg = MCSX_PEX_TX_SLAVE_RULE_COMBO_MINX(idx);
+			mcs_reg_write(mcs, reg, req->data0 & GENMASK_ULL(47, 0));
+			reg = MCSX_PEX_TX_SLAVE_RULE_COMBO_MAXX(idx);
+			mcs_reg_write(mcs, reg, req->data1 & GENMASK_ULL(47, 0));
+			reg = MCSX_PEX_TX_SLAVE_RULE_COMBO_ETX(idx);
+			mcs_reg_write(mcs, reg, req->data2);
+		}
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_MAC:
+		if (!(req->data0 & BIT_ULL(40)))
+			return -EINVAL;
+
+		idx = req->rule_idx - MCS_CTRLPKT_MAC_EN_RULE_OFFSET;
+		reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_RULE_MAC :
+		      MCSX_PEX_TX_SLAVE_RULE_MAC;
+
+		mcs_reg_write(mcs, reg, req->data0 & GENMASK_ULL(47, 0));
+		break;
+	}
+
+	reg = (req->dir == MCS_RX) ? MCSX_PEX_RX_SLAVE_RULE_ENABLE : MCSX_PEX_TX_SLAVE_RULE_ENABLE;
+
+	enb = mcs_reg_read(mcs, reg);
+	enb |= BIT_ULL(req->rule_idx);
+	mcs_reg_write(mcs, reg, enb);
+
+	return 0;
+}
+
+int mcs_free_rsrc(struct rsrc_bmap *rsrc, u16 *pf_map, int rsrc_id, u16 pcifunc)
+{
+	/* Check if the rsrc_id is mapped to PF/VF */
+	if (pf_map[rsrc_id] != pcifunc)
+		return -EINVAL;
+
+	rvu_free_rsrc(rsrc, rsrc_id);
+	pf_map[rsrc_id] = 0;
+	return 0;
+}
+
+/* Free all the cam resources mapped to pf */
+int mcs_free_all_rsrc(struct mcs *mcs, int dir, u16 pcifunc)
+{
+	struct mcs_rsrc_map *map;
+	int id;
+
+	if (dir == MCS_RX)
+		map = &mcs->rx;
+	else
+		map = &mcs->tx;
+
+	/* free tcam entries */
+	for (id = 0; id < map->flow_ids.max; id++) {
+		if (map->flowid2pf_map[id] != pcifunc)
+			continue;
+		mcs_free_rsrc(&map->flow_ids, map->flowid2pf_map,
+			      id, pcifunc);
+		mcs_ena_dis_flowid_entry(mcs, id, dir, false);
+	}
+
+	/* free secy entries */
+	for (id = 0; id < map->secy.max; id++) {
+		if (map->secy2pf_map[id] != pcifunc)
+			continue;
+		mcs_free_rsrc(&map->secy, map->secy2pf_map,
+			      id, pcifunc);
+		mcs_clear_secy_plcy(mcs, id, dir);
+	}
+
+	/* free sc entries */
+	for (id = 0; id < map->secy.max; id++) {
+		if (map->sc2pf_map[id] != pcifunc)
+			continue;
+		mcs_free_rsrc(&map->sc, map->sc2pf_map, id, pcifunc);
+
+		/* Disable SC CAM only on RX side */
+		if (dir == MCS_RX)
+			mcs_ena_dis_sc_cam_entry(mcs, id, false);
+	}
+
+	/* free sa entries */
+	for (id = 0; id < map->sa.max; id++) {
+		if (map->sa2pf_map[id] != pcifunc)
+			continue;
+		mcs_free_rsrc(&map->sa, map->sa2pf_map, id, pcifunc);
+	}
+	return 0;
+}
+
+int mcs_alloc_rsrc(struct rsrc_bmap *rsrc, u16 *pf_map, u16 pcifunc)
+{
+	int rsrc_id;
+
+	rsrc_id = rvu_alloc_rsrc(rsrc);
+	if (rsrc_id < 0)
+		return -ENOMEM;
+	pf_map[rsrc_id] = pcifunc;
+	return rsrc_id;
+}
+
+int mcs_alloc_all_rsrc(struct mcs *mcs, u8 *flow_id, u8 *secy_id,
+		       u8 *sc_id, u8 *sa1_id, u8 *sa2_id, u16 pcifunc, int dir)
+{
+	struct mcs_rsrc_map *map;
+	int id;
+
+	if (dir == MCS_RX)
+		map = &mcs->rx;
+	else
+		map = &mcs->tx;
+
+	id = mcs_alloc_rsrc(&map->flow_ids, map->flowid2pf_map, pcifunc);
+	if (id < 0)
+		return -ENOMEM;
+	*flow_id = id;
+
+	id = mcs_alloc_rsrc(&map->secy, map->secy2pf_map, pcifunc);
+	if (id < 0)
+		return -ENOMEM;
+	*secy_id = id;
+
+	id = mcs_alloc_rsrc(&map->sc, map->sc2pf_map, pcifunc);
+	if (id < 0)
+		return -ENOMEM;
+	*sc_id = id;
+
+	id =  mcs_alloc_rsrc(&map->sa, map->sa2pf_map, pcifunc);
+	if (id < 0)
+		return -ENOMEM;
+	*sa1_id = id;
+
+	id =  mcs_alloc_rsrc(&map->sa, map->sa2pf_map, pcifunc);
+	if (id < 0)
+		return -ENOMEM;
+	*sa2_id = id;
+
+	return 0;
+}
+
 static void *alloc_mem(struct mcs *mcs, int n)
 {
 	return devm_kcalloc(mcs->dev, n, sizeof(u16), GFP_KERNEL);
@@ -54,6 +477,10 @@ static int mcs_alloc_struct_mem(struct mcs *mcs, struct mcs_rsrc_map *res)
 	if (!res->flowid2secy_map)
 		return -ENOMEM;
 
+	res->ctrlpktrule2pf_map = alloc_mem(mcs, MCS_MAX_CTRLPKT_RULES);
+	if (!res->ctrlpktrule2pf_map)
+		return -ENOMEM;
+
 	res->flow_ids.max = hw->tcam_entries - MCS_RSRC_RSVD_CNT;
 	err = rvu_alloc_bitmap(&res->flow_ids);
 	if (err)
@@ -74,6 +501,11 @@ static int mcs_alloc_struct_mem(struct mcs *mcs, struct mcs_rsrc_map *res)
 	if (err)
 		return err;
 
+	res->ctrlpktrule.max = MCS_MAX_CTRLPKT_RULES;
+	err = rvu_alloc_bitmap(&res->ctrlpktrule);
+	if (err)
+		return err;
+
 	return 0;
 }
 
@@ -210,6 +642,18 @@ void mcs_set_lmac_mode(struct mcs *mcs, int lmac_id, u8 mode)
 	mcs_reg_write(mcs, reg, (u64)mode);
 }
 
+void mcs_pn_threshold_set(struct mcs *mcs, struct mcs_set_pn_threshold *pn)
+{
+	u64 reg;
+
+	if (pn->dir == MCS_RX)
+		reg = pn->xpn ? MCSX_CPM_RX_SLAVE_XPN_THRESHOLD : MCSX_CPM_RX_SLAVE_PN_THRESHOLD;
+	else
+		reg = pn->xpn ? MCSX_CPM_TX_SLAVE_XPN_THRESHOLD : MCSX_CPM_TX_SLAVE_PN_THRESHOLD;
+
+	mcs_reg_write(mcs, reg, pn->threshold);
+}
+
 void cn10kb_mcs_parser_cfg(struct mcs *mcs)
 {
 	u64 reg, val;
@@ -353,6 +797,9 @@ void cn10kb_mcs_set_hw_capabilities(struct mcs *mcs)
 static struct mcs_ops cn10kb_mcs_ops = {
 	.mcs_set_hw_capabilities	= cn10kb_mcs_set_hw_capabilities,
 	.mcs_parser_cfg			= cn10kb_mcs_parser_cfg,
+	.mcs_tx_sa_mem_map_write	= cn10kb_mcs_tx_sa_mem_map_write,
+	.mcs_rx_sa_mem_map_write	= cn10kb_mcs_rx_sa_mem_map_write,
+	.mcs_flowid_secy_map		= cn10kb_mcs_flowid_secy_map,
 };
 
 static int mcs_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
index c11d507a77e3..615a3ad3cddb 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
@@ -21,19 +21,47 @@
 #define MCS_PORT_FIFO_SKID_MASK		0x3F
 #define MCS_MAX_CUSTOM_TAGS		0x8
 
+#define MCS_CTRLPKT_ETYPE_RULE_MAX	8
+#define MCS_CTRLPKT_DA_RULE_MAX		8
+#define MCS_CTRLPKT_DA_RANGE_RULE_MAX	4
+#define MCS_CTRLPKT_COMBO_RULE_MAX	4
+#define MCS_CTRLPKT_MAC_RULE_MAX	1
+
+#define MCS_MAX_CTRLPKT_RULES	(MCS_CTRLPKT_ETYPE_RULE_MAX + \
+				MCS_CTRLPKT_DA_RULE_MAX + \
+				MCS_CTRLPKT_DA_RANGE_RULE_MAX + \
+				MCS_CTRLPKT_COMBO_RULE_MAX + \
+				MCS_CTRLPKT_MAC_RULE_MAX)
+
+#define MCS_CTRLPKT_ETYPE_RULE_OFFSET		0
+#define MCS_CTRLPKT_DA_RULE_OFFSET		8
+#define MCS_CTRLPKT_DA_RANGE_RULE_OFFSET	16
+#define MCS_CTRLPKT_COMBO_RULE_OFFSET		20
+#define MCS_CTRLPKT_MAC_EN_RULE_OFFSET		24
+
 /* Reserved resources for default bypass entry */
 #define MCS_RSRC_RSVD_CNT		1
 
+struct secy_mem_map {
+	u8 flow_id;
+	u8 secy;
+	u8 ctrl_pkt;
+	u8 sc;
+	u64 sci;
+};
+
 struct mcs_rsrc_map {
 	u16 *flowid2pf_map;
 	u16 *secy2pf_map;
 	u16 *sc2pf_map;
 	u16 *sa2pf_map;
 	u16 *flowid2secy_map;	/* bitmap flowid mapped to secy*/
+	u16 *ctrlpktrule2pf_map;
 	struct rsrc_bmap	flow_ids;
 	struct rsrc_bmap	secy;
 	struct rsrc_bmap	sc;
 	struct rsrc_bmap	sa;
+	struct rsrc_bmap	ctrlpktrule;
 };
 
 struct hwinfo {
@@ -62,6 +90,9 @@ struct mcs {
 struct mcs_ops {
 	void	(*mcs_set_hw_capabilities)(struct mcs *mcs);
 	void	(*mcs_parser_cfg)(struct mcs *mcs);
+	void	(*mcs_tx_sa_mem_map_write)(struct mcs *mcs, struct mcs_tx_sc_sa_map *map);
+	void	(*mcs_rx_sa_mem_map_write)(struct mcs *mcs, struct mcs_rx_sc_sa_map *map);
+	void	(*mcs_flowid_secy_map)(struct mcs *mcs, struct secy_mem_map *map, int dir);
 };
 
 extern struct pci_driver mcs_driver;
@@ -80,7 +111,24 @@ static inline u64 mcs_reg_read(struct mcs *mcs, u64 offset)
 struct mcs *mcs_get_pdata(int mcs_id);
 int mcs_get_blkcnt(void);
 int mcs_set_lmac_channels(int mcs_id, u16 base);
-
+int mcs_alloc_rsrc(struct rsrc_bmap *rsrc, u16 *pf_map, u16 pcifunc);
+int mcs_free_rsrc(struct rsrc_bmap *rsrc, u16 *pf_map, int rsrc_id, u16 pcifunc);
+int mcs_alloc_all_rsrc(struct mcs *mcs, u8 *flowid, u8 *secy_id,
+		       u8 *sc_id, u8 *sa1_id, u8 *sa2_id, u16 pcifunc, int dir);
+int mcs_free_all_rsrc(struct mcs *mcs, int dir, u16 pcifunc);
+void mcs_clear_secy_plcy(struct mcs *mcs, int secy_id, int dir);
+void mcs_ena_dis_flowid_entry(struct mcs *mcs, int id, int dir, int ena);
+void mcs_ena_dis_sc_cam_entry(struct mcs *mcs, int id, int ena);
+void mcs_flowid_entry_write(struct mcs *mcs, u64 *data, u64 *mask, int id, int dir);
+void mcs_secy_plcy_write(struct mcs *mcs, u64 plcy, int id, int dir);
+void mcs_rx_sc_cam_write(struct mcs *mcs, u64 sci, u64 secy, int sc_id);
+void mcs_sa_plcy_write(struct mcs *mcs, u64 *plcy, int sa, int dir);
+void mcs_map_sc_to_sa(struct mcs *mcs, u64 *sa_map, int sc, int dir);
+void mcs_pn_table_write(struct mcs *mcs, u8 pn_id, u64 next_pn, u8 dir);
+void mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *map);
+void mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int dir);
+void mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map);
+void mcs_pn_threshold_set(struct mcs *mcs, struct mcs_set_pn_threshold *pn);
 int mcs_install_flowid_bypass_entry(struct mcs *mcs);
 void mcs_set_lmac_mode(struct mcs *mcs, int lmac_id, u8 mode);
 void mcs_reset_port(struct mcs *mcs, u8 port_id, u8 reset);
@@ -89,14 +137,23 @@ void mcs_get_port_cfg(struct mcs *mcs, struct mcs_port_cfg_get_req *req,
 		      struct mcs_port_cfg_get_rsp *rsp);
 void mcs_get_custom_tag_cfg(struct mcs *mcs, struct mcs_custom_tag_cfg_get_req *req,
 			    struct mcs_custom_tag_cfg_get_rsp *rsp);
+int mcs_alloc_ctrlpktrule(struct rsrc_bmap *rsrc, u16 *pf_map, u16 offset, u16 pcifunc);
+int mcs_free_ctrlpktrule(struct mcs *mcs, struct mcs_free_ctrl_pkt_rule_req *req);
+int mcs_ctrlpktrule_write(struct mcs *mcs, struct mcs_ctrl_pkt_rule_write_req *req);
 
 /* CN10K-B APIs */
 void cn10kb_mcs_set_hw_capabilities(struct mcs *mcs);
+void cn10kb_mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *map);
+void cn10kb_mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int dir);
+void cn10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map);
 void cn10kb_mcs_parser_cfg(struct mcs *mcs);
 
 /* CNF10K-B APIs */
 struct mcs_ops *cnf10kb_get_mac_ops(void);
 void cnf10kb_mcs_set_hw_capabilities(struct mcs *mcs);
+void cnf10kb_mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *map);
+void cnf10kb_mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int dir);
+void cnf10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map);
 void cnf10kb_mcs_parser_cfg(struct mcs *mcs);
 
 #endif /* MCS_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
index 62c83a3eb1c8..f375402d67d5 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
@@ -10,6 +10,9 @@
 static struct mcs_ops cnf10kb_mcs_ops   = {
 	.mcs_set_hw_capabilities	= cnf10kb_mcs_set_hw_capabilities,
 	.mcs_parser_cfg			= cnf10kb_mcs_parser_cfg,
+	.mcs_tx_sa_mem_map_write	= cnf10kb_mcs_tx_sa_mem_map_write,
+	.mcs_rx_sa_mem_map_write	= cnf10kb_mcs_rx_sa_mem_map_write,
+	.mcs_flowid_secy_map		= cnf10kb_mcs_flowid_secy_map,
 };
 
 struct mcs_ops *cnf10kb_get_mac_ops(void)
@@ -63,3 +66,55 @@ void cnf10kb_mcs_parser_cfg(struct mcs *mcs)
 	reg = MCSX_PEX_TX_SLAVE_ETYPE_ENABLE;
 	mcs_reg_write(mcs, reg, val);
 }
+
+void cnf10kb_mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int dir)
+{
+	u64 reg, val;
+
+	val = (map->secy & 0x3F) | (map->ctrl_pkt & 0x1) << 6;
+	if (dir == MCS_RX) {
+		reg = MCSX_CPM_RX_SLAVE_SECY_MAP_MEMX(map->flow_id);
+	} else {
+		reg = MCSX_CPM_TX_SLAVE_SECY_MAP_MEM_0X(map->flow_id);
+		mcs_reg_write(mcs, reg, map->sci);
+		val |= (map->sc & 0x3F) << 7;
+		reg = MCSX_CPM_TX_SLAVE_SECY_MAP_MEM_1X(map->flow_id);
+	}
+
+	mcs_reg_write(mcs, reg, val);
+}
+
+void cnf10kb_mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *map)
+{
+	u64 reg, val;
+
+	val = (map->sa_index0 & 0x7F) | (map->sa_index1 & 0x7F) << 7;
+
+	reg = MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(map->sc_id);
+	mcs_reg_write(mcs, reg, val);
+
+	if (map->rekey_ena) {
+		reg = MCSX_CPM_TX_SLAVE_AUTO_REKEY_ENABLE_0;
+		val = mcs_reg_read(mcs, reg);
+		val |= BIT_ULL(map->sc_id);
+		mcs_reg_write(mcs, reg, val);
+	}
+
+	if (map->sa_index0_vld)
+		mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_SA_INDEX0_VLDX(map->sc_id), BIT_ULL(0));
+
+	if (map->sa_index1_vld)
+		mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_SA_INDEX1_VLDX(map->sc_id), BIT_ULL(0));
+
+	mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_TX_SA_ACTIVEX(map->sc_id), map->tx_sa_active);
+}
+
+void cnf10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map)
+{
+	u64 val, reg;
+
+	val = (map->sa_index & 0x7F) | (map->sa_in_use << 7);
+
+	reg = MCSX_CPM_RX_SLAVE_SA_MAP_MEMX((4 * map->sc_id) + map->an);
+	mcs_reg_write(mcs, reg, val);
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
index 1ce3442a96d9..e192a68bff36 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
@@ -89,6 +89,163 @@
 #define MCSX_PEX_TX_SLAVE_VLAN_CFGX(a)          (0x46f8ull + (a) * 0x8ull)
 #define MCSX_PEX_TX_SLAVE_CUSTOM_TAG_REL_MODE_SEL(a)	(0x788ull + (a) * 0x8ull)
 #define MCSX_PEX_TX_SLAVE_PORT_CONFIG(a)		(0x4738ull + (a) * 0x8ull)
+#define MCSX_PEX_RX_SLAVE_RULE_ETYPE_CFGX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x3fc0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x558ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_DAX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4000ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x598ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_DA_RANGE_MINX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4040ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5d8ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_DA_RANGE_MAXX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4048ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5e0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_COMBO_MINX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4080ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x648ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_COMBO_MAXX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4088ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x650ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_COMBO_ETX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4090ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x658ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_MAC ({	\
+	u64 offset;					\
+							\
+	offset = 0x40e0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x6d8ull;			\
+	offset; })
+
+#define MCSX_PEX_RX_SLAVE_RULE_ENABLE ({	\
+	u64 offset;					\
+							\
+	offset = 0x40e8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x6e0ull;			\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_ETYPE_CFGX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4b60ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x7d8ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_DAX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4ba0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x818ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_DA_RANGE_MINX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4be0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x858ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_DA_RANGE_MAXX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4be8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x860ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_COMBO_MINX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4c20ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x8c8ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_COMBO_MAXX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4c28ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x8d0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_COMBO_ETX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x4c30ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x8d8ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_MAC ({	\
+	u64 offset;					\
+							\
+	offset = 0x4c80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x958ull;			\
+	offset; })
+
+#define MCSX_PEX_TX_SLAVE_RULE_ENABLE ({	\
+	u64 offset;					\
+							\
+	offset = 0x4c88ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x960ull;			\
+	offset; })
 
 #define MCSX_PEX_RX_SLAVE_PEX_CONFIGURATION ({		\
 	u64 offset;					\
@@ -111,4 +268,232 @@
 #define MCSX_BBE_RX_SLAVE_CAL_LEN			0x188ull
 #define MCSX_PAB_RX_SLAVE_FIFO_SKID_CFGX(a)		(0x290ull + (a) * 0x40ull)
 
+/* CPM registers */
+#define MCSX_CPM_RX_SLAVE_FLOWID_TCAM_DATAX(a, b) ({	\
+	u64 offset;					\
+							\
+	offset = 0x30740ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x3bf8ull;			\
+	offset += (a) * 0x8ull + (b) * 0x20ull;		\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_FLOWID_TCAM_MASKX(a, b) ({	\
+	u64 offset;					\
+							\
+	offset = 0x34740ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x43f8ull;			\
+	offset += (a) * 0x8ull + (b) * 0x20ull;		\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_FLOWID_TCAM_ENA_0 ({		\
+	u64 offset;					\
+							\
+	offset = 0x30700ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x3bd8ull;			\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SC_CAMX(a, b)	({		\
+	u64 offset;					\
+							\
+	offset = 0x38780ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x4c08ull;			\
+	offset +=  (a) * 0x8ull + (b) * 0x10ull;	\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SC_CAM_ENA(a)	({		\
+	u64 offset;					\
+							\
+	offset = 0x38740ull + (a) * 0x8ull;		\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x4bf8ull;			\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SECY_MAP_MEMX(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x23ee0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xbd0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SECY_PLCY_MEM_0X(a) ({	\
+	u64 offset;					\
+							\
+	offset = (0x246e0ull + (a) * 0x10ull);		\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = (0xdd0ull + (a) * 0x8ull);	\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SA_KEY_LOCKOUTX(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x23E90ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xbb0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SA_MAP_MEMX(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x256e0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xfd0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SA_PLCY_MEMX(a, b) ({		\
+	u64 offset;					\
+							\
+	offset = 0x27700ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x17d8ull;			\
+	offset +=  (a) * 0x8ull + (b) * 0x40ull;	\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_SA_PN_TABLE_MEMX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x2f700ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x37d8;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_XPN_THRESHOLD	({		\
+	u64 offset;					\
+							\
+	offset = 0x23e40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xb90ull;			\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_PN_THRESHOLD	({		\
+	u64 offset;					\
+							\
+	offset = 0x23e48ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xb98ull;			\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_PN_THRESH_REACHEDX(a)	({	\
+	u64 offset;					\
+							\
+	offset = 0x23e50ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xba0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_FLOWID_TCAM_ENA_1		0x30708ull
+#define MCSX_CPM_RX_SLAVE_SECY_PLCY_MEM_1X(a)		(0x246e8ull + (a) * 0x10ull)
+
+/* TX registers */
+#define MCSX_CPM_TX_SLAVE_FLOWID_TCAM_DATAX(a, b) ({	\
+	u64 offset;					\
+							\
+	offset = 0x51d50ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa7c0ull;			\
+	offset += (a) * 0x8ull + (b) * 0x20ull;		\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_FLOWID_TCAM_MASKX(a, b) ({	\
+	u64 offset;					\
+							\
+	offset = 0x55d50ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xafc0ull;			\
+	offset += (a) * 0x8ull + (b) * 0x20ull;		\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_FLOWID_TCAM_ENA_0 ({		\
+	u64 offset;					\
+							\
+	offset = 0x51d10ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa7a0ull;			\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SECY_MAP_MEM_0X(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x3e508ull + (a) * 0x8ull;		\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5550ull + (a) * 0x10ull;	\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SECY_PLCY_MEMX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x3ed08ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5950ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SA_KEY_LOCKOUTX(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x3e4c0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5538ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(a) ({		\
+	u64 offset;					\
+							\
+	offset = 0x3fd10ull + (a) * 0x10ull;		\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x6150ull + (a) * 0x8ull;	\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SA_PLCY_MEMX(a, b) ({		\
+	u64 offset;					\
+							\
+	offset = 0x40d10ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x63a0ull;			\
+	offset += (a) * 0x8ull + (b) * 0x80ull;		\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SA_PN_TABLE_MEMX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x50d10ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa3a0ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_XPN_THRESHOLD ({		\
+	u64 offset;					\
+							\
+	offset = 0x3e4b0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5528ull;			\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_PN_THRESHOLD ({		\
+	u64 offset;					\
+							\
+	offset = 0x3e4b8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x5530ull;			\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_SA_MAP_MEM_1X(a)		(0x3fd18ull + (a) * 0x10ull)
+#define MCSX_CPM_TX_SLAVE_SECY_MAP_MEM_1X(a)		(0x5558ull + (a) * 0x10ull)
+#define MCSX_CPM_TX_SLAVE_FLOWID_TCAM_ENA_1		0x51d18ull
+#define MCSX_CPM_TX_SLAVE_TX_SA_ACTIVEX(a)		(0x5b50 + (a) * 0x8ull)
+#define MCSX_CPM_TX_SLAVE_SA_INDEX0_VLDX(a)		(0x5d50 + (a) * 0x8ull)
+#define MCSX_CPM_TX_SLAVE_SA_INDEX1_VLDX(a)		(0x5f50 + (a) * 0x8ull)
+#define MCSX_CPM_TX_SLAVE_AUTO_REKEY_ENABLE_0		0x5500ull
+
 #endif
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
index 9eaa8ee452d5..3c307e707121 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -133,6 +133,380 @@ int rvu_mbox_handler_mcs_custom_tag_cfg_get(struct rvu *rvu, struct mcs_custom_t
 	return 0;
 }
 
+int rvu_mbox_handler_mcs_flowid_ena_entry(struct rvu *rvu,
+					  struct mcs_flowid_ena_dis_entry *req,
+					  struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+	mcs_ena_dis_flowid_entry(mcs, req->flow_id, req->dir, req->ena);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_pn_table_write(struct rvu *rvu,
+					struct mcs_pn_table_write_req *req,
+					struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+	mcs_pn_table_write(mcs, req->pn_id, req->next_pn, req->dir);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_set_pn_threshold(struct rvu *rvu,
+					  struct mcs_set_pn_threshold *req,
+					  struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	mcs_pn_threshold_set(mcs, req);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_rx_sc_sa_map_write(struct rvu *rvu,
+					    struct mcs_rx_sc_sa_map *req,
+					    struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+	mcs->mcs_ops->mcs_rx_sa_mem_map_write(mcs, req);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_tx_sc_sa_map_write(struct rvu *rvu,
+					    struct mcs_tx_sc_sa_map *req,
+					    struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+	mcs->mcs_ops->mcs_tx_sa_mem_map_write(mcs, req);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_sa_plcy_write(struct rvu *rvu,
+				       struct mcs_sa_plcy_write_req *req,
+				       struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+	int i;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	for (i = 0; i < req->sa_cnt; i++)
+		mcs_sa_plcy_write(mcs, &req->plcy[i][0],
+				  req->sa_index[i], req->dir);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_rx_sc_cam_write(struct rvu *rvu,
+					 struct mcs_rx_sc_cam_write_req *req,
+					 struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+	mcs_rx_sc_cam_write(mcs, req->sci, req->secy_id, req->sc_id);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_secy_plcy_write(struct rvu *rvu,
+					 struct mcs_secy_plcy_write_req *req,
+					 struct msg_rsp *rsp)
+{	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	mcs_secy_plcy_write(mcs, req->plcy,
+			    req->secy_id, req->dir);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_flowid_entry_write(struct rvu *rvu,
+					    struct mcs_flowid_entry_write_req *req,
+					    struct msg_rsp *rsp)
+{
+	struct secy_mem_map map;
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	/* TODO validate the flowid */
+	mcs_flowid_entry_write(mcs, req->data, req->mask,
+			       req->flow_id, req->dir);
+	map.secy = req->secy_id;
+	map.sc = req->sc_id;
+	map.ctrl_pkt = req->ctrl_pkt;
+	map.flow_id = req->flow_id;
+	map.sci = req->sci;
+	mcs->mcs_ops->mcs_flowid_secy_map(mcs, &map, req->dir);
+	if (req->ena)
+		mcs_ena_dis_flowid_entry(mcs, req->flow_id,
+					 req->dir, true);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_free_resources(struct rvu *rvu,
+					struct mcs_free_rsrc_req *req,
+					struct msg_rsp *rsp)
+{
+	u16 pcifunc = req->hdr.pcifunc;
+	struct mcs_rsrc_map *map;
+	struct mcs *mcs;
+	int rc;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (req->dir == MCS_RX)
+		map = &mcs->rx;
+	else
+		map = &mcs->tx;
+
+	mutex_lock(&rvu->rsrc_lock);
+	/* Free all the cam resources mapped to PF/VF */
+	if (req->all) {
+		rc = mcs_free_all_rsrc(mcs, req->dir, pcifunc);
+		goto exit;
+	}
+
+	switch (req->rsrc_type) {
+	case MCS_RSRC_TYPE_FLOWID:
+		rc = mcs_free_rsrc(&map->flow_ids, map->flowid2pf_map, req->rsrc_id, pcifunc);
+		mcs_ena_dis_flowid_entry(mcs, req->rsrc_id, req->dir, false);
+		break;
+	case MCS_RSRC_TYPE_SECY:
+		rc =  mcs_free_rsrc(&map->secy, map->secy2pf_map, req->rsrc_id, pcifunc);
+		mcs_clear_secy_plcy(mcs, req->rsrc_id, req->dir);
+		break;
+	case MCS_RSRC_TYPE_SC:
+		rc = mcs_free_rsrc(&map->sc, map->sc2pf_map, req->rsrc_id, pcifunc);
+		/* Disable SC CAM only on RX side */
+		if (req->dir == MCS_RX)
+			mcs_ena_dis_sc_cam_entry(mcs, req->rsrc_id, false);
+		break;
+	case MCS_RSRC_TYPE_SA:
+		rc = mcs_free_rsrc(&map->sa, map->sa2pf_map, req->rsrc_id, pcifunc);
+		break;
+	}
+exit:
+	mutex_unlock(&rvu->rsrc_lock);
+	return rc;
+}
+
+int rvu_mbox_handler_mcs_alloc_resources(struct rvu *rvu,
+					 struct mcs_alloc_rsrc_req *req,
+					 struct mcs_alloc_rsrc_rsp *rsp)
+{
+	u16 pcifunc = req->hdr.pcifunc;
+	struct mcs_rsrc_map *map;
+	struct mcs *mcs;
+	int rsrc_id, i;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (req->dir == MCS_RX)
+		map = &mcs->rx;
+	else
+		map = &mcs->tx;
+
+	mutex_lock(&rvu->rsrc_lock);
+
+	if (req->all) {
+		rsrc_id = mcs_alloc_all_rsrc(mcs, &rsp->flow_ids[0],
+					     &rsp->secy_ids[0],
+					     &rsp->sc_ids[0],
+					     &rsp->sa_ids[0],
+					     &rsp->sa_ids[1],
+					     pcifunc, req->dir);
+		goto exit;
+	}
+
+	switch (req->rsrc_type) {
+	case MCS_RSRC_TYPE_FLOWID:
+		for (i = 0; i < req->rsrc_cnt; i++) {
+			rsrc_id = mcs_alloc_rsrc(&map->flow_ids, map->flowid2pf_map, pcifunc);
+			if (rsrc_id < 0)
+				goto exit;
+			rsp->flow_ids[i] = rsrc_id;
+			rsp->rsrc_cnt++;
+		}
+		break;
+	case MCS_RSRC_TYPE_SECY:
+		for (i = 0; i < req->rsrc_cnt; i++) {
+			rsrc_id = mcs_alloc_rsrc(&map->secy, map->secy2pf_map, pcifunc);
+			if (rsrc_id < 0)
+				goto exit;
+			rsp->secy_ids[i] = rsrc_id;
+			rsp->rsrc_cnt++;
+		}
+		break;
+	case MCS_RSRC_TYPE_SC:
+		for (i = 0; i < req->rsrc_cnt; i++) {
+			rsrc_id = mcs_alloc_rsrc(&map->sc, map->sc2pf_map, pcifunc);
+			if (rsrc_id < 0)
+				goto exit;
+			rsp->sc_ids[i] = rsrc_id;
+			rsp->rsrc_cnt++;
+		}
+		break;
+	case MCS_RSRC_TYPE_SA:
+		for (i = 0; i < req->rsrc_cnt; i++) {
+			rsrc_id = mcs_alloc_rsrc(&map->sa, map->sa2pf_map, pcifunc);
+			if (rsrc_id < 0)
+				goto exit;
+			rsp->sa_ids[i] = rsrc_id;
+			rsp->rsrc_cnt++;
+		}
+		break;
+	}
+
+	rsp->rsrc_type = req->rsrc_type;
+	rsp->dir = req->dir;
+	rsp->mcs_id = req->mcs_id;
+	rsp->all = req->all;
+
+exit:
+	if (rsrc_id < 0)
+		dev_err(rvu->dev, "Failed to allocate the mcs resources for PCIFUNC:%d\n", pcifunc);
+	mutex_unlock(&rvu->rsrc_lock);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_alloc_ctrl_pkt_rule(struct rvu *rvu,
+					     struct mcs_alloc_ctrl_pkt_rule_req *req,
+					     struct mcs_alloc_ctrl_pkt_rule_rsp *rsp)
+{
+	u16 pcifunc = req->hdr.pcifunc;
+	struct mcs_rsrc_map *map;
+	struct mcs *mcs;
+	int rsrc_id;
+	u16 offset;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	map = (req->dir == MCS_RX) ? &mcs->rx : &mcs->tx;
+
+	mutex_lock(&rvu->rsrc_lock);
+
+	switch (req->rule_type) {
+	case MCS_CTRL_PKT_RULE_TYPE_ETH:
+		offset = MCS_CTRLPKT_ETYPE_RULE_OFFSET;
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_DA:
+		offset = MCS_CTRLPKT_DA_RULE_OFFSET;
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_RANGE:
+		offset = MCS_CTRLPKT_DA_RANGE_RULE_OFFSET;
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_COMBO:
+		offset = MCS_CTRLPKT_COMBO_RULE_OFFSET;
+		break;
+	case MCS_CTRL_PKT_RULE_TYPE_MAC:
+		offset = MCS_CTRLPKT_MAC_EN_RULE_OFFSET;
+		break;
+	}
+
+	rsrc_id = mcs_alloc_ctrlpktrule(&map->ctrlpktrule, map->ctrlpktrule2pf_map, offset,
+					pcifunc);
+	if (rsrc_id < 0)
+		goto exit;
+
+	rsp->rule_idx = rsrc_id;
+	rsp->rule_type = req->rule_type;
+	rsp->dir = req->dir;
+	rsp->mcs_id = req->mcs_id;
+
+	mutex_unlock(&rvu->rsrc_lock);
+	return 0;
+exit:
+	if (rsrc_id < 0)
+		dev_err(rvu->dev, "Failed to allocate the mcs ctrl pkt rule for PCIFUNC:%d\n",
+			pcifunc);
+	mutex_unlock(&rvu->rsrc_lock);
+	return rsrc_id;
+}
+
+int rvu_mbox_handler_mcs_free_ctrl_pkt_rule(struct rvu *rvu,
+					    struct mcs_free_ctrl_pkt_rule_req *req,
+					    struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+	int rc;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	mutex_lock(&rvu->rsrc_lock);
+
+	rc = mcs_free_ctrlpktrule(mcs, req);
+
+	mutex_unlock(&rvu->rsrc_lock);
+
+	return rc;
+}
+
+int rvu_mbox_handler_mcs_ctrl_pkt_rule_write(struct rvu *rvu,
+					     struct mcs_ctrl_pkt_rule_write_req *req,
+					     struct msg_rsp *rsp)
+{
+	struct mcs *mcs;
+	int rc;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	rc = mcs_ctrlpktrule_write(mcs, req);
+
+	return rc;
+}
+
 static void rvu_mcs_set_lmac_bmap(struct rvu *rvu)
 {
 	struct mcs *mcs = mcs_get_pdata(0);
-- 
cgit v1.2.3


From bd69476e86fcce8b9e408ed801ac5a794ce8b13d Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:45 +0530
Subject: octeontx2-af: cn10k: mcs: Install a default TCAM for normal traffic

Out of all the TCAM entries, reserve last TX and RX TCAM flow
entry(low priority) so that normal traffic can be sent out and
received. The traffic which needs macsec processing hits the
high priority TCAM flows. Also install a FLR handler to free
the allocated resources for PF/VF.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/mcs.c    | 45 ++++++++++++++++++++++
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 23 +++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |  1 +
 3 files changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
index 2f48fb98572e..5c8a5bc6711f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
@@ -181,6 +181,51 @@ void mcs_flowid_entry_write(struct mcs *mcs, u64 *data, u64 *mask, int flow_id,
 	}
 }
 
+int mcs_install_flowid_bypass_entry(struct mcs *mcs)
+{
+	int flow_id, secy_id, reg_id;
+	struct secy_mem_map map;
+	u64 reg, plcy = 0;
+
+	/* Flow entry */
+	flow_id = mcs->hw->tcam_entries - MCS_RSRC_RSVD_CNT;
+	for (reg_id = 0; reg_id < 4; reg_id++) {
+		reg = MCSX_CPM_RX_SLAVE_FLOWID_TCAM_MASKX(reg_id, flow_id);
+		mcs_reg_write(mcs, reg, GENMASK_ULL(63, 0));
+	}
+	for (reg_id = 0; reg_id < 4; reg_id++) {
+		reg = MCSX_CPM_TX_SLAVE_FLOWID_TCAM_MASKX(reg_id, flow_id);
+		mcs_reg_write(mcs, reg, GENMASK_ULL(63, 0));
+	}
+	/* secy */
+	secy_id = mcs->hw->secy_entries - MCS_RSRC_RSVD_CNT;
+
+	/* Set validate frames to NULL and enable control port */
+	plcy = 0x7ull;
+	if (mcs->hw->mcs_blks > 1)
+		plcy = BIT_ULL(0) | 0x3ull << 4;
+	mcs_secy_plcy_write(mcs, plcy, secy_id, MCS_RX);
+
+	/* Enable control port and set mtu to max */
+	plcy = BIT_ULL(0) | GENMASK_ULL(43, 28);
+	if (mcs->hw->mcs_blks > 1)
+		plcy = BIT_ULL(0) | GENMASK_ULL(63, 48);
+	mcs_secy_plcy_write(mcs, plcy, secy_id, MCS_TX);
+
+	/* Map flowid to secy */
+	map.secy = secy_id;
+	map.ctrl_pkt = 0;
+	map.flow_id = flow_id;
+	mcs->mcs_ops->mcs_flowid_secy_map(mcs, &map, MCS_RX);
+	map.sc = secy_id;
+	mcs->mcs_ops->mcs_flowid_secy_map(mcs, &map, MCS_TX);
+
+	/* Enable Flowid entry */
+	mcs_ena_dis_flowid_entry(mcs, flow_id, MCS_RX, true);
+	mcs_ena_dis_flowid_entry(mcs, flow_id, MCS_TX, true);
+	return 0;
+}
+
 void mcs_clear_secy_plcy(struct mcs *mcs, int secy_id, int dir)
 {
 	struct mcs_rsrc_map *map;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
index 3c307e707121..8a7d4556c28f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -133,6 +133,27 @@ int rvu_mbox_handler_mcs_custom_tag_cfg_get(struct rvu *rvu, struct mcs_custom_t
 	return 0;
 }
 
+int rvu_mcs_flr_handler(struct rvu *rvu, u16 pcifunc)
+{
+	struct mcs *mcs;
+	int mcs_id;
+
+	/* CNF10K-B mcs0-6 are mapped to RPM2-8*/
+	if (rvu->mcs_blk_cnt > 1) {
+		for (mcs_id = 0; mcs_id < rvu->mcs_blk_cnt; mcs_id++) {
+			mcs = mcs_get_pdata(mcs_id);
+			mcs_free_all_rsrc(mcs, MCS_RX, pcifunc);
+			mcs_free_all_rsrc(mcs, MCS_TX, pcifunc);
+		}
+	} else {
+		/* CN10K-B has only one mcs block */
+		mcs = mcs_get_pdata(0);
+		mcs_free_all_rsrc(mcs, MCS_RX, pcifunc);
+		mcs_free_all_rsrc(mcs, MCS_TX, pcifunc);
+	}
+	return 0;
+}
+
 int rvu_mbox_handler_mcs_flowid_ena_entry(struct rvu *rvu,
 					  struct mcs_flowid_ena_dis_entry *req,
 					  struct msg_rsp *rsp)
@@ -543,8 +564,10 @@ int rvu_mcs_init(struct rvu *rvu)
 		rvu_mcs_set_lmac_bmap(rvu);
 	}
 
+	/* Install default tcam bypass entry and set port to operational mode */
 	for (mcs_id = 0; mcs_id < rvu->mcs_blk_cnt; mcs_id++) {
 		mcs = mcs_get_pdata(mcs_id);
+		mcs_install_flowid_bypass_entry(mcs);
 		for (lmac = 0; lmac < mcs->hw->lmac_cnt; lmac++)
 			mcs_set_lmac_mode(mcs, lmac, 0);
 	}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 9a150dacb012..4aefe47134d0 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -875,5 +875,6 @@ int rvu_npc_set_parse_mode(struct rvu *rvu, u16 pcifunc, u64 mode, u8 dir,
 
 /* CN10K MCS */
 int rvu_mcs_init(struct rvu *rvu);
+int rvu_mcs_flr_handler(struct rvu *rvu, u16 pcifunc);
 
 #endif /* RVU_H */
-- 
cgit v1.2.3


From 9312150af8da446d54a4bf73cfcab6896ccba84b Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:46 +0530
Subject: octeontx2-af: cn10k: mcs: Support for stats collection

Add mailbox messages to return the resource stats to the
caller. Stats of SecY, SC and SAs as per the macsec standard,
TCAM flow id hits/miss, mailbox to clear the stats are
implemented.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Ankur Dwivedi <adwivedi@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   | 111 ++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.c    | 310 +++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.h    |  13 +
 .../ethernet/marvell/octeontx2/af/mcs_cnf10kb.c    |  26 ++
 .../net/ethernet/marvell/octeontx2/af/mcs_reg.h    | 435 +++++++++++++++++++++
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 153 ++++++++
 6 files changed, 1048 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index 3213b1512263..e01a705b5012 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -317,6 +317,15 @@ M(MCS_PN_TABLE_WRITE,	0xa009, mcs_pn_table_write, mcs_pn_table_write_req,	\
 M(MCS_SET_ACTIVE_LMAC,	0xa00a,	mcs_set_active_lmac, mcs_set_active_lmac,	\
 				msg_rsp)					\
 M(MCS_GET_HW_INFO,	0xa00b,	mcs_get_hw_info, msg_req, mcs_hw_info)		\
+M(MCS_GET_FLOWID_STATS, 0xa00c, mcs_get_flowid_stats, mcs_stats_req,		\
+				mcs_flowid_stats)				\
+M(MCS_GET_SECY_STATS,	0xa00d, mcs_get_secy_stats, mcs_stats_req,		\
+				mcs_secy_stats)					\
+M(MCS_GET_SC_STATS,	0xa00e, mcs_get_sc_stats, mcs_stats_req, mcs_sc_stats)	\
+M(MCS_GET_SA_STATS,	0xa00f, mcs_get_sa_stats, mcs_stats_req, mcs_sa_stats)	\
+M(MCS_GET_PORT_STATS,	0xa010, mcs_get_port_stats, mcs_stats_req,		\
+				mcs_port_stats)					\
+M(MCS_CLEAR_STATS,	0xa011,	mcs_clear_stats, mcs_clear_stats, msg_rsp)	\
 M(MCS_SET_LMAC_MODE,	0xa013, mcs_set_lmac_mode, mcs_set_lmac_mode, msg_rsp)	\
 M(MCS_SET_PN_THRESHOLD, 0xa014, mcs_set_pn_threshold, mcs_set_pn_threshold,	\
 				msg_rsp)					\
@@ -1973,4 +1982,106 @@ struct mcs_ctrl_pkt_rule_write_req {
 	u64 rsvd;
 };
 
+struct mcs_stats_req {
+	struct mbox_msghdr hdr;
+	u8 id;
+	u8 mcs_id;
+	u8 dir;
+	u64 rsvd;
+};
+
+struct mcs_flowid_stats {
+	struct mbox_msghdr hdr;
+	u64 tcam_hit_cnt;
+	u64 rsvd;
+};
+
+struct mcs_secy_stats {
+	struct mbox_msghdr hdr;
+	u64 ctl_pkt_bcast_cnt;
+	u64 ctl_pkt_mcast_cnt;
+	u64 ctl_pkt_ucast_cnt;
+	u64 ctl_octet_cnt;
+	u64 unctl_pkt_bcast_cnt;
+	u64 unctl_pkt_mcast_cnt;
+	u64 unctl_pkt_ucast_cnt;
+	u64 unctl_octet_cnt;
+	/* Valid only for RX */
+	u64 octet_decrypted_cnt;
+	u64 octet_validated_cnt;
+	u64 pkt_port_disabled_cnt;
+	u64 pkt_badtag_cnt;
+	u64 pkt_nosa_cnt;
+	u64 pkt_nosaerror_cnt;
+	u64 pkt_tagged_ctl_cnt;
+	u64 pkt_untaged_cnt;
+	u64 pkt_ctl_cnt;	/* CN10K-B */
+	u64 pkt_notag_cnt;	/* CNF10K-B */
+	/* Valid only for TX */
+	u64 octet_encrypted_cnt;
+	u64 octet_protected_cnt;
+	u64 pkt_noactivesa_cnt;
+	u64 pkt_toolong_cnt;
+	u64 pkt_untagged_cnt;
+	u64 rsvd[4];
+};
+
+struct mcs_port_stats {
+	struct mbox_msghdr hdr;
+	u64 tcam_miss_cnt;
+	u64 parser_err_cnt;
+	u64 preempt_err_cnt;  /* CNF10K-B */
+	u64 sectag_insert_err_cnt;
+	u64 rsvd[4];
+};
+
+/* Only for CN10K-B */
+struct mcs_sa_stats {
+	struct mbox_msghdr hdr;
+	/* RX */
+	u64 pkt_invalid_cnt;
+	u64 pkt_nosaerror_cnt;
+	u64 pkt_notvalid_cnt;
+	u64 pkt_ok_cnt;
+	u64 pkt_nosa_cnt;
+	/* TX */
+	u64 pkt_encrypt_cnt;
+	u64 pkt_protected_cnt;
+	u64 rsvd[4];
+};
+
+struct mcs_sc_stats {
+	struct mbox_msghdr hdr;
+	/* RX */
+	u64 hit_cnt;
+	u64 pkt_invalid_cnt;
+	u64 pkt_late_cnt;
+	u64 pkt_notvalid_cnt;
+	u64 pkt_unchecked_cnt;
+	u64 pkt_delay_cnt;	/* CNF10K-B */
+	u64 pkt_ok_cnt;		/* CNF10K-B */
+	u64 octet_decrypt_cnt;	/* CN10K-B */
+	u64 octet_validate_cnt;	/* CN10K-B */
+	/* TX */
+	u64 pkt_encrypt_cnt;
+	u64 pkt_protected_cnt;
+	u64 octet_encrypt_cnt;		/* CN10K-B */
+	u64 octet_protected_cnt;	/* CN10K-B */
+	u64 rsvd[4];
+};
+
+struct mcs_clear_stats {
+	struct mbox_msghdr hdr;
+#define MCS_FLOWID_STATS	0
+#define MCS_SECY_STATS		1
+#define MCS_SC_STATS		2
+#define MCS_SA_STATS		3
+#define MCS_PORT_STATS		4
+	u8 type;	/* FLOWID, SECY, SC, SA, PORT */
+	u8 id;		/* type = PORT, If id = FF(invalid) port no is derived from pcifunc */
+	u8 mcs_id;
+	u8 dir;
+	u8 all;		/* All resources stats mapped to PF are cleared */
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
index 5c8a5bc6711f..002ccb0a1710 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
@@ -24,6 +24,311 @@ static const struct pci_device_id mcs_id_table[] = {
 
 static LIST_HEAD(mcs_list);
 
+void mcs_get_tx_secy_stats(struct mcs *mcs, struct mcs_secy_stats *stats, int id)
+{
+	u64 reg;
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLBCPKTSX(id);
+	stats->ctl_pkt_bcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLMCPKTSX(id);
+	stats->ctl_pkt_mcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLOCTETSX(id);
+	stats->ctl_octet_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLUCPKTSX(id);
+	stats->ctl_pkt_ucast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLBCPKTSX(id);
+	stats->unctl_pkt_bcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLMCPKTSX(id);
+	stats->unctl_pkt_mcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLOCTETSX(id);
+	stats->unctl_octet_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLUCPKTSX(id);
+	stats->unctl_pkt_ucast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSECYENCRYPTEDX(id);
+	stats->octet_encrypted_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSECYPROTECTEDX(id);
+	stats->octet_protected_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECYNOACTIVESAX(id);
+	stats->pkt_noactivesa_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECYTOOLONGX(id);
+	stats->pkt_toolong_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECYUNTAGGEDX(id);
+	stats->pkt_untagged_cnt =  mcs_reg_read(mcs, reg);
+}
+
+void mcs_get_rx_secy_stats(struct mcs *mcs, struct mcs_secy_stats *stats, int id)
+{
+	u64 reg;
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINCTLBCPKTSX(id);
+	stats->ctl_pkt_bcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINCTLMCPKTSX(id);
+	stats->ctl_pkt_mcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINCTLOCTETSX(id);
+	stats->ctl_octet_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINCTLUCPKTSX(id);
+	stats->ctl_pkt_ucast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLBCPKTSX(id);
+	stats->unctl_pkt_bcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLMCPKTSX(id);
+	stats->unctl_pkt_mcast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLOCTETSX(id);
+	stats->unctl_octet_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLUCPKTSX(id);
+	stats->unctl_pkt_ucast_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INOCTETSSECYDECRYPTEDX(id);
+	stats->octet_decrypted_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INOCTETSSECYVALIDATEX(id);
+	stats->octet_validated_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSCTRLPORTDISABLEDX(id);
+	stats->pkt_port_disabled_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYBADTAGX(id);
+	stats->pkt_badtag_cnt =  mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYNOSAX(id);
+	stats->pkt_nosa_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYNOSAERRORX(id);
+	stats->pkt_nosaerror_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYTAGGEDCTLX(id);
+	stats->pkt_tagged_ctl_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYUNTAGGEDORNOTAGX(id);
+	stats->pkt_untaged_cnt = mcs_reg_read(mcs, reg);
+
+	reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYCTLX(id);
+	stats->pkt_ctl_cnt = mcs_reg_read(mcs, reg);
+
+	if (mcs->hw->mcs_blks > 1) {
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYNOTAGX(id);
+		stats->pkt_notag_cnt = mcs_reg_read(mcs, reg);
+	}
+}
+
+void mcs_get_flowid_stats(struct mcs *mcs, struct mcs_flowid_stats *stats,
+			  int id, int dir)
+{
+	u64 reg;
+
+	if (dir == MCS_RX)
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSFLOWIDTCAMHITX(id);
+	else
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSFLOWIDTCAMHITX(id);
+
+	stats->tcam_hit_cnt = mcs_reg_read(mcs, reg);
+}
+
+void mcs_get_port_stats(struct mcs *mcs, struct mcs_port_stats *stats,
+			int id, int dir)
+{
+	u64 reg;
+
+	if (dir == MCS_RX) {
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSFLOWIDTCAMMISSX(id);
+		stats->tcam_miss_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSPARSEERRX(id);
+		stats->parser_err_cnt = mcs_reg_read(mcs, reg);
+		if (mcs->hw->mcs_blks > 1) {
+			reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSEARLYPREEMPTERRX(id);
+			stats->preempt_err_cnt = mcs_reg_read(mcs, reg);
+		}
+	} else {
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSFLOWIDTCAMMISSX(id);
+		stats->tcam_miss_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSPARSEERRX(id);
+		stats->parser_err_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECTAGINSERTIONERRX(id);
+		stats->sectag_insert_err_cnt = mcs_reg_read(mcs, reg);
+	}
+}
+
+void mcs_get_sa_stats(struct mcs *mcs, struct mcs_sa_stats *stats, int id, int dir)
+{
+	u64 reg;
+
+	if (dir == MCS_RX) {
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSAINVALIDX(id);
+		stats->pkt_invalid_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSANOTUSINGSAERRORX(id);
+		stats->pkt_nosaerror_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSANOTVALIDX(id);
+		stats->pkt_notvalid_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSAOKX(id);
+		stats->pkt_ok_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSAUNUSEDSAX(id);
+		stats->pkt_nosa_cnt = mcs_reg_read(mcs, reg);
+	} else {
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSAENCRYPTEDX(id);
+		stats->pkt_encrypt_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSAPROTECTEDX(id);
+		stats->pkt_protected_cnt = mcs_reg_read(mcs, reg);
+	}
+}
+
+void mcs_get_sc_stats(struct mcs *mcs, struct mcs_sc_stats *stats,
+		      int id, int dir)
+{
+	u64 reg;
+
+	if (dir == MCS_RX) {
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCCAMHITX(id);
+		stats->hit_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCINVALIDX(id);
+		stats->pkt_invalid_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCLATEORDELAYEDX(id);
+		stats->pkt_late_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCNOTVALIDX(id);
+		stats->pkt_notvalid_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCUNCHECKEDOROKX(id);
+		stats->pkt_unchecked_cnt = mcs_reg_read(mcs, reg);
+
+		if (mcs->hw->mcs_blks > 1) {
+			reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCDELAYEDX(id);
+			stats->pkt_delay_cnt = mcs_reg_read(mcs, reg);
+
+			reg = MCSX_CSE_RX_MEM_SLAVE_INPKTSSCOKX(id);
+			stats->pkt_ok_cnt = mcs_reg_read(mcs, reg);
+		}
+		if (mcs->hw->mcs_blks == 1) {
+			reg = MCSX_CSE_RX_MEM_SLAVE_INOCTETSSCDECRYPTEDX(id);
+			stats->octet_decrypt_cnt = mcs_reg_read(mcs, reg);
+
+			reg = MCSX_CSE_RX_MEM_SLAVE_INOCTETSSCVALIDATEX(id);
+			stats->octet_validate_cnt = mcs_reg_read(mcs, reg);
+		}
+	} else {
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSCENCRYPTEDX(id);
+		stats->pkt_encrypt_cnt = mcs_reg_read(mcs, reg);
+
+		reg = MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSCPROTECTEDX(id);
+		stats->pkt_protected_cnt = mcs_reg_read(mcs, reg);
+
+		if (mcs->hw->mcs_blks == 1) {
+			reg = MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSCENCRYPTEDX(id);
+			stats->octet_encrypt_cnt = mcs_reg_read(mcs, reg);
+
+			reg = MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSCPROTECTEDX(id);
+			stats->octet_protected_cnt = mcs_reg_read(mcs, reg);
+		}
+	}
+}
+
+void mcs_clear_stats(struct mcs *mcs, u8 type, u8 id, int dir)
+{
+	struct mcs_flowid_stats flowid_st;
+	struct mcs_port_stats port_st;
+	struct mcs_secy_stats secy_st;
+	struct mcs_sc_stats sc_st;
+	struct mcs_sa_stats sa_st;
+	u64 reg;
+
+	if (dir == MCS_RX)
+		reg = MCSX_CSE_RX_SLAVE_CTRL;
+	else
+		reg = MCSX_CSE_TX_SLAVE_CTRL;
+
+	mcs_reg_write(mcs, reg, BIT_ULL(0));
+
+	switch (type) {
+	case MCS_FLOWID_STATS:
+		mcs_get_flowid_stats(mcs, &flowid_st, id, dir);
+		break;
+	case MCS_SECY_STATS:
+		if (dir == MCS_RX)
+			mcs_get_rx_secy_stats(mcs, &secy_st, id);
+		else
+			mcs_get_tx_secy_stats(mcs, &secy_st, id);
+		break;
+	case MCS_SC_STATS:
+		mcs_get_sc_stats(mcs, &sc_st, id, dir);
+		break;
+	case MCS_SA_STATS:
+		mcs_get_sa_stats(mcs, &sa_st, id, dir);
+		break;
+	case MCS_PORT_STATS:
+		mcs_get_port_stats(mcs, &port_st, id, dir);
+		break;
+	}
+
+	mcs_reg_write(mcs, reg, 0x0);
+}
+
+int mcs_clear_all_stats(struct mcs *mcs, u16 pcifunc, int dir)
+{
+	struct mcs_rsrc_map *map;
+	int id;
+
+	if (dir == MCS_RX)
+		map = &mcs->rx;
+	else
+		map = &mcs->tx;
+
+	/* Clear FLOWID stats */
+	for (id = 0; id < map->flow_ids.max; id++) {
+		if (map->flowid2pf_map[id] != pcifunc)
+			continue;
+		mcs_clear_stats(mcs, MCS_FLOWID_STATS, id, dir);
+	}
+
+	/* Clear SECY stats */
+	for (id = 0; id < map->secy.max; id++) {
+		if (map->secy2pf_map[id] != pcifunc)
+			continue;
+		mcs_clear_stats(mcs, MCS_SECY_STATS, id, dir);
+	}
+
+	/* Clear SC stats */
+	for (id = 0; id < map->secy.max; id++) {
+		if (map->sc2pf_map[id] != pcifunc)
+			continue;
+		mcs_clear_stats(mcs, MCS_SC_STATS, id, dir);
+	}
+
+	/* Clear SA stats */
+	for (id = 0; id < map->sa.max; id++) {
+		if (map->sa2pf_map[id] != pcifunc)
+			continue;
+		mcs_clear_stats(mcs, MCS_SA_STATS, id, dir);
+	}
+	return 0;
+}
+
 void mcs_pn_table_write(struct mcs *mcs, u8 pn_id, u64 next_pn, u8 dir)
 {
 	u64 reg;
@@ -816,6 +1121,10 @@ static void mcs_global_cfg(struct mcs *mcs)
 	/* Disable external bypass */
 	mcs_set_external_bypass(mcs, false);
 
+	/* Reset TX/RX stats memory */
+	mcs_reg_write(mcs, MCSX_CSE_RX_SLAVE_STATS_CLEAR, 0x1F);
+	mcs_reg_write(mcs, MCSX_CSE_TX_SLAVE_STATS_CLEAR, 0x1F);
+
 	/* Set MCS to perform standard IEEE802.1AE macsec processing */
 	if (mcs->hw->mcs_blks == 1) {
 		mcs_reg_write(mcs, MCSX_IP_MODE, BIT_ULL(3));
@@ -921,6 +1230,7 @@ static int mcs_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	mcs->mcs_ops->mcs_parser_cfg(mcs);
 
 	list_add(&mcs->mcs_list, &mcs_list);
+	mutex_init(&mcs->stats_lock);
 
 	return 0;
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
index 615a3ad3cddb..28600ef3ad34 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
@@ -85,6 +85,8 @@ struct mcs {
 	u8			mcs_id;
 	struct mcs_ops		*mcs_ops;
 	struct list_head	mcs_list;
+	/* Lock for mcs stats */
+	struct mutex		stats_lock;
 };
 
 struct mcs_ops {
@@ -156,4 +158,15 @@ void cnf10kb_mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int
 void cnf10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map);
 void cnf10kb_mcs_parser_cfg(struct mcs *mcs);
 
+/* Stats APIs */
+void mcs_get_sc_stats(struct mcs *mcs, struct mcs_sc_stats *stats, int id, int dir);
+void mcs_get_sa_stats(struct mcs *mcs, struct mcs_sa_stats *stats, int id, int dir);
+void mcs_get_port_stats(struct mcs *mcs, struct mcs_port_stats *stats, int id, int dir);
+void mcs_get_flowid_stats(struct mcs *mcs, struct mcs_flowid_stats *stats, int id, int dir);
+void mcs_get_rx_secy_stats(struct mcs *mcs, struct mcs_secy_stats *stats, int id);
+void mcs_get_tx_secy_stats(struct mcs *mcs, struct mcs_secy_stats *stats, int id);
+void mcs_clear_stats(struct mcs *mcs, u8 type, u8 id, int dir);
+int mcs_clear_all_stats(struct mcs *mcs, u16 pcifunc, int dir);
+int mcs_set_force_clk_en(struct mcs *mcs, bool set);
+
 #endif /* MCS_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
index f375402d67d5..5ed5deb78fc4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
@@ -118,3 +118,29 @@ void cnf10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *m
 	reg = MCSX_CPM_RX_SLAVE_SA_MAP_MEMX((4 * map->sc_id) + map->an);
 	mcs_reg_write(mcs, reg, val);
 }
+
+int mcs_set_force_clk_en(struct mcs *mcs, bool set)
+{
+	unsigned long timeout = jiffies + usecs_to_jiffies(2000);
+	u64 val;
+
+	val = mcs_reg_read(mcs, MCSX_MIL_GLOBAL);
+
+	if (set) {
+		val |= BIT_ULL(4);
+		mcs_reg_write(mcs, MCSX_MIL_GLOBAL, val);
+
+		/* Poll till mcsx_mil_ip_gbl_status.mcs_ip_stats_ready value is 1 */
+		while (!(mcs_reg_read(mcs, MCSX_MIL_IP_GBL_STATUS) & BIT_ULL(0))) {
+			if (time_after(jiffies, timeout)) {
+				dev_err(mcs->dev, "MCS set force clk enable failed\n");
+				break;
+			}
+		}
+	} else {
+		val &= ~BIT_ULL(4);
+		mcs_reg_write(mcs, MCSX_MIL_GLOBAL, val);
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
index e192a68bff36..12be9f997316 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
@@ -72,6 +72,14 @@
 		offset = 0x600c8ull;			\
 	offset; })
 
+#define MCSX_MIL_IP_GBL_STATUS ({			\
+	u64 offset;					\
+							\
+	offset = 0x800d0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x600d0ull;			\
+	offset; })
+
 /* PAB */
 #define MCSX_PAB_RX_SLAVE_PORT_CFGX(a) ({	\
 	u64 offset;				\
@@ -496,4 +504,431 @@
 #define MCSX_CPM_TX_SLAVE_SA_INDEX1_VLDX(a)		(0x5f50 + (a) * 0x8ull)
 #define MCSX_CPM_TX_SLAVE_AUTO_REKEY_ENABLE_0		0x5500ull
 
+/* CSE */
+#define MCSX_CSE_RX_MEM_SLAVE_IFINCTLBCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x9e80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xc218ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_IFINCTLMCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x9680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xc018ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_IFINCTLOCTETSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x6e80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xbc18ull;			\
+	offset +=  (a) * 0x8ull;			\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_IFINCTLUCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x8e80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xbe18ull;			\
+	offset +=  (a) * 0x8ull;			\
+	offset; })
+
+#define	MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLBCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x8680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xca18ull;			\
+	offset +=  (a) * 0x8ull;			\
+	offset; })
+
+#define	MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLMCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x7e80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xc818ull;			\
+	offset +=  (a) * 0x8ull;			\
+	offset; })
+
+#define	MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLOCTETSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x6680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xc418ull;			\
+	offset +=  (a) * 0x8ull;			\
+	offset; })
+
+#define	MCSX_CSE_RX_MEM_SLAVE_IFINUNCTLUCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x7680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xc618ull;			\
+	offset +=  (a) * 0x8ull;			\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INOCTETSSECYDECRYPTEDX(a) ({ \
+	u64 offset;					\
+							\
+	offset = 0x5e80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xdc18ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INOCTETSSECYVALIDATEX(a)({ \
+	u64 offset;					\
+							\
+	offset = 0x5680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xda18ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSCTRLPORTDISABLEDX(a) ({ \
+	u64 offset;					\
+							\
+	offset = 0xd680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xce18ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSFLOWIDTCAMHITX(a) ({ \
+	u64 offset;					\
+							\
+	offset = 0x16a80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xec78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSFLOWIDTCAMMISSX(a) ({ \
+	u64 offset;					\
+							\
+	offset = 0x16680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xec38ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSPARSEERRX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x16880ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xec18ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCCAMHITX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0xfe80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xde18ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCINVALIDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x10680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xe418ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCNOTVALIDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x10e80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xe218ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYBADTAGX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0xae80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xd418ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYNOSAX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0xc680ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xd618ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYNOSAERRORX(a) ({ \
+	u64 offset;					\
+							\
+	offset = 0xce80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xd818ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYTAGGEDCTLX(a) ({ \
+	u64 offset;					\
+							\
+	offset = 0xbe80ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xcc18ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_RX_SLAVE_CTRL	({			\
+	u64 offset;					\
+							\
+	offset = 0x52a0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x9c0ull;			\
+	offset; })
+
+#define MCSX_CSE_RX_SLAVE_STATS_CLEAR	({		\
+	u64 offset;					\
+							\
+	offset = 0x52b8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x9d8ull;			\
+	offset; })
+
+#define MCSX_CSE_RX_MEM_SLAVE_INOCTETSSCDECRYPTEDX(a)	(0xe680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INOCTETSSCVALIDATEX(a)	(0xde80ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYUNTAGGEDORNOTAGX(a)	(0xa680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYNOTAGX(a)	(0xd218 + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYUNTAGGEDX(a)	(0xd018ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCUNCHECKEDOROKX(a)	(0xee80ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSECYCTLX(a)		(0xb680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCLATEORDELAYEDX(a) (0xf680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSAINVALIDX(a)	(0x12680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSANOTUSINGSAERRORX(a) (0x15680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSANOTVALIDX(a)	(0x13680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSAOKX(a)		(0x11680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSAUNUSEDSAX(a)	(0x14680ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSEARLYPREEMPTERRX(a) (0xec58ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCOKX(a)		(0xea18ull + (a) * 0x8ull)
+#define MCSX_CSE_RX_MEM_SLAVE_INPKTSSCDELAYEDX(a)	(0xe618ull + (a) * 0x8ull)
+
+/* CSE TX */
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTCOMMONOCTETSX(a)	(0x18440ull + (a) * 0x8ull)
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLBCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1c440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xf478ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLMCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1bc40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xf278ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLOCTETSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x19440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xee78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTCTLUCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1b440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xf078ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLBCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1ac40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xfc78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLMCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1a440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xfa78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLOCTETSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x18c40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xf678ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_IFOUTUNCTLUCPKTSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x19c40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xf878ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSECYENCRYPTEDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x17c40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10878ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSECYPROTECTEDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x17440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10678ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSCTRLPORTDISABLEDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1e440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xfe78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSFLOWIDTCAMHITX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x23240ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10ed8ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSFLOWIDTCAMMISSX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x22c40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10e98ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSPARSEERRX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x22e40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10e78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSCENCRYPTEDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x20440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10c78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSCPROTECTEDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1fc40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10a78ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECTAGINSERTIONERRX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x23040ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x110d8ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECYNOACTIVESAX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1dc40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10278ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECYTOOLONGX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1d440ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10478ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSECYUNTAGGEDX(a) ({	\
+	u64 offset;					\
+							\
+	offset = 0x1cc40ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0x10078ull;			\
+	offset += (a) * 0x8ull;				\
+	offset; })
+
+#define MCSX_CSE_TX_SLAVE_CTRL	({	\
+	u64 offset;					\
+							\
+	offset = 0x54a0ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa00ull;			\
+	offset; })
+
+#define MCSX_CSE_TX_SLAVE_STATS_CLEAR ({		\
+	u64 offset;					\
+							\
+	offset = 0x54b8ull;				\
+	if (mcs->hw->mcs_blks > 1)			\
+		offset = 0xa18ull;			\
+	offset; })
+
+#define MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSCENCRYPTEDX(a)	(0x1f440ull + (a) * 0x8ull)
+#define MCSX_CSE_TX_MEM_SLAVE_OUTOCTETSSCPROTECTEDX(a)	(0x1ec40ull + (a) * 0x8ull)
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSEARLYPREEMPTERRX(a) (0x10eb8ull + (a) * 0x8ull)
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSAENCRYPTEDX(a)	(0x21c40ull + (a) * 0x8ull)
+#define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSAPROTECTEDX(a)	(0x20c40ull + (a) * 0x8ull)
+
 #endif
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
index 8a7d4556c28f..939c9b65f078 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -64,6 +64,159 @@ int rvu_mbox_handler_mcs_port_reset(struct rvu *rvu, struct mcs_port_reset_req *
 	return 0;
 }
 
+int rvu_mbox_handler_mcs_clear_stats(struct rvu *rvu,
+				     struct mcs_clear_stats *req,
+				     struct msg_rsp *rsp)
+{
+	u16 pcifunc = req->hdr.pcifunc;
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	mutex_lock(&mcs->stats_lock);
+	if (req->all)
+		mcs_clear_all_stats(mcs, pcifunc, req->dir);
+	else
+		mcs_clear_stats(mcs, req->type, req->id, req->dir);
+
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_get_flowid_stats(struct rvu *rvu,
+					  struct mcs_stats_req *req,
+					  struct mcs_flowid_stats *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	/* In CNF10K-B, before reading the statistics,
+	 * MCSX_MIL_GLOBAL.FORCE_CLK_EN_IP needs to be set
+	 * to get accurate statistics
+	 */
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, true);
+
+	mutex_lock(&mcs->stats_lock);
+	mcs_get_flowid_stats(mcs, rsp, req->id, req->dir);
+	mutex_unlock(&mcs->stats_lock);
+
+	/* Clear MCSX_MIL_GLOBAL.FORCE_CLK_EN_IP after reading
+	 * the statistics
+	 */
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, false);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_get_secy_stats(struct rvu *rvu,
+					struct mcs_stats_req *req,
+					struct mcs_secy_stats *rsp)
+{	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, true);
+
+	mutex_lock(&mcs->stats_lock);
+
+	if (req->dir == MCS_RX)
+		mcs_get_rx_secy_stats(mcs, rsp, req->id);
+	else
+		mcs_get_tx_secy_stats(mcs, rsp, req->id);
+
+	mutex_unlock(&mcs->stats_lock);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, false);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_get_sc_stats(struct rvu *rvu,
+				      struct mcs_stats_req *req,
+				      struct mcs_sc_stats *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, true);
+
+	mutex_lock(&mcs->stats_lock);
+	mcs_get_sc_stats(mcs, rsp, req->id, req->dir);
+	mutex_unlock(&mcs->stats_lock);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, false);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_get_sa_stats(struct rvu *rvu,
+				      struct mcs_stats_req *req,
+				      struct mcs_sa_stats *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, true);
+
+	mutex_lock(&mcs->stats_lock);
+	mcs_get_sa_stats(mcs, rsp, req->id, req->dir);
+	mutex_unlock(&mcs->stats_lock);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, false);
+
+	return 0;
+}
+
+int rvu_mbox_handler_mcs_get_port_stats(struct rvu *rvu,
+					struct mcs_stats_req *req,
+					struct mcs_port_stats *rsp)
+{
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, true);
+
+	mutex_lock(&mcs->stats_lock);
+	mcs_get_port_stats(mcs, rsp, req->id, req->dir);
+	mutex_unlock(&mcs->stats_lock);
+
+	if (mcs->hw->mcs_blks > 1)
+		mcs_set_force_clk_en(mcs, false);
+
+	return 0;
+}
+
 int rvu_mbox_handler_mcs_set_active_lmac(struct rvu *rvu,
 					 struct mcs_set_active_lmac *req,
 					 struct msg_rsp *rsp)
-- 
cgit v1.2.3


From 6c635f78c4749d09f2fd100e1a4d328fca3f5bd7 Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:47 +0530
Subject: octeontx2-af: cn10k: mcs: Handle MCS block interrupts

Hardware triggers an interrupt for events like PN wrap to zero,
PN crosses set threshold. This interrupt is received
by the MCS_AF. MCS AF then finds the PF/VF to which SA is mapped
and notifies them using mcs_intr_notify mbox message.

PF/VF using mcs_intr_cfg mbox can configure the list
of interrupts for which they want to receive the
notification from AF.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Vamsi Attunuru <vattunuru@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/mbox.h   |  38 +++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.c    | 337 +++++++++++++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/mcs.h    |  74 +++++
 .../ethernet/marvell/octeontx2/af/mcs_cnf10kb.c    |  86 +++++-
 .../net/ethernet/marvell/octeontx2/af/mcs_reg.h    | 168 ++++++++++
 .../net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c | 160 ++++++++++
 drivers/net/ethernet/marvell/octeontx2/af/rvu.c    |   6 +-
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |   8 +
 8 files changed, 865 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
index e01a705b5012..8d5d5a0f68c4 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h
@@ -326,6 +326,7 @@ M(MCS_GET_SA_STATS,	0xa00f, mcs_get_sa_stats, mcs_stats_req, mcs_sa_stats)	\
 M(MCS_GET_PORT_STATS,	0xa010, mcs_get_port_stats, mcs_stats_req,		\
 				mcs_port_stats)					\
 M(MCS_CLEAR_STATS,	0xa011,	mcs_clear_stats, mcs_clear_stats, msg_rsp)	\
+M(MCS_INTR_CFG,		0xa012, mcs_intr_cfg, mcs_intr_cfg, msg_rsp)		\
 M(MCS_SET_LMAC_MODE,	0xa013, mcs_set_lmac_mode, mcs_set_lmac_mode, msg_rsp)	\
 M(MCS_SET_PN_THRESHOLD, 0xa014, mcs_set_pn_threshold, mcs_set_pn_threshold,	\
 				msg_rsp)					\
@@ -351,11 +352,15 @@ M(CGX_LINK_EVENT,	0xC00, cgx_link_event, cgx_link_info_msg, msg_rsp)
 #define MBOX_UP_CPT_MESSAGES						\
 M(CPT_INST_LMTST,	0xD00, cpt_inst_lmtst, cpt_inst_lmtst_req, msg_rsp)
 
+#define MBOX_UP_MCS_MESSAGES						\
+M(MCS_INTR_NOTIFY,	0xE00, mcs_intr_notify, mcs_intr_info, msg_rsp)
+
 enum {
 #define M(_name, _id, _1, _2, _3) MBOX_MSG_ ## _name = _id,
 MBOX_MESSAGES
 MBOX_UP_CGX_MESSAGES
 MBOX_UP_CPT_MESSAGES
+MBOX_UP_MCS_MESSAGES
 #undef M
 };
 
@@ -2084,4 +2089,37 @@ struct mcs_clear_stats {
 	u8 all;		/* All resources stats mapped to PF are cleared */
 };
 
+struct mcs_intr_cfg {
+	struct mbox_msghdr hdr;
+#define MCS_CPM_RX_SECTAG_V_EQ1_INT		BIT_ULL(0)
+#define MCS_CPM_RX_SECTAG_E_EQ0_C_EQ1_INT	BIT_ULL(1)
+#define MCS_CPM_RX_SECTAG_SL_GTE48_INT		BIT_ULL(2)
+#define MCS_CPM_RX_SECTAG_ES_EQ1_SC_EQ1_INT	BIT_ULL(3)
+#define MCS_CPM_RX_SECTAG_SC_EQ1_SCB_EQ1_INT	BIT_ULL(4)
+#define MCS_CPM_RX_PACKET_XPN_EQ0_INT		BIT_ULL(5)
+#define MCS_CPM_RX_PN_THRESH_REACHED_INT	BIT_ULL(6)
+#define MCS_CPM_TX_PACKET_XPN_EQ0_INT		BIT_ULL(7)
+#define MCS_CPM_TX_PN_THRESH_REACHED_INT	BIT_ULL(8)
+#define MCS_CPM_TX_SA_NOT_VALID_INT		BIT_ULL(9)
+#define MCS_BBE_RX_DFIFO_OVERFLOW_INT		BIT_ULL(10)
+#define MCS_BBE_RX_PLFIFO_OVERFLOW_INT		BIT_ULL(11)
+#define MCS_BBE_TX_DFIFO_OVERFLOW_INT		BIT_ULL(12)
+#define MCS_BBE_TX_PLFIFO_OVERFLOW_INT		BIT_ULL(13)
+#define MCS_PAB_RX_CHAN_OVERFLOW_INT		BIT_ULL(14)
+#define MCS_PAB_TX_CHAN_OVERFLOW_INT		BIT_ULL(15)
+	u64 intr_mask;		/* Interrupt enable mask */
+	u8 mcs_id;
+	u8 lmac_id;
+	u64 rsvd;
+};
+
+struct mcs_intr_info {
+	struct mbox_msghdr hdr;
+	u64 intr_mask;
+	int sa_id;
+	u8 mcs_id;
+	u8 lmac_id;
+	u64 rsvd;
+};
+
 #endif /* MBOX_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
index 002ccb0a1710..5ba618aed6ad 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.c
@@ -797,6 +797,289 @@ int mcs_alloc_all_rsrc(struct mcs *mcs, u8 *flow_id, u8 *secy_id,
 	return 0;
 }
 
+static void cn10kb_mcs_tx_pn_wrapped_handler(struct mcs *mcs)
+{
+	struct mcs_intr_event event = { 0 };
+	struct rsrc_bmap *sc_bmap;
+	u64 val;
+	int sc;
+
+	sc_bmap = &mcs->tx.sc;
+
+	event.mcs_id = mcs->mcs_id;
+	event.intr_mask = MCS_CPM_TX_PACKET_XPN_EQ0_INT;
+
+	for_each_set_bit(sc, sc_bmap->bmap, mcs->hw->sc_entries) {
+		val = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(sc));
+
+		if (mcs->tx_sa_active[sc])
+			/* SA_index1 was used and got expired */
+			event.sa_id = (val >> 9) & 0xFF;
+		else
+			/* SA_index0 was used and got expired */
+			event.sa_id = val & 0xFF;
+
+		event.pcifunc = mcs->tx.sa2pf_map[event.sa_id];
+		mcs_add_intr_wq_entry(mcs, &event);
+	}
+}
+
+static void cn10kb_mcs_tx_pn_thresh_reached_handler(struct mcs *mcs)
+{
+	struct mcs_intr_event event = { 0 };
+	struct rsrc_bmap *sc_bmap;
+	u64 val, status;
+	int sc;
+
+	sc_bmap = &mcs->tx.sc;
+
+	event.mcs_id = mcs->mcs_id;
+	event.intr_mask = MCS_CPM_TX_PN_THRESH_REACHED_INT;
+
+	/* TX SA interrupt is raised only if autorekey is enabled.
+	 * MCS_CPM_TX_SLAVE_SA_MAP_MEM_0X[sc].tx_sa_active bit gets toggled if
+	 * one of two SAs mapped to SC gets expired. If tx_sa_active=0 implies
+	 * SA in SA_index1 got expired else SA in SA_index0 got expired.
+	 */
+	for_each_set_bit(sc, sc_bmap->bmap, mcs->hw->sc_entries) {
+		val = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(sc));
+		/* Auto rekey is enable */
+		if (!((val >> 18) & 0x1))
+			continue;
+
+		status = (val >> 21) & 0x1;
+
+		/* Check if tx_sa_active status had changed */
+		if (status == mcs->tx_sa_active[sc])
+			continue;
+		/* SA_index0 is expired */
+		if (status)
+			event.sa_id = val & 0xFF;
+		else
+			event.sa_id = (val >> 9) & 0xFF;
+
+		event.pcifunc = mcs->tx.sa2pf_map[event.sa_id];
+		mcs_add_intr_wq_entry(mcs, &event);
+	}
+}
+
+static void mcs_rx_pn_thresh_reached_handler(struct mcs *mcs)
+{
+	struct mcs_intr_event event = { 0 };
+	int sa, reg;
+	u64 intr;
+
+	/* Check expired SAs */
+	for (reg = 0; reg < (mcs->hw->sa_entries / 64); reg++) {
+		/* Bit high in *PN_THRESH_REACHEDX implies
+		 * corresponding SAs are expired.
+		 */
+		intr = mcs_reg_read(mcs, MCSX_CPM_RX_SLAVE_PN_THRESH_REACHEDX(reg));
+		for (sa = 0; sa < 64; sa++) {
+			if (!(intr & BIT_ULL(sa)))
+				continue;
+
+			event.mcs_id = mcs->mcs_id;
+			event.intr_mask = MCS_CPM_RX_PN_THRESH_REACHED_INT;
+			event.sa_id = sa + (reg * 64);
+			event.pcifunc = mcs->rx.sa2pf_map[event.sa_id];
+			mcs_add_intr_wq_entry(mcs, &event);
+		}
+	}
+}
+
+static void mcs_rx_misc_intr_handler(struct mcs *mcs, u64 intr)
+{
+	struct mcs_intr_event event = { 0 };
+
+	event.mcs_id = mcs->mcs_id;
+	event.pcifunc = mcs->pf_map[0];
+
+	if (intr & MCS_CPM_RX_INT_SECTAG_V_EQ1)
+		event.intr_mask = MCS_CPM_RX_SECTAG_V_EQ1_INT;
+	if (intr & MCS_CPM_RX_INT_SECTAG_E_EQ0_C_EQ1)
+		event.intr_mask |= MCS_CPM_RX_SECTAG_E_EQ0_C_EQ1_INT;
+	if (intr & MCS_CPM_RX_INT_SL_GTE48)
+		event.intr_mask |= MCS_CPM_RX_SECTAG_SL_GTE48_INT;
+	if (intr & MCS_CPM_RX_INT_ES_EQ1_SC_EQ1)
+		event.intr_mask |= MCS_CPM_RX_SECTAG_ES_EQ1_SC_EQ1_INT;
+	if (intr & MCS_CPM_RX_INT_SC_EQ1_SCB_EQ1)
+		event.intr_mask |= MCS_CPM_RX_SECTAG_SC_EQ1_SCB_EQ1_INT;
+	if (intr & MCS_CPM_RX_INT_PACKET_XPN_EQ0)
+		event.intr_mask |= MCS_CPM_RX_PACKET_XPN_EQ0_INT;
+
+	mcs_add_intr_wq_entry(mcs, &event);
+}
+
+static void mcs_tx_misc_intr_handler(struct mcs *mcs, u64 intr)
+{
+	struct mcs_intr_event event = { 0 };
+
+	if (!(intr & MCS_CPM_TX_INT_SA_NOT_VALID))
+		return;
+
+	event.mcs_id = mcs->mcs_id;
+	event.pcifunc = mcs->pf_map[0];
+
+	event.intr_mask = MCS_CPM_TX_SA_NOT_VALID_INT;
+
+	mcs_add_intr_wq_entry(mcs, &event);
+}
+
+static void mcs_bbe_intr_handler(struct mcs *mcs, u64 intr, enum mcs_direction dir)
+{
+	struct mcs_intr_event event = { 0 };
+	int i;
+
+	if (!(intr & MCS_BBE_INT_MASK))
+		return;
+
+	event.mcs_id = mcs->mcs_id;
+	event.pcifunc = mcs->pf_map[0];
+
+	for (i = 0; i < MCS_MAX_BBE_INT; i++) {
+		if (!(intr & BIT_ULL(i)))
+			continue;
+
+		/* Lower nibble denotes data fifo overflow interrupts and
+		 * upper nibble indicates policy fifo overflow interrupts.
+		 */
+		if (intr & 0xFULL)
+			event.intr_mask = (dir == MCS_RX) ?
+					  MCS_BBE_RX_DFIFO_OVERFLOW_INT :
+					  MCS_BBE_TX_DFIFO_OVERFLOW_INT;
+		else
+			event.intr_mask = (dir == MCS_RX) ?
+					  MCS_BBE_RX_PLFIFO_OVERFLOW_INT :
+					  MCS_BBE_RX_PLFIFO_OVERFLOW_INT;
+
+		/* Notify the lmac_id info which ran into BBE fatal error */
+		event.lmac_id = i & 0x3ULL;
+		mcs_add_intr_wq_entry(mcs, &event);
+	}
+}
+
+static void mcs_pab_intr_handler(struct mcs *mcs, u64 intr, enum mcs_direction dir)
+{
+	struct mcs_intr_event event = { 0 };
+	int i;
+
+	if (!(intr & MCS_PAB_INT_MASK))
+		return;
+
+	event.mcs_id = mcs->mcs_id;
+	event.pcifunc = mcs->pf_map[0];
+
+	for (i = 0; i < MCS_MAX_PAB_INT; i++) {
+		if (!(intr & BIT_ULL(i)))
+			continue;
+
+		event.intr_mask = (dir == MCS_RX) ? MCS_PAB_RX_CHAN_OVERFLOW_INT :
+				  MCS_PAB_TX_CHAN_OVERFLOW_INT;
+
+		/* Notify the lmac_id info which ran into PAB fatal error */
+		event.lmac_id = i;
+		mcs_add_intr_wq_entry(mcs, &event);
+	}
+}
+
+static irqreturn_t mcs_ip_intr_handler(int irq, void *mcs_irq)
+{
+	struct mcs *mcs = (struct mcs *)mcs_irq;
+	u64 intr, cpm_intr, bbe_intr, pab_intr;
+
+	/* Disable and clear the interrupt */
+	mcs_reg_write(mcs, MCSX_IP_INT_ENA_W1C, BIT_ULL(0));
+	mcs_reg_write(mcs, MCSX_IP_INT, BIT_ULL(0));
+
+	/* Check which block has interrupt*/
+	intr = mcs_reg_read(mcs, MCSX_TOP_SLAVE_INT_SUM);
+
+	/* CPM RX */
+	if (intr & MCS_CPM_RX_INT_ENA) {
+		/* Check for PN thresh interrupt bit */
+		cpm_intr = mcs_reg_read(mcs, MCSX_CPM_RX_SLAVE_RX_INT);
+
+		if (cpm_intr & MCS_CPM_RX_INT_PN_THRESH_REACHED)
+			mcs_rx_pn_thresh_reached_handler(mcs);
+
+		if (cpm_intr & MCS_CPM_RX_INT_ALL)
+			mcs_rx_misc_intr_handler(mcs, cpm_intr);
+
+		/* Clear the interrupt */
+		mcs_reg_write(mcs, MCSX_CPM_RX_SLAVE_RX_INT, cpm_intr);
+	}
+
+	/* CPM TX */
+	if (intr & MCS_CPM_TX_INT_ENA) {
+		cpm_intr = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_TX_INT);
+
+		if (cpm_intr & MCS_CPM_TX_INT_PN_THRESH_REACHED) {
+			if (mcs->hw->mcs_blks > 1)
+				cnf10kb_mcs_tx_pn_thresh_reached_handler(mcs);
+			else
+				cn10kb_mcs_tx_pn_thresh_reached_handler(mcs);
+		}
+
+		if (cpm_intr & MCS_CPM_TX_INT_SA_NOT_VALID)
+			mcs_tx_misc_intr_handler(mcs, cpm_intr);
+
+		if (cpm_intr & MCS_CPM_TX_INT_PACKET_XPN_EQ0) {
+			if (mcs->hw->mcs_blks > 1)
+				cnf10kb_mcs_tx_pn_wrapped_handler(mcs);
+			else
+				cn10kb_mcs_tx_pn_wrapped_handler(mcs);
+		}
+		/* Clear the interrupt */
+		mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_TX_INT, cpm_intr);
+	}
+
+	/* BBE RX */
+	if (intr & MCS_BBE_RX_INT_ENA) {
+		bbe_intr = mcs_reg_read(mcs, MCSX_BBE_RX_SLAVE_BBE_INT);
+		mcs_bbe_intr_handler(mcs, bbe_intr, MCS_RX);
+
+		/* Clear the interrupt */
+		mcs_reg_write(mcs, MCSX_BBE_RX_SLAVE_BBE_INT_INTR_RW, 0);
+		mcs_reg_write(mcs, MCSX_BBE_RX_SLAVE_BBE_INT, bbe_intr);
+	}
+
+	/* BBE TX */
+	if (intr & MCS_BBE_TX_INT_ENA) {
+		bbe_intr = mcs_reg_read(mcs, MCSX_BBE_TX_SLAVE_BBE_INT);
+		mcs_bbe_intr_handler(mcs, bbe_intr, MCS_TX);
+
+		/* Clear the interrupt */
+		mcs_reg_write(mcs, MCSX_BBE_TX_SLAVE_BBE_INT_INTR_RW, 0);
+		mcs_reg_write(mcs, MCSX_BBE_TX_SLAVE_BBE_INT, bbe_intr);
+	}
+
+	/* PAB RX */
+	if (intr & MCS_PAB_RX_INT_ENA) {
+		pab_intr = mcs_reg_read(mcs, MCSX_PAB_RX_SLAVE_PAB_INT);
+		mcs_pab_intr_handler(mcs, pab_intr, MCS_RX);
+
+		/* Clear the interrupt */
+		mcs_reg_write(mcs, MCSX_PAB_RX_SLAVE_PAB_INT_INTR_RW, 0);
+		mcs_reg_write(mcs, MCSX_PAB_RX_SLAVE_PAB_INT, pab_intr);
+	}
+
+	/* PAB TX */
+	if (intr & MCS_PAB_TX_INT_ENA) {
+		pab_intr = mcs_reg_read(mcs, MCSX_PAB_TX_SLAVE_PAB_INT);
+		mcs_pab_intr_handler(mcs, pab_intr, MCS_TX);
+
+		/* Clear the interrupt */
+		mcs_reg_write(mcs, MCSX_PAB_TX_SLAVE_PAB_INT_INTR_RW, 0);
+		mcs_reg_write(mcs, MCSX_PAB_TX_SLAVE_PAB_INT, pab_intr);
+	}
+
+	/* Enable the interrupt */
+	mcs_reg_write(mcs, MCSX_IP_INT_ENA_W1S, BIT_ULL(0));
+
+	return IRQ_HANDLED;
+}
+
 static void *alloc_mem(struct mcs *mcs, int n)
 {
 	return devm_kcalloc(mcs->dev, n, sizeof(u16), GFP_KERNEL);
@@ -859,6 +1142,56 @@ static int mcs_alloc_struct_mem(struct mcs *mcs, struct mcs_rsrc_map *res)
 	return 0;
 }
 
+static int mcs_register_interrupts(struct mcs *mcs)
+{
+	int ret = 0;
+
+	mcs->num_vec = pci_msix_vec_count(mcs->pdev);
+
+	ret = pci_alloc_irq_vectors(mcs->pdev, mcs->num_vec,
+				    mcs->num_vec, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dev_err(mcs->dev, "MCS Request for %d msix vector failed err:%d\n",
+			mcs->num_vec, ret);
+		return ret;
+	}
+
+	ret = request_irq(pci_irq_vector(mcs->pdev, MCS_INT_VEC_IP),
+			  mcs_ip_intr_handler, 0, "MCS_IP", mcs);
+	if (ret) {
+		dev_err(mcs->dev, "MCS IP irq registration failed\n");
+		goto exit;
+	}
+
+	/* MCS enable IP interrupts */
+	mcs_reg_write(mcs, MCSX_IP_INT_ENA_W1S, BIT_ULL(0));
+
+	/* Enable CPM Rx/Tx interrupts */
+	mcs_reg_write(mcs, MCSX_TOP_SLAVE_INT_SUM_ENB,
+		      MCS_CPM_RX_INT_ENA | MCS_CPM_TX_INT_ENA |
+		      MCS_BBE_RX_INT_ENA | MCS_BBE_TX_INT_ENA |
+		      MCS_PAB_RX_INT_ENA | MCS_PAB_TX_INT_ENA);
+
+	mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_TX_INT_ENB, 0x7ULL);
+	mcs_reg_write(mcs, MCSX_CPM_RX_SLAVE_RX_INT_ENB, 0x7FULL);
+
+	mcs_reg_write(mcs, MCSX_BBE_RX_SLAVE_BBE_INT_ENB, 0xff);
+	mcs_reg_write(mcs, MCSX_BBE_TX_SLAVE_BBE_INT_ENB, 0xff);
+
+	mcs_reg_write(mcs, MCSX_PAB_RX_SLAVE_PAB_INT_ENB, 0xff);
+	mcs_reg_write(mcs, MCSX_PAB_TX_SLAVE_PAB_INT_ENB, 0xff);
+
+	mcs->tx_sa_active = alloc_mem(mcs, mcs->hw->sc_entries);
+	if (!mcs->tx_sa_active)
+		goto exit;
+
+	return ret;
+exit:
+	pci_free_irq_vectors(mcs->pdev);
+	mcs->num_vec = 0;
+	return ret;
+}
+
 int mcs_get_blkcnt(void)
 {
 	struct mcs *mcs;
@@ -1229,6 +1562,10 @@ static int mcs_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	/* Parser configuration */
 	mcs->mcs_ops->mcs_parser_cfg(mcs);
 
+	err = mcs_register_interrupts(mcs);
+	if (err)
+		goto exit;
+
 	list_add(&mcs->mcs_list, &mcs_list);
 	mutex_init(&mcs->stats_lock);
 
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
index 28600ef3ad34..64dc2b80e15d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs.h
@@ -16,6 +16,7 @@
 #define MCSX_LINK_LMAC_BASE_MASK	GENMASK_ULL(11, 0)
 
 #define MCS_ID_MASK			0x7
+#define MCS_MAX_PFS                     128
 
 #define MCS_PORT_MODE_MASK		0x3
 #define MCS_PORT_FIFO_SKID_MASK		0x3F
@@ -42,6 +43,69 @@
 /* Reserved resources for default bypass entry */
 #define MCS_RSRC_RSVD_CNT		1
 
+/* MCS Interrupt Vector Enumeration */
+enum mcs_int_vec_e {
+	MCS_INT_VEC_MIL_RX_GBL		= 0x0,
+	MCS_INT_VEC_MIL_RX_LMACX	= 0x1,
+	MCS_INT_VEC_MIL_TX_LMACX	= 0x5,
+	MCS_INT_VEC_HIL_RX_GBL		= 0x9,
+	MCS_INT_VEC_HIL_RX_LMACX	= 0xa,
+	MCS_INT_VEC_HIL_TX_GBL		= 0xe,
+	MCS_INT_VEC_HIL_TX_LMACX	= 0xf,
+	MCS_INT_VEC_IP			= 0x13,
+	MCS_INT_VEC_CNT			= 0x14,
+};
+
+#define MCS_MAX_BBE_INT			8ULL
+#define MCS_BBE_INT_MASK		0xFFULL
+
+#define MCS_MAX_PAB_INT			4ULL
+#define MCS_PAB_INT_MASK		0xFULL
+
+#define MCS_BBE_RX_INT_ENA		BIT_ULL(0)
+#define MCS_BBE_TX_INT_ENA		BIT_ULL(1)
+#define MCS_CPM_RX_INT_ENA		BIT_ULL(2)
+#define MCS_CPM_TX_INT_ENA		BIT_ULL(3)
+#define MCS_PAB_RX_INT_ENA		BIT_ULL(4)
+#define MCS_PAB_TX_INT_ENA		BIT_ULL(5)
+
+#define MCS_CPM_TX_INT_PACKET_XPN_EQ0		BIT_ULL(0)
+#define MCS_CPM_TX_INT_PN_THRESH_REACHED	BIT_ULL(1)
+#define MCS_CPM_TX_INT_SA_NOT_VALID		BIT_ULL(2)
+
+#define MCS_CPM_RX_INT_SECTAG_V_EQ1		BIT_ULL(0)
+#define MCS_CPM_RX_INT_SECTAG_E_EQ0_C_EQ1	BIT_ULL(1)
+#define MCS_CPM_RX_INT_SL_GTE48			BIT_ULL(2)
+#define MCS_CPM_RX_INT_ES_EQ1_SC_EQ1		BIT_ULL(3)
+#define MCS_CPM_RX_INT_SC_EQ1_SCB_EQ1		BIT_ULL(4)
+#define MCS_CPM_RX_INT_PACKET_XPN_EQ0		BIT_ULL(5)
+#define MCS_CPM_RX_INT_PN_THRESH_REACHED	BIT_ULL(6)
+
+#define MCS_CPM_RX_INT_ALL	(MCS_CPM_RX_INT_SECTAG_V_EQ1 |		\
+				 MCS_CPM_RX_INT_SECTAG_E_EQ0_C_EQ1 |    \
+				 MCS_CPM_RX_INT_SL_GTE48 |		\
+				 MCS_CPM_RX_INT_ES_EQ1_SC_EQ1 |		\
+				 MCS_CPM_RX_INT_SC_EQ1_SCB_EQ1 |	\
+				 MCS_CPM_RX_INT_PACKET_XPN_EQ0 |	\
+				 MCS_CPM_RX_INT_PN_THRESH_REACHED)
+
+struct mcs_pfvf {
+	u64 intr_mask;	/* Enabled Interrupt mask */
+};
+
+struct mcs_intr_event {
+	u16 pcifunc;
+	u64 intr_mask;
+	u64 sa_id;
+	u8 mcs_id;
+	u8 lmac_id;
+};
+
+struct mcs_intrq_entry {
+	struct list_head node;
+	struct mcs_intr_event intr_event;
+};
+
 struct secy_mem_map {
 	u8 flow_id;
 	u8 secy;
@@ -82,11 +146,17 @@ struct mcs {
 	struct hwinfo		*hw;
 	struct mcs_rsrc_map	tx;
 	struct mcs_rsrc_map	rx;
+	u16                     pf_map[MCS_MAX_PFS]; /* List of PCIFUNC mapped to MCS */
 	u8			mcs_id;
 	struct mcs_ops		*mcs_ops;
 	struct list_head	mcs_list;
 	/* Lock for mcs stats */
 	struct mutex		stats_lock;
+	struct mcs_pfvf		*pf;
+	struct mcs_pfvf		*vf;
+	u16			num_vec;
+	void			*rvu;
+	u16			*tx_sa_active;
 };
 
 struct mcs_ops {
@@ -157,6 +227,8 @@ void cnf10kb_mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *m
 void cnf10kb_mcs_flowid_secy_map(struct mcs *mcs, struct secy_mem_map *map, int dir);
 void cnf10kb_mcs_rx_sa_mem_map_write(struct mcs *mcs, struct mcs_rx_sc_sa_map *map);
 void cnf10kb_mcs_parser_cfg(struct mcs *mcs);
+void cnf10kb_mcs_tx_pn_thresh_reached_handler(struct mcs *mcs);
+void cnf10kb_mcs_tx_pn_wrapped_handler(struct mcs *mcs);
 
 /* Stats APIs */
 void mcs_get_sc_stats(struct mcs *mcs, struct mcs_sc_stats *stats, int id, int dir);
@@ -169,4 +241,6 @@ void mcs_clear_stats(struct mcs *mcs, u8 type, u8 id, int dir);
 int mcs_clear_all_stats(struct mcs *mcs, u16 pcifunc, int dir);
 int mcs_set_force_clk_en(struct mcs *mcs, bool set);
 
+int mcs_add_intr_wq_entry(struct mcs *mcs, struct mcs_intr_event *event);
+
 #endif /* MCS_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
index 5ed5deb78fc4..7b6205414428 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_cnf10kb.c
@@ -93,18 +93,18 @@ void cnf10kb_mcs_tx_sa_mem_map_write(struct mcs *mcs, struct mcs_tx_sc_sa_map *m
 	reg = MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(map->sc_id);
 	mcs_reg_write(mcs, reg, val);
 
-	if (map->rekey_ena) {
-		reg = MCSX_CPM_TX_SLAVE_AUTO_REKEY_ENABLE_0;
-		val = mcs_reg_read(mcs, reg);
+	reg = MCSX_CPM_TX_SLAVE_AUTO_REKEY_ENABLE_0;
+	val = mcs_reg_read(mcs, reg);
+
+	if (map->rekey_ena)
 		val |= BIT_ULL(map->sc_id);
-		mcs_reg_write(mcs, reg, val);
-	}
+	else
+		val &= ~BIT_ULL(map->sc_id);
 
-	if (map->sa_index0_vld)
-		mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_SA_INDEX0_VLDX(map->sc_id), BIT_ULL(0));
+	mcs_reg_write(mcs, reg, val);
 
-	if (map->sa_index1_vld)
-		mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_SA_INDEX1_VLDX(map->sc_id), BIT_ULL(0));
+	mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_SA_INDEX0_VLDX(map->sc_id), map->sa_index0_vld);
+	mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_SA_INDEX1_VLDX(map->sc_id), map->sa_index1_vld);
 
 	mcs_reg_write(mcs, MCSX_CPM_TX_SLAVE_TX_SA_ACTIVEX(map->sc_id), map->tx_sa_active);
 }
@@ -144,3 +144,71 @@ int mcs_set_force_clk_en(struct mcs *mcs, bool set)
 
 	return 0;
 }
+
+/* TX SA interrupt is raised only if autorekey is enabled.
+ * MCS_CPM_TX_SLAVE_SA_MAP_MEM_0X[sc].tx_sa_active bit gets toggled if
+ * one of two SAs mapped to SC gets expired. If tx_sa_active=0 implies
+ * SA in SA_index1 got expired else SA in SA_index0 got expired.
+ */
+void cnf10kb_mcs_tx_pn_thresh_reached_handler(struct mcs *mcs)
+{
+	struct mcs_intr_event event;
+	struct rsrc_bmap *sc_bmap;
+	unsigned long rekey_ena;
+	u64 val, sa_status;
+	int sc;
+
+	sc_bmap = &mcs->tx.sc;
+
+	event.mcs_id = mcs->mcs_id;
+	event.intr_mask = MCS_CPM_TX_PN_THRESH_REACHED_INT;
+
+	rekey_ena = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_AUTO_REKEY_ENABLE_0);
+
+	for_each_set_bit(sc, sc_bmap->bmap, mcs->hw->sc_entries) {
+		/* Auto rekey is enable */
+		if (!test_bit(sc, &rekey_ena))
+			continue;
+		sa_status = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_TX_SA_ACTIVEX(sc));
+		/* Check if tx_sa_active status had changed */
+		if (sa_status == mcs->tx_sa_active[sc])
+			continue;
+
+		/* SA_index0 is expired */
+		val = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(sc));
+		if (sa_status)
+			event.sa_id = val & 0x7F;
+		else
+			event.sa_id = (val >> 7) & 0x7F;
+
+		event.pcifunc = mcs->tx.sa2pf_map[event.sa_id];
+		mcs_add_intr_wq_entry(mcs, &event);
+	}
+}
+
+void cnf10kb_mcs_tx_pn_wrapped_handler(struct mcs *mcs)
+{
+	struct mcs_intr_event event = { 0 };
+	struct rsrc_bmap *sc_bmap;
+	u64 val;
+	int sc;
+
+	sc_bmap = &mcs->tx.sc;
+
+	event.mcs_id = mcs->mcs_id;
+	event.intr_mask = MCS_CPM_TX_PACKET_XPN_EQ0_INT;
+
+	for_each_set_bit(sc, sc_bmap->bmap, mcs->hw->sc_entries) {
+		val = mcs_reg_read(mcs, MCSX_CPM_TX_SLAVE_SA_MAP_MEM_0X(sc));
+
+		if (mcs->tx_sa_active[sc])
+			/* SA_index1 was used and got expired */
+			event.sa_id = (val >> 7) & 0x7F;
+		else
+			/* SA_index0 was used and got expired */
+			event.sa_id = val & 0x7F;
+
+		event.pcifunc = mcs->tx.sa2pf_map[event.sa_id];
+		mcs_add_intr_wq_entry(mcs, &event);
+	}
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
index 12be9f997316..c95a8b8f5eaf 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_reg.h
@@ -276,6 +276,102 @@
 #define MCSX_BBE_RX_SLAVE_CAL_LEN			0x188ull
 #define MCSX_PAB_RX_SLAVE_FIFO_SKID_CFGX(a)		(0x290ull + (a) * 0x40ull)
 
+#define MCSX_BBE_RX_SLAVE_BBE_INT ({	\
+	u64 offset;			\
+					\
+	offset = 0xe00ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x160ull;	\
+	offset; })
+
+#define MCSX_BBE_RX_SLAVE_BBE_INT_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0xe08ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x168ull;	\
+	offset; })
+
+#define MCSX_BBE_RX_SLAVE_BBE_INT_INTR_RW ({	\
+	u64 offset;			\
+					\
+	offset = 0xe08ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x178ull;	\
+	offset; })
+
+#define MCSX_BBE_TX_SLAVE_BBE_INT ({	\
+	u64 offset;			\
+					\
+	offset = 0x1278ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x1e0ull;	\
+	offset; })
+
+#define MCSX_BBE_TX_SLAVE_BBE_INT_INTR_RW ({	\
+	u64 offset;			\
+					\
+	offset = 0x1278ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x1f8ull;	\
+	offset; })
+
+#define MCSX_BBE_TX_SLAVE_BBE_INT_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0x1280ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x1e8ull;	\
+	offset; })
+
+#define MCSX_PAB_RX_SLAVE_PAB_INT ({	\
+	u64 offset;			\
+					\
+	offset = 0x16f0ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x260ull;	\
+	offset; })
+
+#define MCSX_PAB_RX_SLAVE_PAB_INT_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0x16f8ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x268ull;	\
+	offset; })
+
+#define MCSX_PAB_RX_SLAVE_PAB_INT_INTR_RW ({	\
+	u64 offset;			\
+					\
+	offset = 0x16f8ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x278ull;	\
+	offset; })
+
+#define MCSX_PAB_TX_SLAVE_PAB_INT ({	\
+	u64 offset;			\
+					\
+	offset = 0x2908ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x380ull;	\
+	offset; })
+
+#define MCSX_PAB_TX_SLAVE_PAB_INT_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0x2910ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x388ull;	\
+	offset; })
+
+#define MCSX_PAB_TX_SLAVE_PAB_INT_INTR_RW ({	\
+	u64 offset;			\
+					\
+	offset = 0x16f8ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x398ull;	\
+	offset; })
+
 /* CPM registers */
 #define MCSX_CPM_RX_SLAVE_FLOWID_TCAM_DATAX(a, b) ({	\
 	u64 offset;					\
@@ -931,4 +1027,76 @@
 #define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSAENCRYPTEDX(a)	(0x21c40ull + (a) * 0x8ull)
 #define MCSX_CSE_TX_MEM_SLAVE_OUTPKTSSAPROTECTEDX(a)	(0x20c40ull + (a) * 0x8ull)
 
+#define MCSX_IP_INT ({			\
+	u64 offset;			\
+					\
+	offset = 0x80028ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x60028ull;	\
+	offset; })
+
+#define MCSX_IP_INT_ENA_W1S ({		\
+	u64 offset;			\
+					\
+	offset = 0x80040ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x60040ull;	\
+	offset; })
+
+#define MCSX_IP_INT_ENA_W1C ({		\
+	u64 offset;			\
+					\
+	offset = 0x80038ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x60038ull;	\
+	offset; })
+
+#define MCSX_TOP_SLAVE_INT_SUM ({	\
+	u64 offset;			\
+					\
+	offset = 0xc20ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0xab8ull;	\
+	offset; })
+
+#define MCSX_TOP_SLAVE_INT_SUM_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0xc28ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0xac0ull;	\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_RX_INT ({	\
+	u64 offset;			\
+					\
+	offset = 0x23c00ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x0ad8ull;	\
+	offset; })
+
+#define MCSX_CPM_RX_SLAVE_RX_INT_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0x23c08ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0xae0ull;	\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_TX_INT ({	\
+	u64 offset;			\
+					\
+	offset = 0x3d490ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x54a0ull;	\
+	offset; })
+
+#define MCSX_CPM_TX_SLAVE_TX_INT_ENB ({	\
+	u64 offset;			\
+					\
+	offset = 0x3d498ull;		\
+	if (mcs->hw->mcs_blks > 1)	\
+		offset = 0x54a8ull;	\
+	offset; })
+
 #endif
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
index 939c9b65f078..fa8029a94068 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/mcs_rvu_if.c
@@ -13,6 +13,25 @@
 #include "rvu.h"
 #include "lmac_common.h"
 
+#define M(_name, _id, _fn_name, _req_type, _rsp_type)			\
+static struct _req_type __maybe_unused					\
+*otx2_mbox_alloc_msg_ ## _fn_name(struct rvu *rvu, int devid)		\
+{									\
+	struct _req_type *req;						\
+									\
+	req = (struct _req_type *)otx2_mbox_alloc_msg_rsp(		\
+		&rvu->afpf_wq_info.mbox_up, devid, sizeof(struct _req_type), \
+		sizeof(struct _rsp_type));				\
+	if (!req)							\
+		return NULL;						\
+	req->hdr.sig = OTX2_MBOX_REQ_SIG;				\
+	req->hdr.id = _id;						\
+	return req;							\
+}
+
+MBOX_UP_MCS_MESSAGES
+#undef M
+
 int rvu_mbox_handler_mcs_set_lmac_mode(struct rvu *rvu,
 				       struct mcs_set_lmac_mode *req,
 				       struct msg_rsp *rsp)
@@ -30,6 +49,114 @@ int rvu_mbox_handler_mcs_set_lmac_mode(struct rvu *rvu,
 	return 0;
 }
 
+int mcs_add_intr_wq_entry(struct mcs *mcs, struct mcs_intr_event *event)
+{
+	struct mcs_intrq_entry *qentry;
+	u16 pcifunc = event->pcifunc;
+	struct rvu *rvu = mcs->rvu;
+	struct mcs_pfvf *pfvf;
+
+	/* Check if it is PF or VF */
+	if (pcifunc & RVU_PFVF_FUNC_MASK)
+		pfvf = &mcs->vf[rvu_get_hwvf(rvu, pcifunc)];
+	else
+		pfvf = &mcs->pf[rvu_get_pf(pcifunc)];
+
+	event->intr_mask &= pfvf->intr_mask;
+
+	/* Check PF/VF interrupt notification is enabled */
+	if (!(pfvf->intr_mask && event->intr_mask))
+		return 0;
+
+	qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC);
+	if (!qentry)
+		return -ENOMEM;
+
+	qentry->intr_event = *event;
+	spin_lock(&rvu->mcs_intrq_lock);
+	list_add_tail(&qentry->node, &rvu->mcs_intrq_head);
+	spin_unlock(&rvu->mcs_intrq_lock);
+	queue_work(rvu->mcs_intr_wq, &rvu->mcs_intr_work);
+
+	return 0;
+}
+
+static int mcs_notify_pfvf(struct mcs_intr_event *event, struct rvu *rvu)
+{
+	struct mcs_intr_info *req;
+	int err, pf;
+
+	pf = rvu_get_pf(event->pcifunc);
+
+	req = otx2_mbox_alloc_msg_mcs_intr_notify(rvu, pf);
+	if (!req)
+		return -ENOMEM;
+
+	req->mcs_id = event->mcs_id;
+	req->intr_mask = event->intr_mask;
+	req->sa_id = event->sa_id;
+	req->hdr.pcifunc = event->pcifunc;
+	req->lmac_id = event->lmac_id;
+
+	otx2_mbox_msg_send(&rvu->afpf_wq_info.mbox_up, pf);
+	err = otx2_mbox_wait_for_rsp(&rvu->afpf_wq_info.mbox_up, pf);
+	if (err)
+		dev_warn(rvu->dev, "MCS notification to pf %d failed\n", pf);
+
+	return 0;
+}
+
+static void mcs_intr_handler_task(struct work_struct *work)
+{
+	struct rvu *rvu = container_of(work, struct rvu, mcs_intr_work);
+	struct mcs_intrq_entry *qentry;
+	struct mcs_intr_event *event;
+	unsigned long flags;
+
+	do {
+		spin_lock_irqsave(&rvu->mcs_intrq_lock, flags);
+		qentry = list_first_entry_or_null(&rvu->mcs_intrq_head,
+						  struct mcs_intrq_entry,
+						  node);
+		if (qentry)
+			list_del(&qentry->node);
+
+		spin_unlock_irqrestore(&rvu->mcs_intrq_lock, flags);
+		if (!qentry)
+			break; /* nothing more to process */
+
+		event = &qentry->intr_event;
+
+		mcs_notify_pfvf(event, rvu);
+		kfree(qentry);
+	} while (1);
+}
+
+int rvu_mbox_handler_mcs_intr_cfg(struct rvu *rvu,
+				  struct mcs_intr_cfg *req,
+				  struct msg_rsp *rsp)
+{
+	u16 pcifunc = req->hdr.pcifunc;
+	struct mcs_pfvf *pfvf;
+	struct mcs *mcs;
+
+	if (req->mcs_id >= rvu->mcs_blk_cnt)
+		return MCS_AF_ERR_INVALID_MCSID;
+
+	mcs = mcs_get_pdata(req->mcs_id);
+
+	/* Check if it is PF or VF */
+	if (pcifunc & RVU_PFVF_FUNC_MASK)
+		pfvf = &mcs->vf[rvu_get_hwvf(rvu, pcifunc)];
+	else
+		pfvf = &mcs->pf[rvu_get_pf(pcifunc)];
+
+	mcs->pf_map[0] = pcifunc;
+	pfvf->intr_mask = req->intr_mask;
+
+	return 0;
+}
+
 int rvu_mbox_handler_mcs_get_hw_info(struct rvu *rvu,
 				     struct msg_req *req,
 				     struct mcs_hw_info *rsp)
@@ -376,6 +503,7 @@ int rvu_mbox_handler_mcs_tx_sc_sa_map_write(struct rvu *rvu,
 
 	mcs = mcs_get_pdata(req->mcs_id);
 	mcs->mcs_ops->mcs_tx_sa_mem_map_write(mcs, req);
+	mcs->tx_sa_active[req->sc_id] = req->tx_sa_active;
 
 	return 0;
 }
@@ -723,7 +851,39 @@ int rvu_mcs_init(struct rvu *rvu)
 		mcs_install_flowid_bypass_entry(mcs);
 		for (lmac = 0; lmac < mcs->hw->lmac_cnt; lmac++)
 			mcs_set_lmac_mode(mcs, lmac, 0);
+
+		mcs->rvu = rvu;
+
+		/* Allocated memory for PFVF data */
+		mcs->pf = devm_kcalloc(mcs->dev, hw->total_pfs,
+				       sizeof(struct mcs_pfvf), GFP_KERNEL);
+		if (!mcs->pf)
+			return -ENOMEM;
+
+		mcs->vf = devm_kcalloc(mcs->dev, hw->total_vfs,
+				       sizeof(struct mcs_pfvf), GFP_KERNEL);
+		if (!mcs->vf)
+			return -ENOMEM;
+	}
+
+	/* Initialize the wq for handling mcs interrupts */
+	INIT_LIST_HEAD(&rvu->mcs_intrq_head);
+	INIT_WORK(&rvu->mcs_intr_work, mcs_intr_handler_task);
+	rvu->mcs_intr_wq = alloc_workqueue("mcs_intr_wq", 0, 0);
+	if (!rvu->mcs_intr_wq) {
+		dev_err(rvu->dev, "mcs alloc workqueue failed\n");
+		return -ENOMEM;
 	}
 
 	return err;
 }
+
+void rvu_mcs_exit(struct rvu *rvu)
+{
+	if (!rvu->mcs_intr_wq)
+		return;
+
+	flush_workqueue(rvu->mcs_intr_wq);
+	destroy_workqueue(rvu->mcs_intr_wq);
+	rvu->mcs_intr_wq = NULL;
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
index 5d7464101dae..3f5e09b77d4b 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c
@@ -24,8 +24,6 @@
 #define DRV_NAME	"rvu_af"
 #define DRV_STRING      "Marvell OcteonTX2 RVU Admin Function Driver"
 
-static int rvu_get_hwvf(struct rvu *rvu, int pcifunc);
-
 static void rvu_set_msix_offset(struct rvu *rvu, struct rvu_pfvf *pfvf,
 				struct rvu_block *block, int lf);
 static void rvu_clear_msix_offset(struct rvu *rvu, struct rvu_pfvf *pfvf,
@@ -419,7 +417,7 @@ void rvu_get_pf_numvfs(struct rvu *rvu, int pf, int *numvfs, int *hwvf)
 		*hwvf = cfg & 0xFFF;
 }
 
-static int rvu_get_hwvf(struct rvu *rvu, int pcifunc)
+int rvu_get_hwvf(struct rvu *rvu, int pcifunc)
 {
 	int pf, func;
 	u64 cfg;
@@ -3300,6 +3298,7 @@ err_mbox:
 err_hwsetup:
 	rvu_cgx_exit(rvu);
 	rvu_fwdata_exit(rvu);
+	rvu_mcs_exit(rvu);
 	rvu_reset_all_blocks(rvu);
 	rvu_free_hw_resources(rvu);
 	rvu_clear_rvum_blk_revid(rvu);
@@ -3326,6 +3325,7 @@ static void rvu_remove(struct pci_dev *pdev)
 	rvu_flr_wq_destroy(rvu);
 	rvu_cgx_exit(rvu);
 	rvu_fwdata_exit(rvu);
+	rvu_mcs_exit(rvu);
 	rvu_mbox_destroy(&rvu->afpf_wq_info);
 	rvu_disable_sriov(rvu);
 	rvu_reset_all_blocks(rvu);
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index 4aefe47134d0..d0268c45a94d 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -508,6 +508,12 @@ struct rvu {
 
 	/* RVU switch implementation over NPC with DMAC rules */
 	struct rvu_switch	rswitch;
+
+	struct			work_struct mcs_intr_work;
+	struct			workqueue_struct *mcs_intr_wq;
+	struct list_head	mcs_intrq_head;
+	/* mcs interrupt queue lock */
+	spinlock_t		mcs_intrq_lock;
 };
 
 static inline void rvu_write64(struct rvu *rvu, u64 block, u64 offset, u64 val)
@@ -872,9 +878,11 @@ void rvu_switch_update_rules(struct rvu *rvu, u16 pcifunc);
 int rvu_npc_set_parse_mode(struct rvu *rvu, u16 pcifunc, u64 mode, u8 dir,
 			   u64 pkind, u8 var_len_off, u8 var_len_off_mask,
 			   u8 shift_dir);
+int rvu_get_hwvf(struct rvu *rvu, int pcifunc);
 
 /* CN10K MCS */
 int rvu_mcs_init(struct rvu *rvu);
 int rvu_mcs_flr_handler(struct rvu *rvu, u16 pcifunc);
+void rvu_mcs_exit(struct rvu *rvu);
 
 #endif /* RVU_H */
-- 
cgit v1.2.3


From d06c2aba51631bf6cd32a2f8f1edd67c110ade8a Mon Sep 17 00:00:00 2001
From: Geetha sowjanya <gakula@marvell.com>
Date: Sat, 1 Oct 2022 10:29:48 +0530
Subject: octeontx2-af: cn10k: mcs: Add debugfs support

This patch adds debugfs entry to dump MCS secy, sc,
sa, flowid and port stats. This helps in debugging
the packet path and to figure out where exactly packet
was dropped.

Signed-off-by: Geetha sowjanya <gakula@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/af/rvu.h    |   4 +
 .../ethernet/marvell/octeontx2/af/rvu_debugfs.c    | 346 +++++++++++++++++++++
 2 files changed, 350 insertions(+)

diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
index d0268c45a94d..76474385a602 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@ -64,6 +64,10 @@ struct rvu_debugfs {
 	struct dentry *nix;
 	struct dentry *npc;
 	struct dentry *cpt;
+	struct dentry *mcs_root;
+	struct dentry *mcs;
+	struct dentry *mcs_rx;
+	struct dentry *mcs_tx;
 	struct dump_ctx npa_aura_ctx;
 	struct dump_ctx npa_pool_ctx;
 	struct dump_ctx nix_cq_ctx;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index f42a09f04b25..a1970ebedf95 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -19,6 +19,7 @@
 #include "lmac_common.h"
 #include "npc.h"
 #include "rvu_npc_hash.h"
+#include "mcs.h"
 
 #define DEBUGFS_DIR_NAME "octeontx2"
 
@@ -227,6 +228,350 @@ static const struct file_operations rvu_dbg_##name##_fops = { \
 
 static void print_nix_qsize(struct seq_file *filp, struct rvu_pfvf *pfvf);
 
+static int rvu_dbg_mcs_port_stats_display(struct seq_file *filp, void *unused, int dir)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_port_stats stats;
+	int lmac;
+
+	seq_puts(filp, "\n port stats\n");
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(lmac, &mcs->hw->lmac_bmap, mcs->hw->lmac_cnt) {
+		mcs_get_port_stats(mcs, &stats, lmac, dir);
+		seq_printf(filp, "port%d: Tcam Miss: %lld\n", lmac, stats.tcam_miss_cnt);
+		seq_printf(filp, "port%d: Parser errors: %lld\n", lmac, stats.parser_err_cnt);
+
+		if (dir == MCS_RX && mcs->hw->mcs_blks > 1)
+			seq_printf(filp, "port%d: Preempt error: %lld\n", lmac,
+				   stats.preempt_err_cnt);
+		if (dir == MCS_TX)
+			seq_printf(filp, "port%d: Sectag insert error: %lld\n", lmac,
+				   stats.sectag_insert_err_cnt);
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+static int rvu_dbg_mcs_rx_port_stats_display(struct seq_file *filp, void *unused)
+{
+	return rvu_dbg_mcs_port_stats_display(filp, unused, MCS_RX);
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_rx_port_stats, mcs_rx_port_stats_display, NULL);
+
+static int rvu_dbg_mcs_tx_port_stats_display(struct seq_file *filp, void *unused)
+{
+	return rvu_dbg_mcs_port_stats_display(filp, unused, MCS_TX);
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_tx_port_stats, mcs_tx_port_stats_display, NULL);
+
+static int rvu_dbg_mcs_sa_stats_display(struct seq_file *filp, void *unused, int dir)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_sa_stats stats;
+	struct rsrc_bmap *map;
+	int sa_id;
+
+	if (dir == MCS_TX) {
+		map = &mcs->tx.sa;
+		mutex_lock(&mcs->stats_lock);
+		for_each_set_bit(sa_id, map->bmap, mcs->hw->sa_entries) {
+			seq_puts(filp, "\n TX SA stats\n");
+			mcs_get_sa_stats(mcs, &stats, sa_id, MCS_TX);
+			seq_printf(filp, "sa%d: Pkts encrypted: %lld\n", sa_id,
+				   stats.pkt_encrypt_cnt);
+
+			seq_printf(filp, "sa%d: Pkts protected: %lld\n", sa_id,
+				   stats.pkt_protected_cnt);
+		}
+		mutex_unlock(&mcs->stats_lock);
+		return 0;
+	}
+
+	/* RX stats */
+	map = &mcs->rx.sa;
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(sa_id, map->bmap, mcs->hw->sa_entries) {
+		seq_puts(filp, "\n RX SA stats\n");
+		mcs_get_sa_stats(mcs, &stats, sa_id, MCS_RX);
+		seq_printf(filp, "sa%d: Invalid pkts: %lld\n", sa_id, stats.pkt_invalid_cnt);
+		seq_printf(filp, "sa%d: Pkts no sa error: %lld\n", sa_id, stats.pkt_nosaerror_cnt);
+		seq_printf(filp, "sa%d: Pkts not valid: %lld\n", sa_id, stats.pkt_notvalid_cnt);
+		seq_printf(filp, "sa%d: Pkts ok: %lld\n", sa_id, stats.pkt_ok_cnt);
+		seq_printf(filp, "sa%d: Pkts no sa: %lld\n", sa_id, stats.pkt_nosa_cnt);
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+static int rvu_dbg_mcs_rx_sa_stats_display(struct seq_file *filp, void *unused)
+{
+	return rvu_dbg_mcs_sa_stats_display(filp, unused, MCS_RX);
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_rx_sa_stats, mcs_rx_sa_stats_display, NULL);
+
+static int rvu_dbg_mcs_tx_sa_stats_display(struct seq_file *filp, void *unused)
+{
+	return rvu_dbg_mcs_sa_stats_display(filp, unused, MCS_TX);
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_tx_sa_stats, mcs_tx_sa_stats_display, NULL);
+
+static int rvu_dbg_mcs_tx_sc_stats_display(struct seq_file *filp, void *unused)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_sc_stats stats;
+	struct rsrc_bmap *map;
+	int sc_id;
+
+	map = &mcs->tx.sc;
+	seq_puts(filp, "\n SC stats\n");
+
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(sc_id, map->bmap, mcs->hw->sc_entries) {
+		mcs_get_sc_stats(mcs, &stats, sc_id, MCS_TX);
+		seq_printf(filp, "\n=======sc%d======\n\n", sc_id);
+		seq_printf(filp, "sc%d: Pkts encrypted: %lld\n", sc_id, stats.pkt_encrypt_cnt);
+		seq_printf(filp, "sc%d: Pkts protected: %lld\n", sc_id, stats.pkt_protected_cnt);
+
+		if (mcs->hw->mcs_blks == 1) {
+			seq_printf(filp, "sc%d: Octets encrypted: %lld\n", sc_id,
+				   stats.octet_encrypt_cnt);
+			seq_printf(filp, "sc%d: Octets protected: %lld\n", sc_id,
+				   stats.octet_protected_cnt);
+		}
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_tx_sc_stats, mcs_tx_sc_stats_display, NULL);
+
+static int rvu_dbg_mcs_rx_sc_stats_display(struct seq_file *filp, void *unused)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_sc_stats stats;
+	struct rsrc_bmap *map;
+	int sc_id;
+
+	map = &mcs->rx.sc;
+	seq_puts(filp, "\n SC stats\n");
+
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(sc_id, map->bmap, mcs->hw->sc_entries) {
+		mcs_get_sc_stats(mcs, &stats, sc_id, MCS_RX);
+		seq_printf(filp, "\n=======sc%d======\n\n", sc_id);
+		seq_printf(filp, "sc%d: Cam hits: %lld\n", sc_id, stats.hit_cnt);
+		seq_printf(filp, "sc%d: Invalid pkts: %lld\n", sc_id, stats.pkt_invalid_cnt);
+		seq_printf(filp, "sc%d: Late pkts: %lld\n", sc_id, stats.pkt_late_cnt);
+		seq_printf(filp, "sc%d: Notvalid pkts: %lld\n", sc_id, stats.pkt_notvalid_cnt);
+		seq_printf(filp, "sc%d: Unchecked pkts: %lld\n", sc_id, stats.pkt_unchecked_cnt);
+
+		if (mcs->hw->mcs_blks > 1) {
+			seq_printf(filp, "sc%d: Delay pkts: %lld\n", sc_id, stats.pkt_delay_cnt);
+			seq_printf(filp, "sc%d: Pkts ok: %lld\n", sc_id, stats.pkt_ok_cnt);
+		}
+		if (mcs->hw->mcs_blks == 1) {
+			seq_printf(filp, "sc%d: Octets decrypted: %lld\n", sc_id,
+				   stats.octet_decrypt_cnt);
+			seq_printf(filp, "sc%d: Octets validated: %lld\n", sc_id,
+				   stats.octet_validate_cnt);
+		}
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_rx_sc_stats, mcs_rx_sc_stats_display, NULL);
+
+static int rvu_dbg_mcs_flowid_stats_display(struct seq_file *filp, void *unused, int dir)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_flowid_stats stats;
+	struct rsrc_bmap *map;
+	int flow_id;
+
+	seq_puts(filp, "\n Flowid stats\n");
+
+	if (dir == MCS_RX)
+		map = &mcs->rx.flow_ids;
+	else
+		map = &mcs->tx.flow_ids;
+
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(flow_id, map->bmap, mcs->hw->tcam_entries) {
+		mcs_get_flowid_stats(mcs, &stats, flow_id, dir);
+		seq_printf(filp, "Flowid%d: Hit:%lld\n", flow_id, stats.tcam_hit_cnt);
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+static int rvu_dbg_mcs_tx_flowid_stats_display(struct seq_file *filp, void *unused)
+{
+	return rvu_dbg_mcs_flowid_stats_display(filp, unused, MCS_TX);
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_tx_flowid_stats, mcs_tx_flowid_stats_display, NULL);
+
+static int rvu_dbg_mcs_rx_flowid_stats_display(struct seq_file *filp, void *unused)
+{
+	return rvu_dbg_mcs_flowid_stats_display(filp, unused, MCS_RX);
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_rx_flowid_stats, mcs_rx_flowid_stats_display, NULL);
+
+static int rvu_dbg_mcs_tx_secy_stats_display(struct seq_file *filp, void *unused)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_secy_stats stats;
+	struct rsrc_bmap *map;
+	int secy_id;
+
+	map = &mcs->tx.secy;
+	seq_puts(filp, "\n MCS TX secy stats\n");
+
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(secy_id, map->bmap, mcs->hw->secy_entries) {
+		mcs_get_tx_secy_stats(mcs, &stats, secy_id);
+		seq_printf(filp, "\n=======Secy%d======\n\n", secy_id);
+		seq_printf(filp, "secy%d: Ctrl bcast pkts: %lld\n", secy_id,
+			   stats.ctl_pkt_bcast_cnt);
+		seq_printf(filp, "secy%d: Ctrl Mcast pkts: %lld\n", secy_id,
+			   stats.ctl_pkt_mcast_cnt);
+		seq_printf(filp, "secy%d: Ctrl ucast pkts: %lld\n", secy_id,
+			   stats.ctl_pkt_ucast_cnt);
+		seq_printf(filp, "secy%d: Ctrl octets: %lld\n", secy_id, stats.ctl_octet_cnt);
+		seq_printf(filp, "secy%d: Unctrl bcast cnt: %lld\n", secy_id,
+			   stats.unctl_pkt_bcast_cnt);
+		seq_printf(filp, "secy%d: Unctrl mcast pkts: %lld\n", secy_id,
+			   stats.unctl_pkt_mcast_cnt);
+		seq_printf(filp, "secy%d: Unctrl ucast pkts: %lld\n", secy_id,
+			   stats.unctl_pkt_ucast_cnt);
+		seq_printf(filp, "secy%d: Unctrl octets: %lld\n", secy_id, stats.unctl_octet_cnt);
+		seq_printf(filp, "secy%d: Octet encrypted: %lld\n", secy_id,
+			   stats.octet_encrypted_cnt);
+		seq_printf(filp, "secy%d: octet protected: %lld\n", secy_id,
+			   stats.octet_protected_cnt);
+		seq_printf(filp, "secy%d: Pkts on active sa: %lld\n", secy_id,
+			   stats.pkt_noactivesa_cnt);
+		seq_printf(filp, "secy%d: Pkts too long: %lld\n", secy_id, stats.pkt_toolong_cnt);
+		seq_printf(filp, "secy%d: Pkts untagged: %lld\n", secy_id, stats.pkt_untagged_cnt);
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_tx_secy_stats, mcs_tx_secy_stats_display, NULL);
+
+static int rvu_dbg_mcs_rx_secy_stats_display(struct seq_file *filp, void *unused)
+{
+	struct mcs *mcs = filp->private;
+	struct mcs_secy_stats stats;
+	struct rsrc_bmap *map;
+	int secy_id;
+
+	map = &mcs->rx.secy;
+	seq_puts(filp, "\n MCS secy stats\n");
+
+	mutex_lock(&mcs->stats_lock);
+	for_each_set_bit(secy_id, map->bmap, mcs->hw->secy_entries) {
+		mcs_get_rx_secy_stats(mcs, &stats, secy_id);
+		seq_printf(filp, "\n=======Secy%d======\n\n", secy_id);
+		seq_printf(filp, "secy%d: Ctrl bcast pkts: %lld\n", secy_id,
+			   stats.ctl_pkt_bcast_cnt);
+		seq_printf(filp, "secy%d: Ctrl Mcast pkts: %lld\n", secy_id,
+			   stats.ctl_pkt_mcast_cnt);
+		seq_printf(filp, "secy%d: Ctrl ucast pkts: %lld\n", secy_id,
+			   stats.ctl_pkt_ucast_cnt);
+		seq_printf(filp, "secy%d: Ctrl octets: %lld\n", secy_id, stats.ctl_octet_cnt);
+		seq_printf(filp, "secy%d: Unctrl bcast cnt: %lld\n", secy_id,
+			   stats.unctl_pkt_bcast_cnt);
+		seq_printf(filp, "secy%d: Unctrl mcast pkts: %lld\n", secy_id,
+			   stats.unctl_pkt_mcast_cnt);
+		seq_printf(filp, "secy%d: Unctrl ucast pkts: %lld\n", secy_id,
+			   stats.unctl_pkt_ucast_cnt);
+		seq_printf(filp, "secy%d: Unctrl octets: %lld\n", secy_id, stats.unctl_octet_cnt);
+		seq_printf(filp, "secy%d: Octet decrypted: %lld\n", secy_id,
+			   stats.octet_decrypted_cnt);
+		seq_printf(filp, "secy%d: octet validated: %lld\n", secy_id,
+			   stats.octet_validated_cnt);
+		seq_printf(filp, "secy%d: Pkts on disable port: %lld\n", secy_id,
+			   stats.pkt_port_disabled_cnt);
+		seq_printf(filp, "secy%d: Octets validated: %lld\n", secy_id, stats.pkt_badtag_cnt);
+		seq_printf(filp, "secy%d: Octets validated: %lld\n", secy_id, stats.pkt_nosa_cnt);
+		seq_printf(filp, "secy%d: Pkts with nosaerror: %lld\n", secy_id,
+			   stats.pkt_nosaerror_cnt);
+		seq_printf(filp, "secy%d: Tagged ctrl pkts: %lld\n", secy_id,
+			   stats.pkt_tagged_ctl_cnt);
+		seq_printf(filp, "secy%d: Untaged pkts: %lld\n", secy_id, stats.pkt_untaged_cnt);
+		seq_printf(filp, "secy%d: Ctrl pkts: %lld\n", secy_id, stats.pkt_ctl_cnt);
+		if (mcs->hw->mcs_blks > 1)
+			seq_printf(filp, "secy%d: pkts notag: %lld\n", secy_id,
+				   stats.pkt_notag_cnt);
+	}
+	mutex_unlock(&mcs->stats_lock);
+	return 0;
+}
+
+RVU_DEBUG_SEQ_FOPS(mcs_rx_secy_stats, mcs_rx_secy_stats_display, NULL);
+
+static void rvu_dbg_mcs_init(struct rvu *rvu)
+{
+	struct mcs *mcs;
+	char dname[10];
+	int i;
+
+	if (!rvu->mcs_blk_cnt)
+		return;
+
+	rvu->rvu_dbg.mcs_root = debugfs_create_dir("mcs", rvu->rvu_dbg.root);
+
+	for (i = 0; i < rvu->mcs_blk_cnt; i++) {
+		mcs = mcs_get_pdata(i);
+
+		sprintf(dname, "mcs%d", i);
+		rvu->rvu_dbg.mcs = debugfs_create_dir(dname,
+						      rvu->rvu_dbg.mcs_root);
+
+		rvu->rvu_dbg.mcs_rx = debugfs_create_dir("rx_stats", rvu->rvu_dbg.mcs);
+
+		debugfs_create_file("flowid", 0600, rvu->rvu_dbg.mcs_rx, mcs,
+				    &rvu_dbg_mcs_rx_flowid_stats_fops);
+
+		debugfs_create_file("secy", 0600, rvu->rvu_dbg.mcs_rx, mcs,
+				    &rvu_dbg_mcs_rx_secy_stats_fops);
+
+		debugfs_create_file("sc", 0600, rvu->rvu_dbg.mcs_rx, mcs,
+				    &rvu_dbg_mcs_rx_sc_stats_fops);
+
+		debugfs_create_file("sa", 0600, rvu->rvu_dbg.mcs_rx, mcs,
+				    &rvu_dbg_mcs_rx_sa_stats_fops);
+
+		debugfs_create_file("port", 0600, rvu->rvu_dbg.mcs_rx, mcs,
+				    &rvu_dbg_mcs_rx_port_stats_fops);
+
+		rvu->rvu_dbg.mcs_tx = debugfs_create_dir("tx_stats", rvu->rvu_dbg.mcs);
+
+		debugfs_create_file("flowid", 0600, rvu->rvu_dbg.mcs_tx, mcs,
+				    &rvu_dbg_mcs_tx_flowid_stats_fops);
+
+		debugfs_create_file("secy", 0600, rvu->rvu_dbg.mcs_tx, mcs,
+				    &rvu_dbg_mcs_tx_secy_stats_fops);
+
+		debugfs_create_file("sc", 0600, rvu->rvu_dbg.mcs_tx, mcs,
+				    &rvu_dbg_mcs_tx_sc_stats_fops);
+
+		debugfs_create_file("sa", 0600, rvu->rvu_dbg.mcs_tx, mcs,
+				    &rvu_dbg_mcs_tx_sa_stats_fops);
+
+		debugfs_create_file("port", 0600, rvu->rvu_dbg.mcs_tx, mcs,
+				    &rvu_dbg_mcs_tx_port_stats_fops);
+	}
+}
+
 #define LMT_MAPTBL_ENTRY_SIZE 16
 /* Dump LMTST map table */
 static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp,
@@ -3053,6 +3398,7 @@ create:
 	rvu_dbg_npc_init(rvu);
 	rvu_dbg_cpt_init(rvu, BLKADDR_CPT0);
 	rvu_dbg_cpt_init(rvu, BLKADDR_CPT1);
+	rvu_dbg_mcs_init(rvu);
 }
 
 void rvu_dbg_exit(struct rvu *rvu)
-- 
cgit v1.2.3


From c54ffc73601c0a239e55911923a6e23a2a74f143 Mon Sep 17 00:00:00 2001
From: Subbaraya Sundeep <sbhatta@marvell.com>
Date: Sat, 1 Oct 2022 10:29:49 +0530
Subject: octeontx2-pf: mcs: Introduce MACSEC hardware offloading

This patch introduces the macsec offload feature to cn10k
PF netdev driver. The macsec offload ops like adding, deleting
and updating SecYs, SCs, SAs and stats are supported. XPN support
will be added in later patches. Some stats use same counter in hardware
which means based on the SecY mode the same counter represents different
stat. Hence when SecY mode/policy is changed then snapshot of current
stats are captured. Also there is no provision to specify the unique
flow-id/SCI per packet to hardware hence different mac address needs to
be set for macsec interfaces.

Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/marvell/octeontx2/nic/Makefile    |    1 +
 .../ethernet/marvell/octeontx2/nic/cn10k_macsec.c  | 1668 ++++++++++++++++++++
 .../ethernet/marvell/octeontx2/nic/otx2_common.c   |    1 +
 .../ethernet/marvell/octeontx2/nic/otx2_common.h   |   90 ++
 .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   |   16 +
 5 files changed, 1776 insertions(+)
 create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
index d463dc72d80a..73fdb8798614 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
@@ -13,5 +13,6 @@ rvu_nicvf-y := otx2_vf.o otx2_devlink.o
 
 rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o
 rvu_nicvf-$(CONFIG_DCB) += otx2_dcbnl.o
+rvu_nicpf-$(CONFIG_MACSEC) += cn10k_macsec.o
 
 ccflags-y += -I$(srctree)/drivers/net/ethernet/marvell/octeontx2/af
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
new file mode 100644
index 000000000000..64f3acd7f67b
--- /dev/null
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c
@@ -0,0 +1,1668 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell MACSEC hardware offload driver
+ *
+ * Copyright (C) 2022 Marvell.
+ */
+
+#include <linux/rtnetlink.h>
+#include <linux/bitfield.h>
+#include <net/macsec.h>
+#include "otx2_common.h"
+
+#define MCS_TCAM0_MAC_SA_MASK		GENMASK_ULL(63, 48)
+#define MCS_TCAM1_MAC_SA_MASK		GENMASK_ULL(31, 0)
+#define MCS_TCAM1_ETYPE_MASK		GENMASK_ULL(47, 32)
+
+#define MCS_SA_MAP_MEM_SA_USE		BIT_ULL(9)
+
+#define MCS_RX_SECY_PLCY_RW_MASK	GENMASK_ULL(49, 18)
+#define MCS_RX_SECY_PLCY_RP		BIT_ULL(17)
+#define MCS_RX_SECY_PLCY_AUTH_ENA	BIT_ULL(16)
+#define MCS_RX_SECY_PLCY_CIP		GENMASK_ULL(8, 5)
+#define MCS_RX_SECY_PLCY_VAL		GENMASK_ULL(2, 1)
+#define MCS_RX_SECY_PLCY_ENA		BIT_ULL(0)
+
+#define MCS_TX_SECY_PLCY_MTU		GENMASK_ULL(43, 28)
+#define MCS_TX_SECY_PLCY_ST_TCI		GENMASK_ULL(27, 22)
+#define MCS_TX_SECY_PLCY_ST_OFFSET	GENMASK_ULL(21, 15)
+#define MCS_TX_SECY_PLCY_INS_MODE	BIT_ULL(14)
+#define MCS_TX_SECY_PLCY_AUTH_ENA	BIT_ULL(13)
+#define MCS_TX_SECY_PLCY_CIP		GENMASK_ULL(5, 2)
+#define MCS_TX_SECY_PLCY_PROTECT	BIT_ULL(1)
+#define MCS_TX_SECY_PLCY_ENA		BIT_ULL(0)
+
+#define MCS_GCM_AES_128			0
+#define MCS_GCM_AES_256			1
+#define MCS_GCM_AES_XPN_128		2
+#define MCS_GCM_AES_XPN_256		3
+
+#define MCS_TCI_ES			0x40 /* end station */
+#define MCS_TCI_SC			0x20 /* SCI present */
+#define MCS_TCI_SCB			0x10 /* epon */
+#define MCS_TCI_E			0x08 /* encryption */
+#define MCS_TCI_C			0x04 /* changed text */
+
+static struct cn10k_mcs_txsc *cn10k_mcs_get_txsc(struct cn10k_mcs_cfg *cfg,
+						 struct macsec_secy *secy)
+{
+	struct cn10k_mcs_txsc *txsc;
+
+	list_for_each_entry(txsc, &cfg->txsc_list, entry) {
+		if (txsc->sw_secy == secy)
+			return txsc;
+	}
+
+	return NULL;
+}
+
+static struct cn10k_mcs_rxsc *cn10k_mcs_get_rxsc(struct cn10k_mcs_cfg *cfg,
+						 struct macsec_secy *secy,
+						 struct macsec_rx_sc *rx_sc)
+{
+	struct cn10k_mcs_rxsc *rxsc;
+
+	list_for_each_entry(rxsc, &cfg->rxsc_list, entry) {
+		if (rxsc->sw_rxsc == rx_sc && rxsc->sw_secy == secy)
+			return rxsc;
+	}
+
+	return NULL;
+}
+
+static const char *rsrc_name(enum mcs_rsrc_type rsrc_type)
+{
+	switch (rsrc_type) {
+	case MCS_RSRC_TYPE_FLOWID:
+		return "FLOW";
+	case MCS_RSRC_TYPE_SC:
+		return "SC";
+	case MCS_RSRC_TYPE_SECY:
+		return "SECY";
+	case MCS_RSRC_TYPE_SA:
+		return "SA";
+	default:
+		return "Unknown";
+	};
+
+	return "Unknown";
+}
+
+static int cn10k_mcs_alloc_rsrc(struct otx2_nic *pfvf, enum mcs_direction dir,
+				enum mcs_rsrc_type type, u16 *rsrc_id)
+{
+	struct mbox *mbox = &pfvf->mbox;
+	struct mcs_alloc_rsrc_req *req;
+	struct mcs_alloc_rsrc_rsp *rsp;
+	int ret = -ENOMEM;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_alloc_resources(mbox);
+	if (!req)
+		goto fail;
+
+	req->rsrc_type = type;
+	req->rsrc_cnt  = 1;
+	req->dir = dir;
+
+	ret = otx2_sync_mbox_msg(mbox);
+	if (ret)
+		goto fail;
+
+	rsp = (struct mcs_alloc_rsrc_rsp *)otx2_mbox_get_rsp(&pfvf->mbox.mbox,
+							     0, &req->hdr);
+	if (IS_ERR(rsp) || req->rsrc_cnt != rsp->rsrc_cnt ||
+	    req->rsrc_type != rsp->rsrc_type || req->dir != rsp->dir) {
+		ret = -EINVAL;
+		goto fail;
+	}
+
+	switch (rsp->rsrc_type) {
+	case MCS_RSRC_TYPE_FLOWID:
+		*rsrc_id = rsp->flow_ids[0];
+		break;
+	case MCS_RSRC_TYPE_SC:
+		*rsrc_id = rsp->sc_ids[0];
+		break;
+	case MCS_RSRC_TYPE_SECY:
+		*rsrc_id = rsp->secy_ids[0];
+		break;
+	case MCS_RSRC_TYPE_SA:
+		*rsrc_id = rsp->sa_ids[0];
+		break;
+	default:
+		ret = -EINVAL;
+		goto fail;
+	};
+
+	mutex_unlock(&mbox->lock);
+
+	return 0;
+fail:
+	dev_err(pfvf->dev, "Failed to allocate %s %s resource\n",
+		dir == MCS_TX ? "TX" : "RX", rsrc_name(type));
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static void cn10k_mcs_free_rsrc(struct otx2_nic *pfvf, enum mcs_direction dir,
+				enum mcs_rsrc_type type, u16 hw_rsrc_id,
+				bool all)
+{
+	struct mbox *mbox = &pfvf->mbox;
+	struct mcs_free_rsrc_req *req;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_free_resources(mbox);
+	if (!req)
+		goto fail;
+
+	req->rsrc_id = hw_rsrc_id;
+	req->rsrc_type = type;
+	req->dir = dir;
+	if (all)
+		req->all = 1;
+
+	if (otx2_sync_mbox_msg(&pfvf->mbox))
+		goto fail;
+
+	mutex_unlock(&mbox->lock);
+
+	return;
+fail:
+	dev_err(pfvf->dev, "Failed to free %s %s resource\n",
+		dir == MCS_TX ? "TX" : "RX", rsrc_name(type));
+	mutex_unlock(&mbox->lock);
+}
+
+static int cn10k_mcs_alloc_txsa(struct otx2_nic *pfvf, u16 *hw_sa_id)
+{
+	return cn10k_mcs_alloc_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SA, hw_sa_id);
+}
+
+static int cn10k_mcs_alloc_rxsa(struct otx2_nic *pfvf, u16 *hw_sa_id)
+{
+	return cn10k_mcs_alloc_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SA, hw_sa_id);
+}
+
+static void cn10k_mcs_free_txsa(struct otx2_nic *pfvf, u16 hw_sa_id)
+{
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SA, hw_sa_id, false);
+}
+
+static void cn10k_mcs_free_rxsa(struct otx2_nic *pfvf, u16 hw_sa_id)
+{
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SA, hw_sa_id, false);
+}
+
+static int cn10k_mcs_write_rx_secy(struct otx2_nic *pfvf,
+				   struct macsec_secy *secy, u8 hw_secy_id)
+{
+	struct mcs_secy_plcy_write_req *req;
+	struct mbox *mbox = &pfvf->mbox;
+	u64 policy;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_secy_plcy_write(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	policy = FIELD_PREP(MCS_RX_SECY_PLCY_RW_MASK, secy->replay_window);
+	if (secy->replay_protect)
+		policy |= MCS_RX_SECY_PLCY_RP;
+
+	policy |= MCS_RX_SECY_PLCY_AUTH_ENA;
+	policy |= FIELD_PREP(MCS_RX_SECY_PLCY_CIP, MCS_GCM_AES_128);
+	policy |= FIELD_PREP(MCS_RX_SECY_PLCY_VAL, secy->validate_frames);
+
+	policy |= MCS_RX_SECY_PLCY_ENA;
+
+	req->plcy = policy;
+	req->secy_id = hw_secy_id;
+	req->dir = MCS_RX;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_rx_flowid(struct otx2_nic *pfvf,
+				     struct cn10k_mcs_rxsc *rxsc, u8 hw_secy_id)
+{
+	struct macsec_rx_sc *sw_rx_sc = rxsc->sw_rxsc;
+	struct mcs_flowid_entry_write_req *req;
+	struct mbox *mbox = &pfvf->mbox;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_flowid_entry_write(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	req->data[1] = FIELD_PREP(MCS_TCAM1_ETYPE_MASK, ETH_P_MACSEC);
+	req->mask[1] = ~0ULL;
+	req->mask[1] &= ~MCS_TCAM1_ETYPE_MASK;
+
+	req->mask[0] = ~0ULL;
+	req->mask[2] = ~0ULL;
+	req->mask[3] = ~0ULL;
+
+	req->flow_id = rxsc->hw_flow_id;
+	req->secy_id = hw_secy_id;
+	req->sc_id = rxsc->hw_sc_id;
+	req->dir = MCS_RX;
+
+	if (sw_rx_sc->active)
+		req->ena = 1;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_sc_cam(struct otx2_nic *pfvf,
+				  struct cn10k_mcs_rxsc *rxsc, u8 hw_secy_id)
+{
+	struct macsec_rx_sc *sw_rx_sc = rxsc->sw_rxsc;
+	struct mcs_rx_sc_cam_write_req *sc_req;
+	struct mbox *mbox = &pfvf->mbox;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	sc_req = otx2_mbox_alloc_msg_mcs_rx_sc_cam_write(mbox);
+	if (!sc_req) {
+		return -ENOMEM;
+		goto fail;
+	}
+
+	sc_req->sci = (__force u64)cpu_to_be64((__force u64)sw_rx_sc->sci);
+	sc_req->sc_id = rxsc->hw_sc_id;
+	sc_req->secy_id = hw_secy_id;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_rx_sa_plcy(struct otx2_nic *pfvf,
+				      struct macsec_secy *secy,
+				      struct cn10k_mcs_rxsc *rxsc,
+				      u8 assoc_num, bool sa_in_use)
+{
+	unsigned char *src = rxsc->sa_key[assoc_num];
+	struct mcs_sa_plcy_write_req *plcy_req;
+	struct mcs_rx_sc_sa_map *map_req;
+	struct mbox *mbox = &pfvf->mbox;
+	u8 reg, key_len;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	plcy_req = otx2_mbox_alloc_msg_mcs_sa_plcy_write(mbox);
+	if (!plcy_req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	map_req = otx2_mbox_alloc_msg_mcs_rx_sc_sa_map_write(mbox);
+	if (!map_req) {
+		otx2_mbox_reset(&mbox->mbox, 0);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	for (reg = 0, key_len = 0; key_len < secy->key_len; key_len += 8) {
+		memcpy((u8 *)&plcy_req->plcy[0][reg],
+		       (src + reg * 8), 8);
+		reg++;
+	}
+
+	plcy_req->sa_index[0] = rxsc->hw_sa_id[assoc_num];
+	plcy_req->sa_cnt = 1;
+	plcy_req->dir = MCS_RX;
+
+	map_req->sa_index = rxsc->hw_sa_id[assoc_num];
+	map_req->sa_in_use = sa_in_use;
+	map_req->sc_id = rxsc->hw_sc_id;
+	map_req->an = assoc_num;
+
+	/* Send two messages together */
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_rx_sa_pn(struct otx2_nic *pfvf,
+				    struct cn10k_mcs_rxsc *rxsc,
+				    u8 assoc_num, u64 next_pn)
+{
+	struct mcs_pn_table_write_req *req;
+	struct mbox *mbox = &pfvf->mbox;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_pn_table_write(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	req->pn_id = rxsc->hw_sa_id[assoc_num];
+	req->next_pn = next_pn;
+	req->dir = MCS_RX;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_tx_secy(struct otx2_nic *pfvf,
+				   struct macsec_secy *secy,
+				   struct cn10k_mcs_txsc *txsc)
+{
+	struct mcs_secy_plcy_write_req *req;
+	struct mbox *mbox = &pfvf->mbox;
+	struct macsec_tx_sc *sw_tx_sc;
+	/* Insert SecTag after 12 bytes (DA+SA)*/
+	u8 tag_offset = 12;
+	u8 sectag_tci = 0;
+	u64 policy;
+	int ret;
+
+	sw_tx_sc = &secy->tx_sc;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_secy_plcy_write(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (sw_tx_sc->send_sci) {
+		sectag_tci |= MCS_TCI_SC;
+	} else {
+		if (sw_tx_sc->end_station)
+			sectag_tci |= MCS_TCI_ES;
+		if (sw_tx_sc->scb)
+			sectag_tci |= MCS_TCI_SCB;
+	}
+
+	if (sw_tx_sc->encrypt)
+		sectag_tci |= (MCS_TCI_E | MCS_TCI_C);
+
+	policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU, secy->netdev->mtu);
+	/* Write SecTag excluding AN bits(1..0) */
+	policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_TCI, sectag_tci >> 2);
+	policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_OFFSET, tag_offset);
+	policy |= MCS_TX_SECY_PLCY_INS_MODE;
+	policy |= MCS_TX_SECY_PLCY_AUTH_ENA;
+	policy |= FIELD_PREP(MCS_TX_SECY_PLCY_CIP, MCS_GCM_AES_128);
+
+	if (secy->protect_frames)
+		policy |= MCS_TX_SECY_PLCY_PROTECT;
+
+	/* If the encodingsa does not exist/active and protect is
+	 * not set then frames can be sent out as it is. Hence enable
+	 * the policy irrespective of secy operational when !protect.
+	 */
+	if (!secy->protect_frames || secy->operational)
+		policy |= MCS_TX_SECY_PLCY_ENA;
+
+	req->plcy = policy;
+	req->secy_id = txsc->hw_secy_id_tx;
+	req->dir = MCS_TX;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_tx_flowid(struct otx2_nic *pfvf,
+				     struct macsec_secy *secy,
+				     struct cn10k_mcs_txsc *txsc)
+{
+	struct mcs_flowid_entry_write_req *req;
+	struct mbox *mbox = &pfvf->mbox;
+	u64 mac_sa;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_flowid_entry_write(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	mac_sa = ether_addr_to_u64(secy->netdev->dev_addr);
+
+	req->data[0] = FIELD_PREP(MCS_TCAM0_MAC_SA_MASK, mac_sa);
+	req->data[1] = FIELD_PREP(MCS_TCAM1_MAC_SA_MASK, mac_sa >> 16);
+
+	req->mask[0] = ~0ULL;
+	req->mask[0] &= ~MCS_TCAM0_MAC_SA_MASK;
+
+	req->mask[1] = ~0ULL;
+	req->mask[1] &= ~MCS_TCAM1_MAC_SA_MASK;
+
+	req->mask[2] = ~0ULL;
+	req->mask[3] = ~0ULL;
+
+	req->flow_id = txsc->hw_flow_id;
+	req->secy_id = txsc->hw_secy_id_tx;
+	req->sc_id = txsc->hw_sc_id;
+	req->sci = (__force u64)cpu_to_be64((__force u64)secy->sci);
+	req->dir = MCS_TX;
+	/* This can be enabled since stack xmits packets only when interface is up */
+	req->ena = 1;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_link_tx_sa2sc(struct otx2_nic *pfvf,
+				   struct macsec_secy *secy,
+				   struct cn10k_mcs_txsc *txsc,
+				   u8 sa_num, bool sa_active)
+{
+	struct mcs_tx_sc_sa_map *map_req;
+	struct mbox *mbox = &pfvf->mbox;
+	int ret;
+
+	/* Link the encoding_sa only to SC out of all SAs */
+	if (txsc->encoding_sa != sa_num)
+		return 0;
+
+	mutex_lock(&mbox->lock);
+
+	map_req = otx2_mbox_alloc_msg_mcs_tx_sc_sa_map_write(mbox);
+	if (!map_req) {
+		otx2_mbox_reset(&mbox->mbox, 0);
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	map_req->sa_index0 = txsc->hw_sa_id[sa_num];
+	map_req->sa_index0_vld = sa_active;
+	map_req->sectag_sci = (__force u64)cpu_to_be64((__force u64)secy->sci);
+	map_req->sc_id = txsc->hw_sc_id;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_write_tx_sa_plcy(struct otx2_nic *pfvf,
+				      struct macsec_secy *secy,
+				      struct cn10k_mcs_txsc *txsc,
+				      u8 assoc_num)
+{
+	unsigned char *src = txsc->sa_key[assoc_num];
+	struct mcs_sa_plcy_write_req *plcy_req;
+	struct mbox *mbox = &pfvf->mbox;
+	u8 reg, key_len;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	plcy_req = otx2_mbox_alloc_msg_mcs_sa_plcy_write(mbox);
+	if (!plcy_req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	for (reg = 0, key_len = 0; key_len < secy->key_len; key_len += 8) {
+		memcpy((u8 *)&plcy_req->plcy[0][reg], (src + reg * 8), 8);
+		reg++;
+	}
+
+	plcy_req->plcy[0][8] = assoc_num;
+	plcy_req->sa_index[0] = txsc->hw_sa_id[assoc_num];
+	plcy_req->sa_cnt = 1;
+	plcy_req->dir = MCS_TX;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_write_tx_sa_pn(struct otx2_nic *pfvf,
+				struct cn10k_mcs_txsc *txsc,
+				u8 assoc_num, u64 next_pn)
+{
+	struct mcs_pn_table_write_req *req;
+	struct mbox *mbox = &pfvf->mbox;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_pn_table_write(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	req->pn_id = txsc->hw_sa_id[assoc_num];
+	req->next_pn = next_pn;
+	req->dir = MCS_TX;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_ena_dis_flowid(struct otx2_nic *pfvf, u16 hw_flow_id,
+				    bool enable, enum mcs_direction dir)
+{
+	struct mcs_flowid_ena_dis_entry *req;
+	struct mbox *mbox = &pfvf->mbox;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_flowid_ena_entry(mbox);
+	if (!req) {
+		return -ENOMEM;
+		goto fail;
+	}
+
+	req->flow_id = hw_flow_id;
+	req->ena = enable;
+	req->dir = dir;
+
+	ret = otx2_sync_mbox_msg(mbox);
+
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_sa_stats(struct otx2_nic *pfvf, u8 hw_sa_id,
+			      struct mcs_sa_stats *rsp_p,
+			      enum mcs_direction dir, bool clear)
+{
+	struct mcs_clear_stats *clear_req;
+	struct mbox *mbox = &pfvf->mbox;
+	struct mcs_stats_req *req;
+	struct mcs_sa_stats *rsp;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_get_sa_stats(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	req->id = hw_sa_id;
+	req->dir = dir;
+
+	if (!clear)
+		goto send_msg;
+
+	clear_req = otx2_mbox_alloc_msg_mcs_clear_stats(mbox);
+	if (!clear_req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	clear_req->id = hw_sa_id;
+	clear_req->dir = dir;
+	clear_req->type = MCS_RSRC_TYPE_SA;
+
+send_msg:
+	ret = otx2_sync_mbox_msg(mbox);
+	if (ret)
+		goto fail;
+
+	rsp = (struct mcs_sa_stats *)otx2_mbox_get_rsp(&pfvf->mbox.mbox,
+						       0, &req->hdr);
+	if (IS_ERR(rsp)) {
+		ret = PTR_ERR(rsp);
+		goto fail;
+	}
+
+	memcpy(rsp_p, rsp, sizeof(*rsp_p));
+
+	mutex_unlock(&mbox->lock);
+
+	return 0;
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_sc_stats(struct otx2_nic *pfvf, u8 hw_sc_id,
+			      struct mcs_sc_stats *rsp_p,
+			      enum mcs_direction dir, bool clear)
+{
+	struct mcs_clear_stats *clear_req;
+	struct mbox *mbox = &pfvf->mbox;
+	struct mcs_stats_req *req;
+	struct mcs_sc_stats *rsp;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_get_sc_stats(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	req->id = hw_sc_id;
+	req->dir = dir;
+
+	if (!clear)
+		goto send_msg;
+
+	clear_req = otx2_mbox_alloc_msg_mcs_clear_stats(mbox);
+	if (!clear_req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	clear_req->id = hw_sc_id;
+	clear_req->dir = dir;
+	clear_req->type = MCS_RSRC_TYPE_SC;
+
+send_msg:
+	ret = otx2_sync_mbox_msg(mbox);
+	if (ret)
+		goto fail;
+
+	rsp = (struct mcs_sc_stats *)otx2_mbox_get_rsp(&pfvf->mbox.mbox,
+						       0, &req->hdr);
+	if (IS_ERR(rsp)) {
+		ret = PTR_ERR(rsp);
+		goto fail;
+	}
+
+	memcpy(rsp_p, rsp, sizeof(*rsp_p));
+
+	mutex_unlock(&mbox->lock);
+
+	return 0;
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static int cn10k_mcs_secy_stats(struct otx2_nic *pfvf, u8 hw_secy_id,
+				struct mcs_secy_stats *rsp_p,
+				enum mcs_direction dir, bool clear)
+{
+	struct mcs_clear_stats *clear_req;
+	struct mbox *mbox = &pfvf->mbox;
+	struct mcs_secy_stats *rsp;
+	struct mcs_stats_req *req;
+	int ret;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_get_secy_stats(mbox);
+	if (!req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	req->id = hw_secy_id;
+	req->dir = dir;
+
+	if (!clear)
+		goto send_msg;
+
+	clear_req = otx2_mbox_alloc_msg_mcs_clear_stats(mbox);
+	if (!clear_req) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	clear_req->id = hw_secy_id;
+	clear_req->dir = dir;
+	clear_req->type = MCS_RSRC_TYPE_SECY;
+
+send_msg:
+	ret = otx2_sync_mbox_msg(mbox);
+	if (ret)
+		goto fail;
+
+	rsp = (struct mcs_secy_stats *)otx2_mbox_get_rsp(&pfvf->mbox.mbox,
+							 0, &req->hdr);
+	if (IS_ERR(rsp)) {
+		ret = PTR_ERR(rsp);
+		goto fail;
+	}
+
+	memcpy(rsp_p, rsp, sizeof(*rsp_p));
+
+	mutex_unlock(&mbox->lock);
+
+	return 0;
+fail:
+	mutex_unlock(&mbox->lock);
+	return ret;
+}
+
+static struct cn10k_mcs_txsc *cn10k_mcs_create_txsc(struct otx2_nic *pfvf)
+{
+	struct cn10k_mcs_txsc *txsc;
+	int ret;
+
+	txsc = kzalloc(sizeof(*txsc), GFP_KERNEL);
+	if (!txsc)
+		return ERR_PTR(-ENOMEM);
+
+	ret = cn10k_mcs_alloc_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_FLOWID,
+				   &txsc->hw_flow_id);
+	if (ret)
+		goto fail;
+
+	/* For a SecY, one TX secy and one RX secy HW resources are needed */
+	ret = cn10k_mcs_alloc_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SECY,
+				   &txsc->hw_secy_id_tx);
+	if (ret)
+		goto free_flowid;
+
+	ret = cn10k_mcs_alloc_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SECY,
+				   &txsc->hw_secy_id_rx);
+	if (ret)
+		goto free_tx_secy;
+
+	ret = cn10k_mcs_alloc_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SC,
+				   &txsc->hw_sc_id);
+	if (ret)
+		goto free_rx_secy;
+
+	return txsc;
+free_rx_secy:
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SECY,
+			    txsc->hw_secy_id_rx, false);
+free_tx_secy:
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SECY,
+			    txsc->hw_secy_id_tx, false);
+free_flowid:
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_FLOWID,
+			    txsc->hw_flow_id, false);
+fail:
+	return ERR_PTR(ret);
+}
+
+/* Free Tx SC and its SAs(if any) resources to AF
+ */
+static void cn10k_mcs_delete_txsc(struct otx2_nic *pfvf,
+				  struct cn10k_mcs_txsc *txsc)
+{
+	u8 sa_bmap = txsc->sa_bmap;
+	u8 sa_num = 0;
+
+	while (sa_bmap) {
+		if (sa_bmap & 1) {
+			cn10k_mcs_write_tx_sa_plcy(pfvf, txsc->sw_secy,
+						   txsc, sa_num);
+			cn10k_mcs_free_txsa(pfvf, txsc->hw_sa_id[sa_num]);
+		}
+		sa_num++;
+		sa_bmap >>= 1;
+	}
+
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SC,
+			    txsc->hw_sc_id, false);
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SECY,
+			    txsc->hw_secy_id_rx, false);
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SECY,
+			    txsc->hw_secy_id_tx, false);
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_FLOWID,
+			    txsc->hw_flow_id, false);
+}
+
+static struct cn10k_mcs_rxsc *cn10k_mcs_create_rxsc(struct otx2_nic *pfvf)
+{
+	struct cn10k_mcs_rxsc *rxsc;
+	int ret;
+
+	rxsc = kzalloc(sizeof(*rxsc), GFP_KERNEL);
+	if (!rxsc)
+		return ERR_PTR(-ENOMEM);
+
+	ret = cn10k_mcs_alloc_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_FLOWID,
+				   &rxsc->hw_flow_id);
+	if (ret)
+		goto fail;
+
+	ret = cn10k_mcs_alloc_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SC,
+				   &rxsc->hw_sc_id);
+	if (ret)
+		goto free_flowid;
+
+	return rxsc;
+free_flowid:
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_FLOWID,
+			    rxsc->hw_flow_id, false);
+fail:
+	return ERR_PTR(ret);
+}
+
+/* Free Rx SC and its SAs(if any) resources to AF
+ */
+static void cn10k_mcs_delete_rxsc(struct otx2_nic *pfvf,
+				  struct cn10k_mcs_rxsc *rxsc)
+{
+	u8 sa_bmap = rxsc->sa_bmap;
+	u8 sa_num = 0;
+
+	while (sa_bmap) {
+		if (sa_bmap & 1) {
+			cn10k_mcs_write_rx_sa_plcy(pfvf, rxsc->sw_secy, rxsc,
+						   sa_num, false);
+			cn10k_mcs_free_rxsa(pfvf, rxsc->hw_sa_id[sa_num]);
+		}
+		sa_num++;
+		sa_bmap >>= 1;
+	}
+
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SC,
+			    rxsc->hw_sc_id, false);
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_FLOWID,
+			    rxsc->hw_flow_id, false);
+}
+
+static int cn10k_mcs_secy_tx_cfg(struct otx2_nic *pfvf, struct macsec_secy *secy,
+				 struct cn10k_mcs_txsc *txsc,
+				 struct macsec_tx_sa *sw_tx_sa, u8 sa_num)
+{
+	if (sw_tx_sa) {
+		cn10k_mcs_write_tx_sa_plcy(pfvf, secy, txsc, sa_num);
+		cn10k_write_tx_sa_pn(pfvf, txsc, sa_num,
+				     sw_tx_sa->next_pn_halves.lower);
+		cn10k_mcs_link_tx_sa2sc(pfvf, secy, txsc, sa_num,
+					sw_tx_sa->active);
+	}
+
+	cn10k_mcs_write_tx_secy(pfvf, secy, txsc);
+	cn10k_mcs_write_tx_flowid(pfvf, secy, txsc);
+	/* When updating secy, change RX secy also */
+	cn10k_mcs_write_rx_secy(pfvf, secy, txsc->hw_secy_id_rx);
+
+	return 0;
+}
+
+static int cn10k_mcs_secy_rx_cfg(struct otx2_nic *pfvf,
+				 struct macsec_secy *secy, u8 hw_secy_id)
+{
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct cn10k_mcs_rxsc *mcs_rx_sc;
+	struct macsec_rx_sc *sw_rx_sc;
+	struct macsec_rx_sa *sw_rx_sa;
+	u8 sa_num;
+
+	for (sw_rx_sc = rcu_dereference_bh(secy->rx_sc); sw_rx_sc && sw_rx_sc->active;
+	     sw_rx_sc = rcu_dereference_bh(sw_rx_sc->next)) {
+		mcs_rx_sc = cn10k_mcs_get_rxsc(cfg, secy, sw_rx_sc);
+		if (unlikely(!mcs_rx_sc))
+			continue;
+
+		for (sa_num = 0; sa_num < CN10K_MCS_SA_PER_SC; sa_num++) {
+			sw_rx_sa = rcu_dereference_bh(sw_rx_sc->sa[sa_num]);
+			if (!sw_rx_sa)
+				continue;
+
+			cn10k_mcs_write_rx_sa_plcy(pfvf, secy, mcs_rx_sc,
+						   sa_num, sw_rx_sa->active);
+			cn10k_mcs_write_rx_sa_pn(pfvf, mcs_rx_sc, sa_num,
+						 sw_rx_sa->next_pn_halves.lower);
+		}
+
+		cn10k_mcs_write_rx_flowid(pfvf, mcs_rx_sc, hw_secy_id);
+		cn10k_mcs_write_sc_cam(pfvf, mcs_rx_sc, hw_secy_id);
+	}
+
+	return 0;
+}
+
+static int cn10k_mcs_disable_rxscs(struct otx2_nic *pfvf,
+				   struct macsec_secy *secy,
+				   bool delete)
+{
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct cn10k_mcs_rxsc *mcs_rx_sc;
+	struct macsec_rx_sc *sw_rx_sc;
+	int ret;
+
+	for (sw_rx_sc = rcu_dereference_bh(secy->rx_sc); sw_rx_sc && sw_rx_sc->active;
+	     sw_rx_sc = rcu_dereference_bh(sw_rx_sc->next)) {
+		mcs_rx_sc = cn10k_mcs_get_rxsc(cfg, secy, sw_rx_sc);
+		if (unlikely(!mcs_rx_sc))
+			continue;
+
+		ret = cn10k_mcs_ena_dis_flowid(pfvf, mcs_rx_sc->hw_flow_id,
+					       false, MCS_RX);
+		if (ret)
+			dev_err(pfvf->dev, "Failed to disable TCAM for SC %d\n",
+				mcs_rx_sc->hw_sc_id);
+		if (delete) {
+			cn10k_mcs_delete_rxsc(pfvf, mcs_rx_sc);
+			list_del(&mcs_rx_sc->entry);
+			kfree(mcs_rx_sc);
+		}
+	}
+
+	return 0;
+}
+
+static void cn10k_mcs_sync_stats(struct otx2_nic *pfvf, struct macsec_secy *secy,
+				 struct cn10k_mcs_txsc *txsc)
+{
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct mcs_secy_stats rx_rsp = { 0 };
+	struct mcs_sc_stats sc_rsp = { 0 };
+	struct cn10k_mcs_rxsc *rxsc;
+
+	/* Because of shared counters for some stats in the hardware, when
+	 * updating secy policy take a snapshot of current stats and reset them.
+	 * Below are the effected stats because of shared counters.
+	 */
+
+	/* Check if sync is really needed */
+	if (secy->validate_frames == txsc->last_validate_frames &&
+	    secy->protect_frames == txsc->last_protect_frames)
+		return;
+
+	cn10k_mcs_secy_stats(pfvf, txsc->hw_secy_id_rx, &rx_rsp, MCS_RX, true);
+
+	txsc->stats.InPktsBadTag += rx_rsp.pkt_badtag_cnt;
+	txsc->stats.InPktsUnknownSCI += rx_rsp.pkt_nosa_cnt;
+	txsc->stats.InPktsNoSCI += rx_rsp.pkt_nosaerror_cnt;
+	if (txsc->last_validate_frames == MACSEC_VALIDATE_STRICT)
+		txsc->stats.InPktsNoTag += rx_rsp.pkt_untaged_cnt;
+	else
+		txsc->stats.InPktsUntagged += rx_rsp.pkt_untaged_cnt;
+
+	list_for_each_entry(rxsc, &cfg->rxsc_list, entry) {
+		cn10k_mcs_sc_stats(pfvf, rxsc->hw_sc_id, &sc_rsp, MCS_RX, true);
+
+		rxsc->stats.InOctetsValidated += sc_rsp.octet_validate_cnt;
+		rxsc->stats.InOctetsDecrypted += sc_rsp.octet_decrypt_cnt;
+
+		rxsc->stats.InPktsInvalid += sc_rsp.pkt_invalid_cnt;
+		rxsc->stats.InPktsNotValid += sc_rsp.pkt_notvalid_cnt;
+
+		if (txsc->last_protect_frames)
+			rxsc->stats.InPktsLate += sc_rsp.pkt_late_cnt;
+		else
+			rxsc->stats.InPktsDelayed += sc_rsp.pkt_late_cnt;
+
+		if (txsc->last_validate_frames == MACSEC_VALIDATE_CHECK)
+			rxsc->stats.InPktsUnchecked += sc_rsp.pkt_unchecked_cnt;
+		else
+			rxsc->stats.InPktsOK += sc_rsp.pkt_unchecked_cnt;
+	}
+
+	txsc->last_validate_frames = secy->validate_frames;
+	txsc->last_protect_frames = secy->protect_frames;
+}
+
+static int cn10k_mdo_open(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	struct macsec_tx_sa *sw_tx_sa;
+	struct cn10k_mcs_txsc *txsc;
+	u8 sa_num;
+	int err;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	sa_num = txsc->encoding_sa;
+	sw_tx_sa = rcu_dereference_bh(secy->tx_sc.sa[sa_num]);
+
+	err = cn10k_mcs_secy_tx_cfg(pfvf, secy, txsc, sw_tx_sa, sa_num);
+	if (err)
+		return err;
+
+	return cn10k_mcs_secy_rx_cfg(pfvf, secy, txsc->hw_secy_id_rx);
+}
+
+static int cn10k_mdo_stop(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct cn10k_mcs_txsc *txsc;
+	int err;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	err = cn10k_mcs_ena_dis_flowid(pfvf, txsc->hw_flow_id, false, MCS_TX);
+	if (err)
+		return err;
+
+	return cn10k_mcs_disable_rxscs(pfvf, ctx->secy, false);
+}
+
+static int cn10k_mdo_add_secy(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	struct cn10k_mcs_txsc *txsc;
+
+	if (secy->icv_len != MACSEC_DEFAULT_ICV_LEN)
+		return -EOPNOTSUPP;
+
+	/* Stick to 16 bytes key len until XPN support is added */
+	if (secy->key_len != 16)
+		return -EOPNOTSUPP;
+
+	if (secy->xpn)
+		return -EOPNOTSUPP;
+
+	txsc = cn10k_mcs_create_txsc(pfvf);
+	if (IS_ERR(txsc))
+		return -ENOSPC;
+
+	txsc->sw_secy = secy;
+	txsc->encoding_sa = secy->tx_sc.encoding_sa;
+	txsc->last_validate_frames = secy->validate_frames;
+	txsc->last_protect_frames = secy->protect_frames;
+
+	list_add(&txsc->entry, &cfg->txsc_list);
+
+	if (netif_running(secy->netdev))
+		return cn10k_mcs_secy_tx_cfg(pfvf, secy, txsc, NULL, 0);
+
+	return 0;
+}
+
+static int cn10k_mdo_upd_secy(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	struct macsec_tx_sa *sw_tx_sa;
+	struct cn10k_mcs_txsc *txsc;
+	u8 sa_num;
+	int err;
+
+	txsc = cn10k_mcs_get_txsc(cfg, secy);
+	if (!txsc)
+		return -ENOENT;
+
+	txsc->encoding_sa = secy->tx_sc.encoding_sa;
+
+	sa_num = txsc->encoding_sa;
+	sw_tx_sa = rcu_dereference_bh(secy->tx_sc.sa[sa_num]);
+
+	if (netif_running(secy->netdev)) {
+		cn10k_mcs_sync_stats(pfvf, secy, txsc);
+
+		err = cn10k_mcs_secy_tx_cfg(pfvf, secy, txsc, sw_tx_sa, sa_num);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cn10k_mdo_del_secy(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct cn10k_mcs_txsc *txsc;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	cn10k_mcs_ena_dis_flowid(pfvf, txsc->hw_flow_id, false, MCS_TX);
+	cn10k_mcs_disable_rxscs(pfvf, ctx->secy, true);
+	cn10k_mcs_delete_txsc(pfvf, txsc);
+	list_del(&txsc->entry);
+	kfree(txsc);
+
+	return 0;
+}
+
+static int cn10k_mdo_add_txsa(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct macsec_tx_sa *sw_tx_sa = ctx->sa.tx_sa;
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_txsc *txsc;
+	int err;
+
+	txsc = cn10k_mcs_get_txsc(cfg, secy);
+	if (!txsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	if (cn10k_mcs_alloc_txsa(pfvf, &txsc->hw_sa_id[sa_num]))
+		return -ENOSPC;
+
+	memcpy(&txsc->sa_key[sa_num], ctx->sa.key, secy->key_len);
+	txsc->sa_bmap |= 1 << sa_num;
+
+	if (netif_running(secy->netdev)) {
+		err = cn10k_mcs_write_tx_sa_plcy(pfvf, secy, txsc, sa_num);
+		if (err)
+			return err;
+
+		err = cn10k_write_tx_sa_pn(pfvf, txsc, sa_num,
+					   sw_tx_sa->next_pn_halves.lower);
+		if (err)
+			return err;
+
+		err = cn10k_mcs_link_tx_sa2sc(pfvf, secy, txsc,
+					      sa_num, sw_tx_sa->active);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cn10k_mdo_upd_txsa(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct macsec_tx_sa *sw_tx_sa = ctx->sa.tx_sa;
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_txsc *txsc;
+	int err;
+
+	txsc = cn10k_mcs_get_txsc(cfg, secy);
+	if (!txsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	if (netif_running(secy->netdev)) {
+		/* Keys cannot be changed after creation */
+		err = cn10k_write_tx_sa_pn(pfvf, txsc, sa_num,
+					   sw_tx_sa->next_pn_halves.lower);
+		if (err)
+			return err;
+
+		err = cn10k_mcs_link_tx_sa2sc(pfvf, secy, txsc,
+					      sa_num, sw_tx_sa->active);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cn10k_mdo_del_txsa(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_txsc *txsc;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	cn10k_mcs_free_txsa(pfvf, txsc->hw_sa_id[sa_num]);
+	txsc->sa_bmap &= ~(1 << sa_num);
+
+	return 0;
+}
+
+static int cn10k_mdo_add_rxsc(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	struct cn10k_mcs_rxsc *rxsc;
+	struct cn10k_mcs_txsc *txsc;
+	int err;
+
+	txsc = cn10k_mcs_get_txsc(cfg, secy);
+	if (!txsc)
+		return -ENOENT;
+
+	rxsc = cn10k_mcs_create_rxsc(pfvf);
+	if (IS_ERR(rxsc))
+		return -ENOSPC;
+
+	rxsc->sw_secy = ctx->secy;
+	rxsc->sw_rxsc = ctx->rx_sc;
+	list_add(&rxsc->entry, &cfg->rxsc_list);
+
+	if (netif_running(secy->netdev)) {
+		err = cn10k_mcs_write_rx_flowid(pfvf, rxsc, txsc->hw_secy_id_rx);
+		if (err)
+			return err;
+
+		err = cn10k_mcs_write_sc_cam(pfvf, rxsc, txsc->hw_secy_id_rx);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cn10k_mdo_upd_rxsc(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	bool enable = ctx->rx_sc->active;
+	struct cn10k_mcs_rxsc *rxsc;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, secy, ctx->rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	if (netif_running(secy->netdev))
+		return cn10k_mcs_ena_dis_flowid(pfvf, rxsc->hw_flow_id,
+						enable, MCS_RX);
+
+	return 0;
+}
+
+static int cn10k_mdo_del_rxsc(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct cn10k_mcs_rxsc *rxsc;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, ctx->secy, ctx->rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	cn10k_mcs_ena_dis_flowid(pfvf, rxsc->hw_flow_id, false, MCS_RX);
+	cn10k_mcs_delete_rxsc(pfvf, rxsc);
+	list_del(&rxsc->entry);
+	kfree(rxsc);
+
+	return 0;
+}
+
+static int cn10k_mdo_add_rxsa(struct macsec_context *ctx)
+{
+	struct macsec_rx_sc *sw_rx_sc = ctx->sa.rx_sa->sc;
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_rx_sa *rx_sa = ctx->sa.rx_sa;
+	u64 next_pn = rx_sa->next_pn_halves.lower;
+	struct macsec_secy *secy = ctx->secy;
+	bool sa_in_use = rx_sa->active;
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_rxsc *rxsc;
+	int err;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, secy, sw_rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	if (cn10k_mcs_alloc_rxsa(pfvf, &rxsc->hw_sa_id[sa_num]))
+		return -ENOSPC;
+
+	memcpy(&rxsc->sa_key[sa_num], ctx->sa.key, ctx->secy->key_len);
+	rxsc->sa_bmap |= 1 << sa_num;
+
+	if (netif_running(secy->netdev)) {
+		err = cn10k_mcs_write_rx_sa_plcy(pfvf, secy, rxsc,
+						 sa_num, sa_in_use);
+		if (err)
+			return err;
+
+		err = cn10k_mcs_write_rx_sa_pn(pfvf, rxsc, sa_num, next_pn);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cn10k_mdo_upd_rxsa(struct macsec_context *ctx)
+{
+	struct macsec_rx_sc *sw_rx_sc = ctx->sa.rx_sa->sc;
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_rx_sa *rx_sa = ctx->sa.rx_sa;
+	u64 next_pn = rx_sa->next_pn_halves.lower;
+	struct macsec_secy *secy = ctx->secy;
+	bool sa_in_use = rx_sa->active;
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_rxsc *rxsc;
+	int err;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, secy, sw_rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	if (netif_running(secy->netdev)) {
+		err = cn10k_mcs_write_rx_sa_plcy(pfvf, secy, rxsc, sa_num, sa_in_use);
+		if (err)
+			return err;
+
+		err = cn10k_mcs_write_rx_sa_pn(pfvf, rxsc, sa_num, next_pn);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static int cn10k_mdo_del_rxsa(struct macsec_context *ctx)
+{
+	struct macsec_rx_sc *sw_rx_sc = ctx->sa.rx_sa->sc;
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_rxsc *rxsc;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, ctx->secy, sw_rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	cn10k_mcs_write_rx_sa_plcy(pfvf, ctx->secy, rxsc, sa_num, false);
+	cn10k_mcs_free_rxsa(pfvf, rxsc->hw_sa_id[sa_num]);
+
+	rxsc->sa_bmap &= ~(1 << sa_num);
+
+	return 0;
+}
+
+static int cn10k_mdo_get_dev_stats(struct macsec_context *ctx)
+{
+	struct mcs_secy_stats tx_rsp = { 0 }, rx_rsp = { 0 };
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	struct cn10k_mcs_txsc *txsc;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	cn10k_mcs_secy_stats(pfvf, txsc->hw_secy_id_tx, &tx_rsp, MCS_TX, false);
+	ctx->stats.dev_stats->OutPktsUntagged = tx_rsp.pkt_untagged_cnt;
+	ctx->stats.dev_stats->OutPktsTooLong = tx_rsp.pkt_toolong_cnt;
+
+	cn10k_mcs_secy_stats(pfvf, txsc->hw_secy_id_rx, &rx_rsp, MCS_RX, true);
+	txsc->stats.InPktsBadTag += rx_rsp.pkt_badtag_cnt;
+	txsc->stats.InPktsUnknownSCI += rx_rsp.pkt_nosa_cnt;
+	txsc->stats.InPktsNoSCI += rx_rsp.pkt_nosaerror_cnt;
+	if (secy->validate_frames == MACSEC_VALIDATE_STRICT)
+		txsc->stats.InPktsNoTag += rx_rsp.pkt_untaged_cnt;
+	else
+		txsc->stats.InPktsUntagged += rx_rsp.pkt_untaged_cnt;
+	txsc->stats.InPktsOverrun = 0;
+
+	ctx->stats.dev_stats->InPktsNoTag = txsc->stats.InPktsNoTag;
+	ctx->stats.dev_stats->InPktsUntagged = txsc->stats.InPktsUntagged;
+	ctx->stats.dev_stats->InPktsBadTag = txsc->stats.InPktsBadTag;
+	ctx->stats.dev_stats->InPktsUnknownSCI = txsc->stats.InPktsUnknownSCI;
+	ctx->stats.dev_stats->InPktsNoSCI = txsc->stats.InPktsNoSCI;
+	ctx->stats.dev_stats->InPktsOverrun = txsc->stats.InPktsOverrun;
+
+	return 0;
+}
+
+static int cn10k_mdo_get_tx_sc_stats(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct mcs_sc_stats rsp = { 0 };
+	struct cn10k_mcs_txsc *txsc;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	cn10k_mcs_sc_stats(pfvf, txsc->hw_sc_id, &rsp, MCS_TX, false);
+
+	ctx->stats.tx_sc_stats->OutPktsProtected = rsp.pkt_protected_cnt;
+	ctx->stats.tx_sc_stats->OutPktsEncrypted = rsp.pkt_encrypt_cnt;
+	ctx->stats.tx_sc_stats->OutOctetsProtected = rsp.octet_protected_cnt;
+	ctx->stats.tx_sc_stats->OutOctetsEncrypted = rsp.octet_encrypt_cnt;
+
+	return 0;
+}
+
+static int cn10k_mdo_get_tx_sa_stats(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct mcs_sa_stats rsp = { 0 };
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_txsc *txsc;
+
+	txsc = cn10k_mcs_get_txsc(cfg, ctx->secy);
+	if (!txsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	cn10k_mcs_sa_stats(pfvf, txsc->hw_sa_id[sa_num], &rsp, MCS_TX, false);
+
+	ctx->stats.tx_sa_stats->OutPktsProtected = rsp.pkt_protected_cnt;
+	ctx->stats.tx_sa_stats->OutPktsEncrypted = rsp.pkt_encrypt_cnt;
+
+	return 0;
+}
+
+static int cn10k_mdo_get_rx_sc_stats(struct macsec_context *ctx)
+{
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_secy *secy = ctx->secy;
+	struct mcs_sc_stats rsp = { 0 };
+	struct cn10k_mcs_rxsc *rxsc;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, secy, ctx->rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	cn10k_mcs_sc_stats(pfvf, rxsc->hw_sc_id, &rsp, MCS_RX, true);
+
+	rxsc->stats.InOctetsValidated += rsp.octet_validate_cnt;
+	rxsc->stats.InOctetsDecrypted += rsp.octet_decrypt_cnt;
+
+	rxsc->stats.InPktsInvalid += rsp.pkt_invalid_cnt;
+	rxsc->stats.InPktsNotValid += rsp.pkt_notvalid_cnt;
+
+	if (secy->protect_frames)
+		rxsc->stats.InPktsLate += rsp.pkt_late_cnt;
+	else
+		rxsc->stats.InPktsDelayed += rsp.pkt_late_cnt;
+
+	if (secy->validate_frames == MACSEC_VALIDATE_CHECK)
+		rxsc->stats.InPktsUnchecked += rsp.pkt_unchecked_cnt;
+	else
+		rxsc->stats.InPktsOK += rsp.pkt_unchecked_cnt;
+
+	ctx->stats.rx_sc_stats->InOctetsValidated = rxsc->stats.InOctetsValidated;
+	ctx->stats.rx_sc_stats->InOctetsDecrypted = rxsc->stats.InOctetsDecrypted;
+	ctx->stats.rx_sc_stats->InPktsInvalid = rxsc->stats.InPktsInvalid;
+	ctx->stats.rx_sc_stats->InPktsNotValid = rxsc->stats.InPktsNotValid;
+	ctx->stats.rx_sc_stats->InPktsLate = rxsc->stats.InPktsLate;
+	ctx->stats.rx_sc_stats->InPktsDelayed = rxsc->stats.InPktsDelayed;
+	ctx->stats.rx_sc_stats->InPktsUnchecked = rxsc->stats.InPktsUnchecked;
+	ctx->stats.rx_sc_stats->InPktsOK = rxsc->stats.InPktsOK;
+
+	return 0;
+}
+
+static int cn10k_mdo_get_rx_sa_stats(struct macsec_context *ctx)
+{
+	struct macsec_rx_sc *sw_rx_sc = ctx->sa.rx_sa->sc;
+	struct otx2_nic *pfvf = netdev_priv(ctx->netdev);
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct mcs_sa_stats rsp = { 0 };
+	u8 sa_num = ctx->sa.assoc_num;
+	struct cn10k_mcs_rxsc *rxsc;
+
+	rxsc = cn10k_mcs_get_rxsc(cfg, ctx->secy, sw_rx_sc);
+	if (!rxsc)
+		return -ENOENT;
+
+	if (sa_num >= CN10K_MCS_SA_PER_SC)
+		return -EOPNOTSUPP;
+
+	cn10k_mcs_sa_stats(pfvf, rxsc->hw_sa_id[sa_num], &rsp, MCS_RX, false);
+
+	ctx->stats.rx_sa_stats->InPktsOK = rsp.pkt_ok_cnt;
+	ctx->stats.rx_sa_stats->InPktsInvalid = rsp.pkt_invalid_cnt;
+	ctx->stats.rx_sa_stats->InPktsNotValid = rsp.pkt_notvalid_cnt;
+	ctx->stats.rx_sa_stats->InPktsNotUsingSA = rsp.pkt_nosaerror_cnt;
+	ctx->stats.rx_sa_stats->InPktsUnusedSA = rsp.pkt_nosa_cnt;
+
+	return 0;
+}
+
+static const struct macsec_ops cn10k_mcs_ops = {
+	.mdo_dev_open = cn10k_mdo_open,
+	.mdo_dev_stop = cn10k_mdo_stop,
+	.mdo_add_secy = cn10k_mdo_add_secy,
+	.mdo_upd_secy = cn10k_mdo_upd_secy,
+	.mdo_del_secy = cn10k_mdo_del_secy,
+	.mdo_add_rxsc = cn10k_mdo_add_rxsc,
+	.mdo_upd_rxsc = cn10k_mdo_upd_rxsc,
+	.mdo_del_rxsc = cn10k_mdo_del_rxsc,
+	.mdo_add_rxsa = cn10k_mdo_add_rxsa,
+	.mdo_upd_rxsa = cn10k_mdo_upd_rxsa,
+	.mdo_del_rxsa = cn10k_mdo_del_rxsa,
+	.mdo_add_txsa = cn10k_mdo_add_txsa,
+	.mdo_upd_txsa = cn10k_mdo_upd_txsa,
+	.mdo_del_txsa = cn10k_mdo_del_txsa,
+	.mdo_get_dev_stats = cn10k_mdo_get_dev_stats,
+	.mdo_get_tx_sc_stats = cn10k_mdo_get_tx_sc_stats,
+	.mdo_get_tx_sa_stats = cn10k_mdo_get_tx_sa_stats,
+	.mdo_get_rx_sc_stats = cn10k_mdo_get_rx_sc_stats,
+	.mdo_get_rx_sa_stats = cn10k_mdo_get_rx_sa_stats,
+};
+
+void cn10k_handle_mcs_event(struct otx2_nic *pfvf, struct mcs_intr_info *event)
+{
+	struct cn10k_mcs_cfg *cfg = pfvf->macsec_cfg;
+	struct macsec_tx_sa *sw_tx_sa = NULL;
+	struct macsec_secy *secy = NULL;
+	struct cn10k_mcs_txsc *txsc;
+	u8 an;
+
+	if (!test_bit(CN10K_HW_MACSEC, &pfvf->hw.cap_flag))
+		return;
+
+	if (!(event->intr_mask & MCS_CPM_TX_PACKET_XPN_EQ0_INT))
+		return;
+
+	/* Find the SecY to which the expired hardware SA is mapped */
+	list_for_each_entry(txsc, &cfg->txsc_list, entry) {
+		for (an = 0; an < CN10K_MCS_SA_PER_SC; an++)
+			if (txsc->hw_sa_id[an] == event->sa_id) {
+				secy = txsc->sw_secy;
+				sw_tx_sa = rcu_dereference_bh(secy->tx_sc.sa[an]);
+			}
+	}
+
+	if (secy && sw_tx_sa)
+		macsec_pn_wrapped(secy, sw_tx_sa);
+}
+
+int cn10k_mcs_init(struct otx2_nic *pfvf)
+{
+	struct mbox *mbox = &pfvf->mbox;
+	struct cn10k_mcs_cfg *cfg;
+	struct mcs_intr_cfg *req;
+
+	if (!test_bit(CN10K_HW_MACSEC, &pfvf->hw.cap_flag))
+		return 0;
+
+	cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+	if (!cfg)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&cfg->txsc_list);
+	INIT_LIST_HEAD(&cfg->rxsc_list);
+	pfvf->macsec_cfg = cfg;
+
+	pfvf->netdev->features |= NETIF_F_HW_MACSEC;
+	pfvf->netdev->macsec_ops = &cn10k_mcs_ops;
+
+	mutex_lock(&mbox->lock);
+
+	req = otx2_mbox_alloc_msg_mcs_intr_cfg(mbox);
+	if (!req)
+		goto fail;
+
+	req->intr_mask = MCS_CPM_TX_PACKET_XPN_EQ0_INT;
+
+	if (otx2_sync_mbox_msg(mbox))
+		goto fail;
+
+	mutex_unlock(&mbox->lock);
+
+	return 0;
+fail:
+	dev_err(pfvf->dev, "Cannot notify PN wrapped event\n");
+	return 0;
+}
+
+void cn10k_mcs_free(struct otx2_nic *pfvf)
+{
+	if (!test_bit(CN10K_HW_MACSEC, &pfvf->hw.cap_flag))
+		return;
+
+	cn10k_mcs_free_rsrc(pfvf, MCS_TX, MCS_RSRC_TYPE_SECY, 0, true);
+	cn10k_mcs_free_rsrc(pfvf, MCS_RX, MCS_RSRC_TYPE_SECY, 0, true);
+	kfree(pfvf->macsec_cfg);
+	pfvf->macsec_cfg = NULL;
+}
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
index bc3e6aae6efa..9ac9e6615ae7 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
@@ -1827,4 +1827,5 @@ otx2_mbox_up_handler_ ## _fn_name(struct otx2_nic *pfvf,		\
 }									\
 EXPORT_SYMBOL(otx2_mbox_up_handler_ ## _fn_name);
 MBOX_UP_CGX_MESSAGES
+MBOX_UP_MCS_MESSAGES
 #undef M
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
index 4c7691a1a1ed..282db6fe3b08 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
@@ -19,6 +19,7 @@
 #include <net/devlink.h>
 #include <linux/time64.h>
 #include <linux/dim.h>
+#include <uapi/linux/if_macsec.h>
 
 #include <mbox.h>
 #include <npc.h>
@@ -33,6 +34,7 @@
 #define PCI_DEVID_OCTEONTX2_RVU_AFVF		0xA0F8
 
 #define PCI_SUBSYS_DEVID_96XX_RVU_PFVF		0xB200
+#define PCI_SUBSYS_DEVID_CN10K_B_RVU_PFVF	0xBD00
 
 /* PCI BAR nos */
 #define PCI_CFG_REG_BAR_NUM                     2
@@ -244,6 +246,7 @@ struct otx2_hw {
 #define CN10K_LMTST		2
 #define CN10K_RPM		3
 #define CN10K_PTP_ONESTEP	4
+#define CN10K_HW_MACSEC		5
 	unsigned long		cap_flag;
 
 #define LMT_LINE_SIZE		128
@@ -351,6 +354,66 @@ struct dev_hw_ops {
 	void	(*aura_freeptr)(void *dev, int aura, u64 buf);
 };
 
+#define CN10K_MCS_SA_PER_SC	4
+
+/* Stats which need to be accumulated in software because
+ * of shared counters in hardware.
+ */
+struct cn10k_txsc_stats {
+	u64 InPktsUntagged;
+	u64 InPktsNoTag;
+	u64 InPktsBadTag;
+	u64 InPktsUnknownSCI;
+	u64 InPktsNoSCI;
+	u64 InPktsOverrun;
+};
+
+struct cn10k_rxsc_stats {
+	u64 InOctetsValidated;
+	u64 InOctetsDecrypted;
+	u64 InPktsUnchecked;
+	u64 InPktsDelayed;
+	u64 InPktsOK;
+	u64 InPktsInvalid;
+	u64 InPktsLate;
+	u64 InPktsNotValid;
+	u64 InPktsNotUsingSA;
+	u64 InPktsUnusedSA;
+};
+
+struct cn10k_mcs_txsc {
+	struct macsec_secy *sw_secy;
+	struct cn10k_txsc_stats stats;
+	struct list_head entry;
+	enum macsec_validation_type last_validate_frames;
+	bool last_protect_frames;
+	u16 hw_secy_id_tx;
+	u16 hw_secy_id_rx;
+	u16 hw_flow_id;
+	u16 hw_sc_id;
+	u16 hw_sa_id[CN10K_MCS_SA_PER_SC];
+	u8 sa_bmap;
+	u8 sa_key[CN10K_MCS_SA_PER_SC][MACSEC_MAX_KEY_LEN];
+	u8 encoding_sa;
+};
+
+struct cn10k_mcs_rxsc {
+	struct macsec_secy *sw_secy;
+	struct macsec_rx_sc *sw_rxsc;
+	struct cn10k_rxsc_stats stats;
+	struct list_head entry;
+	u16 hw_flow_id;
+	u16 hw_sc_id;
+	u16 hw_sa_id[CN10K_MCS_SA_PER_SC];
+	u8 sa_bmap;
+	u8 sa_key[CN10K_MCS_SA_PER_SC][MACSEC_MAX_KEY_LEN];
+};
+
+struct cn10k_mcs_cfg {
+	struct list_head txsc_list;
+	struct list_head rxsc_list;
+};
+
 struct otx2_nic {
 	void __iomem		*reg_base;
 	struct net_device	*netdev;
@@ -438,6 +501,10 @@ struct otx2_nic {
 
 	/* napi event count. It is needed for adaptive irq coalescing. */
 	u32 napi_events;
+
+#if IS_ENABLED(CONFIG_MACSEC)
+	struct cn10k_mcs_cfg	*macsec_cfg;
+#endif
 };
 
 static inline bool is_otx2_lbkvf(struct pci_dev *pdev)
@@ -477,6 +544,11 @@ static inline bool is_dev_otx2(struct pci_dev *pdev)
 		midr == PCI_REVISION_ID_95XXMM || midr == PCI_REVISION_ID_95XXO);
 }
 
+static inline bool is_dev_cn10kb(struct pci_dev *pdev)
+{
+	return pdev->subsystem_device == PCI_SUBSYS_DEVID_CN10K_B_RVU_PFVF;
+}
+
 static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf)
 {
 	struct otx2_hw *hw = &pfvf->hw;
@@ -508,6 +580,9 @@ static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf)
 		__set_bit(CN10K_RPM, &hw->cap_flag);
 		__set_bit(CN10K_PTP_ONESTEP, &hw->cap_flag);
 	}
+
+	if (is_dev_cn10kb(pfvf->pdev))
+		__set_bit(CN10K_HW_MACSEC, &hw->cap_flag);
 }
 
 /* Register read/write APIs */
@@ -763,6 +838,7 @@ otx2_mbox_up_handler_ ## _fn_name(struct otx2_nic *pfvf,		\
 				struct _rsp_type *rsp);			\
 
 MBOX_UP_CGX_MESSAGES
+MBOX_UP_MCS_MESSAGES
 #undef M
 
 /* Time to wait before watchdog kicks off */
@@ -945,4 +1021,18 @@ int otx2_pfc_txschq_alloc(struct otx2_nic *pfvf);
 int otx2_pfc_txschq_update(struct otx2_nic *pfvf);
 int otx2_pfc_txschq_stop(struct otx2_nic *pfvf);
 #endif
+
+#if IS_ENABLED(CONFIG_MACSEC)
+/* MACSEC offload support */
+int cn10k_mcs_init(struct otx2_nic *pfvf);
+void cn10k_mcs_free(struct otx2_nic *pfvf);
+void cn10k_handle_mcs_event(struct otx2_nic *pfvf, struct mcs_intr_info *event);
+#else
+static inline int cn10k_mcs_init(struct otx2_nic *pfvf) { return 0; }
+static inline void cn10k_mcs_free(struct otx2_nic *pfvf) {}
+static inline void cn10k_handle_mcs_event(struct otx2_nic *pfvf,
+					  struct mcs_intr_info *event)
+{}
+#endif /* CONFIG_MACSEC */
+
 #endif /* OTX2_COMMON_H */
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
index fa9348d6a4f4..5803d7f9137c 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
@@ -858,6 +858,15 @@ static void otx2_handle_link_event(struct otx2_nic *pf)
 	}
 }
 
+int otx2_mbox_up_handler_mcs_intr_notify(struct otx2_nic *pf,
+					 struct mcs_intr_info *event,
+					 struct msg_rsp *rsp)
+{
+	cn10k_handle_mcs_event(pf, event);
+
+	return 0;
+}
+
 int otx2_mbox_up_handler_cgx_link_event(struct otx2_nic *pf,
 					struct cgx_link_info_msg *msg,
 					struct msg_rsp *rsp)
@@ -917,6 +926,7 @@ static int otx2_process_mbox_msg_up(struct otx2_nic *pf,
 		return err;						\
 	}
 MBOX_UP_CGX_MESSAGES
+MBOX_UP_MCS_MESSAGES
 #undef M
 		break;
 	default:
@@ -2764,6 +2774,10 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (err)
 		goto err_ptp_destroy;
 
+	err = cn10k_mcs_init(pf);
+	if (err)
+		goto err_del_mcam_entries;
+
 	if (pf->flags & OTX2_FLAG_NTUPLE_SUPPORT)
 		netdev->hw_features |= NETIF_F_NTUPLE;
 
@@ -2978,6 +2992,8 @@ static void otx2_remove(struct pci_dev *pdev)
 		otx2_config_pause_frm(pf);
 	}
 
+	cn10k_mcs_free(pf);
+
 #ifdef CONFIG_DCB
 	/* Disable PFC config */
 	if (pf->pfc_en) {
-- 
cgit v1.2.3


From ba0fbdb95da5ddd8db457ce6ba09d16dd979a294 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maxtram95@gmail.com>
Date: Sat, 1 Oct 2022 13:57:13 +0300
Subject: net: wwan: iosm: Call mutex_init before locking it

wwan_register_ops calls wwan_create_default_link, which ends up in the
ipc_wwan_newlink callback that locks ipc_wwan->if_mutex. However, this
mutex is not yet initialized by that point. Fix it by moving mutex_init
above the wwan_register_ops call. This also makes the order of
operations in ipc_wwan_init symmetric to ipc_wwan_deinit.

Fixes: 83068395bbfc ("net: iosm: create default link via WWAN core")
Signed-off-by: Maxim Mikityanskiy <maxtram95@gmail.com>
Reviewed-by: M Chetan Kumar <m.chetan.kumar@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/wwan/iosm/iosm_ipc_wwan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wwan/iosm/iosm_ipc_wwan.c b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
index 27151148c782..4712f01a7e33 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_wwan.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_wwan.c
@@ -323,15 +323,16 @@ struct iosm_wwan *ipc_wwan_init(struct iosm_imem *ipc_imem, struct device *dev)
 	ipc_wwan->dev = dev;
 	ipc_wwan->ipc_imem = ipc_imem;
 
+	mutex_init(&ipc_wwan->if_mutex);
+
 	/* WWAN core will create a netdev for the default IP MUX channel */
 	if (wwan_register_ops(ipc_wwan->dev, &iosm_wwan_ops, ipc_wwan,
 			      IP_MUX_SESSION_DEFAULT)) {
+		mutex_destroy(&ipc_wwan->if_mutex);
 		kfree(ipc_wwan);
 		return NULL;
 	}
 
-	mutex_init(&ipc_wwan->if_mutex);
-
 	return ipc_wwan;
 }
 
-- 
cgit v1.2.3


From 3a4d061c699bd3eedc80dc97a4b2a2e1af83c6f5 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 2 Oct 2022 01:43:44 +0900
Subject: net/ieee802154: reject zero-sized raw_sendmsg()

syzbot is hitting skb_assert_len() warning at raw_sendmsg() for ieee802154
socket. What commit dc633700f00f726e ("net/af_packet: check len when
min_header_len equals to 0") does also applies to ieee802154 socket.

Link: https://syzkaller.appspot.com/bug?extid=5ea725c25d06fb9114c4
Reported-by: syzbot <syzbot+5ea725c25d06fb9114c4@syzkaller.appspotmail.com>
Fixes: fd1894224407c484 ("bpf: Don't redirect packets with invalid pkt_len")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/socket.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index 7889e1ef7fad..cbd0e2ac4ffe 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -251,6 +251,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 		return -EOPNOTSUPP;
 	}
 
+	if (!size)
+		return -EINVAL;
+
 	lock_sock(sk);
 	if (!sk->sk_bound_dev_if)
 		dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
-- 
cgit v1.2.3


From 62c07983bef9d3e78e71189441e1a470f0d1e653 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 1 Oct 2022 13:51:02 -0700
Subject: once: add DO_ONCE_SLOW() for sleepable contexts

Christophe Leroy reported a ~80ms latency spike
happening at first TCP connect() time.

This is because __inet_hash_connect() uses get_random_once()
to populate a perturbation table which became quite big
after commit 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16")

get_random_once() uses DO_ONCE(), which block hard irqs for the duration
of the operation.

This patch adds DO_ONCE_SLOW() which uses a mutex instead of a spinlock
for operations where we prefer to stay in process context.

Then __inet_hash_connect() can use get_random_slow_once()
to populate its perturbation table.

Fixes: 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16")
Fixes: 190cc82489f4 ("tcp: change source port randomizarion at connect() time")
Reported-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Link: https://lore.kernel.org/netdev/CANn89iLAEYBaoYajy0Y9UmGFff5GPxDUoG-ErVB2jDdRNQ5Tug@mail.gmail.com/T/#t
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Willy Tarreau <w@1wt.eu>
Tested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/once.h       | 28 ++++++++++++++++++++++++++++
 lib/once.c                 | 30 ++++++++++++++++++++++++++++++
 net/ipv4/inet_hashtables.c |  4 ++--
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/include/linux/once.h b/include/linux/once.h
index b14d8b309d52..176ab75b42df 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -5,10 +5,18 @@
 #include <linux/types.h>
 #include <linux/jump_label.h>
 
+/* Helpers used from arbitrary contexts.
+ * Hard irqs are blocked, be cautious.
+ */
 bool __do_once_start(bool *done, unsigned long *flags);
 void __do_once_done(bool *done, struct static_key_true *once_key,
 		    unsigned long *flags, struct module *mod);
 
+/* Variant for process contexts only. */
+bool __do_once_slow_start(bool *done);
+void __do_once_slow_done(bool *done, struct static_key_true *once_key,
+			 struct module *mod);
+
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
  * a function call such as initialization of random seeds, etc, only
  * once, where DO_ONCE() can live in the fast-path. After @func has
@@ -52,7 +60,27 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 		___ret;							     \
 	})
 
+/* Variant of DO_ONCE() for process/sleepable contexts. */
+#define DO_ONCE_SLOW(func, ...)						     \
+	({								     \
+		bool ___ret = false;					     \
+		static bool __section(".data.once") ___done = false;	     \
+		static DEFINE_STATIC_KEY_TRUE(___once_key);		     \
+		if (static_branch_unlikely(&___once_key)) {		     \
+			___ret = __do_once_slow_start(&___done);	     \
+			if (unlikely(___ret)) {				     \
+				func(__VA_ARGS__);			     \
+				__do_once_slow_done(&___done, &___once_key,  \
+						    THIS_MODULE);	     \
+			}						     \
+		}							     \
+		___ret;							     \
+	})
+
 #define get_random_once(buf, nbytes)					     \
 	DO_ONCE(get_random_bytes, (buf), (nbytes))
 
+#define get_random_slow_once(buf, nbytes)				     \
+	DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes))
+
 #endif /* _LINUX_ONCE_H */
diff --git a/lib/once.c b/lib/once.c
index 59149bf3bfb4..351f66aad310 100644
--- a/lib/once.c
+++ b/lib/once.c
@@ -66,3 +66,33 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 	once_disable_jump(once_key, mod);
 }
 EXPORT_SYMBOL(__do_once_done);
+
+static DEFINE_MUTEX(once_mutex);
+
+bool __do_once_slow_start(bool *done)
+	__acquires(once_mutex)
+{
+	mutex_lock(&once_mutex);
+	if (*done) {
+		mutex_unlock(&once_mutex);
+		/* Keep sparse happy by restoring an even lock count on
+		 * this mutex. In case we return here, we don't call into
+		 * __do_once_done but return early in the DO_ONCE_SLOW() macro.
+		 */
+		__acquire(once_mutex);
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(__do_once_slow_start);
+
+void __do_once_slow_done(bool *done, struct static_key_true *once_key,
+			 struct module *mod)
+	__releases(once_mutex)
+{
+	*done = true;
+	mutex_unlock(&once_mutex);
+	once_disable_jump(once_key, mod);
+}
+EXPORT_SYMBOL(__do_once_slow_done);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 49db8c597eea..dc1c5629cd0d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -958,8 +958,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	if (likely(remaining > 1))
 		remaining &= ~1U;
 
-	net_get_random_once(table_perturb,
-			    INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
+	get_random_slow_once(table_perturb,
+			     INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
 	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
 
 	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
-- 
cgit v1.2.3


From 736baae643cb1ccaeaf989ef8eb09ce085020479 Mon Sep 17 00:00:00 2001
From: Bagas Sanjaya <bagasdotme@gmail.com>
Date: Sun, 2 Oct 2022 10:20:23 +0700
Subject: Documentation: bpf: Add implementation notes documentations to table
 of contents

Sphinx reported warnings on missing implementation notes documentations in the
table of contents:

Documentation/bpf/clang-notes.rst: WARNING: document isn't included in any toctree
Documentation/bpf/linux-notes.rst: WARNING: document isn't included in any toctree

Add these documentations to the table of contents (index.rst) of BPF
documentation to fix the warnings.

Link: https://lore.kernel.org/linux-doc/202210020749.yfgDZbRL-lkp@intel.com/
Fixes: 6c7aaffb24efbd ("bpf, docs: Move Clang notes to a separate file")
Fixes: 6166da0a02cde2 ("bpf, docs: Move legacy packet instructions to a separate file")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://lore.kernel.org/r/20221002032022.24693-1-bagasdotme@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/index.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index 1bc2c5c58bdb..1b50de1983ee 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -26,6 +26,8 @@ that goes into great technical depth about the BPF Architecture.
    classic_vs_extended.rst
    bpf_licensing
    test_debug
+   clang-notes
+   linux-notes
    other
 
 .. only::  subproject and html
-- 
cgit v1.2.3


From 820dc0523e05c12810bb6bf4e56ce26e4c1948a2 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 30 Sep 2022 00:38:43 +0200
Subject: net: netfilter: move bpf_ct_set_nat_info kfunc in nf_nat_bpf.c

Remove circular dependency between nf_nat module and nf_conntrack one
moving bpf_ct_set_nat_info kfunc in nf_nat_bpf.c

Fixes: 0fabd2aa199f ("net: netfilter: add bpf_ct_set_nat_info kfunc helper")
Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Yauheni Kaliuta <ykaliuta@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/51a65513d2cda3eeb0754842e8025ab3966068d8.1664490511.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/netfilter/nf_conntrack_bpf.h | 19 ++++++++
 net/netfilter/Makefile                   |  6 +++
 net/netfilter/nf_conntrack_bpf.c         | 50 --------------------
 net/netfilter/nf_nat_bpf.c               | 79 ++++++++++++++++++++++++++++++++
 net/netfilter/nf_nat_core.c              |  4 +-
 5 files changed, 106 insertions(+), 52 deletions(-)
 create mode 100644 net/netfilter/nf_nat_bpf.c

diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h
index c8b80add1142..2d0da478c8e0 100644
--- a/include/net/netfilter/nf_conntrack_bpf.h
+++ b/include/net/netfilter/nf_conntrack_bpf.h
@@ -4,6 +4,11 @@
 #define _NF_CONNTRACK_BPF_H
 
 #include <linux/kconfig.h>
+#include <net/netfilter/nf_conntrack.h>
+
+struct nf_conn___init {
+	struct nf_conn ct;
+};
 
 #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
     (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
@@ -24,4 +29,18 @@ static inline void cleanup_nf_conntrack_bpf(void)
 
 #endif
 
+#if (IS_BUILTIN(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+    (IS_MODULE(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+extern int register_nf_nat_bpf(void);
+
+#else
+
+static inline int register_nf_nat_bpf(void)
+{
+	return 0;
+}
+
+#endif
+
 #endif /* _NF_CONNTRACK_BPF_H */
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 06df49ea6329..0f060d100880 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -60,6 +60,12 @@ obj-$(CONFIG_NF_NAT) += nf_nat.o
 nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o
 
+ifeq ($(CONFIG_NF_NAT),m)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o
+else ifeq ($(CONFIG_NF_NAT),y)
+nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o
+endif
+
 # NAT helpers
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
 obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 756ea818574e..8639e7efd0e2 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -14,10 +14,8 @@
 #include <linux/types.h>
 #include <linux/btf_ids.h>
 #include <linux/net_namespace.h>
-#include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_nat.h>
 
 /* bpf_ct_opts - Options for CT lookup helpers
  *
@@ -239,10 +237,6 @@ __diag_push();
 __diag_ignore_all("-Wmissing-prototypes",
 		  "Global functions as their definitions will be in nf_conntrack BTF");
 
-struct nf_conn___init {
-	struct nf_conn ct;
-};
-
 /* bpf_xdp_ct_alloc - Allocate a new CT entry
  *
  * Parameters:
@@ -476,49 +470,6 @@ int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
 	return nf_ct_change_status_common(nfct, status);
 }
 
-/* bpf_ct_set_nat_info - Set source or destination nat address
- *
- * Set source or destination nat address of the newly allocated
- * nf_conn before insertion. This must be invoked for referenced
- * PTR_TO_BTF_ID to nf_conn___init.
- *
- * Parameters:
- * @nfct	- Pointer to referenced nf_conn object, obtained using
- *		  bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
- * @addr	- Nat source/destination address
- * @port	- Nat source/destination port. Non-positive values are
- *		  interpreted as select a random port.
- * @manip	- NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
- */
-int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
-			union nf_inet_addr *addr, int port,
-			enum nf_nat_manip_type manip)
-{
-#if ((IS_MODULE(CONFIG_NF_NAT) && IS_MODULE(CONFIG_NF_CONNTRACK)) || \
-     IS_BUILTIN(CONFIG_NF_NAT))
-	struct nf_conn *ct = (struct nf_conn *)nfct;
-	u16 proto = nf_ct_l3num(ct);
-	struct nf_nat_range2 range;
-
-	if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6)
-		return -EINVAL;
-
-	memset(&range, 0, sizeof(struct nf_nat_range2));
-	range.flags = NF_NAT_RANGE_MAP_IPS;
-	range.min_addr = *addr;
-	range.max_addr = range.min_addr;
-	if (port > 0) {
-		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
-		range.min_proto.all = cpu_to_be16(port);
-		range.max_proto.all = range.min_proto.all;
-	}
-
-	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
-#else
-	return -EOPNOTSUPP;
-#endif
-}
-
 __diag_pop()
 
 BTF_SET8_START(nf_ct_kfunc_set)
@@ -532,7 +483,6 @@ BTF_ID_FLAGS(func, bpf_ct_set_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_timeout, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_set_status, KF_TRUSTED_ARGS)
 BTF_ID_FLAGS(func, bpf_ct_change_status, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
 BTF_SET8_END(nf_ct_kfunc_set)
 
 static const struct btf_kfunc_id_set nf_conntrack_kfunc_set = {
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
new file mode 100644
index 000000000000..0fa5a0bbb0ff
--- /dev/null
+++ b/net/netfilter/nf_nat_bpf.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable NAT Helpers for XDP and TC-BPF hook
+ *
+ * These are called from the XDP and SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_nat.h>
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in nf_nat BTF");
+
+/* bpf_ct_set_nat_info - Set source or destination nat address
+ *
+ * Set source or destination nat address of the newly allocated
+ * nf_conn before insertion. This must be invoked for referenced
+ * PTR_TO_BTF_ID to nf_conn___init.
+ *
+ * Parameters:
+ * @nfct	- Pointer to referenced nf_conn object, obtained using
+ *		  bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
+ * @addr	- Nat source/destination address
+ * @port	- Nat source/destination port. Non-positive values are
+ *		  interpreted as select a random port.
+ * @manip	- NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
+ */
+int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+			union nf_inet_addr *addr, int port,
+			enum nf_nat_manip_type manip)
+{
+	struct nf_conn *ct = (struct nf_conn *)nfct;
+	u16 proto = nf_ct_l3num(ct);
+	struct nf_nat_range2 range;
+
+	if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6)
+		return -EINVAL;
+
+	memset(&range, 0, sizeof(struct nf_nat_range2));
+	range.flags = NF_NAT_RANGE_MAP_IPS;
+	range.min_addr = *addr;
+	range.max_addr = range.min_addr;
+	if (port > 0) {
+		range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+		range.min_proto.all = cpu_to_be16(port);
+		range.max_proto.all = range.min_proto.all;
+	}
+
+	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
+}
+
+__diag_pop()
+
+BTF_SET8_START(nf_nat_kfunc_set)
+BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS)
+BTF_SET8_END(nf_nat_kfunc_set)
+
+static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &nf_nat_kfunc_set,
+};
+
+int register_nf_nat_bpf(void)
+{
+	int ret;
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+					&nf_bpf_nat_kfunc_set);
+	if (ret)
+		return ret;
+
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+					 &nf_bpf_nat_kfunc_set);
+}
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 7981be526f26..d8e6380f6337 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -16,7 +16,7 @@
 #include <linux/siphash.h>
 #include <linux/rtnetlink.h>
 
-#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
@@ -1152,7 +1152,7 @@ static int __init nf_nat_init(void)
 	WARN_ON(nf_nat_hook != NULL);
 	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
 
-	return 0;
+	return register_nf_nat_bpf();
 }
 
 static void __exit nf_nat_cleanup(void)
-- 
cgit v1.2.3


From 450a580fc4b5e7f7fb8d9b1a0208bf0d1efc53a8 Mon Sep 17 00:00:00 2001
From: Nathan Huckleberry <nhuck@google.com>
Date: Thu, 29 Sep 2022 11:27:03 -0700
Subject: net: lan966x: Fix return type of lan966x_port_xmit

The ndo_start_xmit field in net_device_ops is expected to be of type
netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev).

The mismatched return type breaks forward edge kCFI since the underlying
function definition does not match the function hook definition.

The return type of lan966x_port_xmit should be changed from int to
netdev_tx_t.

Reported-by: Dan Carpenter <error27@gmail.com>
Link: https://github.com/ClangBuiltLinux/linux/issues/1703
Cc: llvm@lists.linux.dev
Signed-off-by: Nathan Huckleberry <nhuck@google.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20220929182704.64438-1-nhuck@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index b98d37c76edb..be2fd030cccb 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -344,7 +344,8 @@ static void lan966x_ifh_set_timestamp(void *ifh, u64 timestamp)
 		IFH_POS_TIMESTAMP, IFH_LEN * 4, PACK, 0);
 }
 
-static int lan966x_port_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t lan966x_port_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
 {
 	struct lan966x_port *port = netdev_priv(dev);
 	struct lan966x *lan966x = port->lan966x;
-- 
cgit v1.2.3


From 93e2be344a7db169b7119de21ac1bf253b8c6907 Mon Sep 17 00:00:00 2001
From: Andrew Gaul <gaul@gaul.org>
Date: Sun, 2 Oct 2022 12:41:28 +0900
Subject: r8152: Rate limit overflow messages

My system shows almost 10 million of these messages over a 24-hour
period which pollutes my logs.

Signed-off-by: Andrew Gaul <gaul@google.com>
Link: https://lore.kernel.org/r/20221002034128.2026653-1-gaul@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/r8152.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 688905ea0a6d..e7b0b59e2bc8 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -1874,7 +1874,9 @@ static void intr_callback(struct urb *urb)
 			   "Stop submitting intr, status %d\n", status);
 		return;
 	case -EOVERFLOW:
-		netif_info(tp, intr, tp->netdev, "intr status -EOVERFLOW\n");
+		if (net_ratelimit())
+			netif_info(tp, intr, tp->netdev,
+				   "intr status -EOVERFLOW\n");
 		goto resubmit;
 	/* -EPIPE:  should clear the halt */
 	default:
-- 
cgit v1.2.3


From ace5dc61620ba1592bf9e24da4c290a357830b8c Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Fri, 30 Sep 2022 17:45:27 -0500
Subject: net: ipa: update comments

This patch just updates comments throughout the IPA code.

Transaction state is now tracked using indexes into an array rather
than linked lists, and a few comments refer to the "old way" of
doing things.  The description of how transactions are used was
changed to refer to "operations" rather than "commands", to
(hopefully) remove a possible ambiguity.

IPA register offsets and fields are now handled differently as well,
and the register documentation is updated to better describe the
code.

A few minor updates to comments were made (e.g., adding a missing
word, fixing a typo or punctuation, etc.).

Finally, the local macro atomic_dec_not_zero() is no longer used, so
it is deleted.

Signed-off-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20220930224527.3503404-1-elder@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ipa/gsi.c          | 24 +++++++-------
 drivers/net/ipa/gsi_private.h  |  8 ++---
 drivers/net/ipa/gsi_trans.c    | 56 ++++++++++++++++----------------
 drivers/net/ipa/gsi_trans.h    |  2 +-
 drivers/net/ipa/ipa_cmd.c      |  2 +-
 drivers/net/ipa/ipa_data.h     |  2 +-
 drivers/net/ipa/ipa_endpoint.c |  2 --
 drivers/net/ipa/ipa_reg.h      | 73 +++++++++++++++++-------------------------
 8 files changed, 75 insertions(+), 94 deletions(-)

diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c
index f8036ee78647..6faa358c53fe 100644
--- a/drivers/net/ipa/gsi.c
+++ b/drivers/net/ipa/gsi.c
@@ -56,9 +56,9 @@
  * element can also contain an immediate command, requesting the IPA perform
  * actions other than data transfer.
  *
- * Each TRE refers to a block of data--also located DRAM.  After writing one
- * or more TREs to a channel, the writer (either the IPA or an EE) writes a
- * doorbell register to inform the receiving side how many elements have
+ * Each TRE refers to a block of data--also located in DRAM.  After writing
+ * one or more TREs to a channel, the writer (either the IPA or an EE) writes
+ * a doorbell register to inform the receiving side how many elements have
  * been written.
  *
  * Each channel has a GSI "event ring" associated with it.  An event ring
@@ -1347,8 +1347,8 @@ gsi_event_trans(struct gsi *gsi, struct gsi_event *event)
  * we update transactions to record their actual received lengths.
  *
  * When an event for a TX channel arrives we use information in the
- * transaction to report the number of requests and bytes have been
- * transferred.
+ * transaction to report the number of requests and bytes that have
+ * been transferred.
  *
  * This function is called whenever we learn that the GSI hardware has filled
  * new events since the last time we checked.  The ring's index field tells
@@ -1474,7 +1474,7 @@ void gsi_channel_doorbell(struct gsi_channel *channel)
 	iowrite32(val, gsi->virt + GSI_CH_C_DOORBELL_0_OFFSET(channel_id));
 }
 
-/* Consult hardware, move any newly completed transactions to completed list */
+/* Consult hardware, move newly completed transactions to completed state */
 void gsi_channel_update(struct gsi_channel *channel)
 {
 	u32 evt_ring_id = channel->evt_ring_id;
@@ -1515,17 +1515,17 @@ void gsi_channel_update(struct gsi_channel *channel)
  *
  * Return:	Transaction pointer, or null if none are available
  *
- * This function returns the first entry on a channel's completed transaction
- * list.  If that list is empty, the hardware is consulted to determine
- * whether any new transactions have completed.  If so, they're moved to the
- * completed list and the new first entry is returned.  If there are no more
- * completed transactions, a null pointer is returned.
+ * This function returns the first of a channel's completed transactions.
+ * If no transactions are in completed state, the hardware is consulted to
+ * determine whether any new transactions have completed.  If so, they're
+ * moved to completed state and the first such transaction is returned.
+ * If there are no more completed transactions, a null pointer is returned.
  */
 static struct gsi_trans *gsi_channel_poll_one(struct gsi_channel *channel)
 {
 	struct gsi_trans *trans;
 
-	/* Get the first transaction from the completed list */
+	/* Get the first completed transaction */
 	trans = gsi_channel_trans_complete(channel);
 	if (trans)
 		gsi_trans_move_polled(trans);
diff --git a/drivers/net/ipa/gsi_private.h b/drivers/net/ipa/gsi_private.h
index af4cc13864e2..6a73dae764d4 100644
--- a/drivers/net/ipa/gsi_private.h
+++ b/drivers/net/ipa/gsi_private.h
@@ -18,13 +18,13 @@ struct gsi_channel;
 
 /**
  * gsi_trans_move_complete() - Mark a GSI transaction completed
- * @trans:	Transaction to commit
+ * @trans:	Transaction whose state is to be updated
  */
 void gsi_trans_move_complete(struct gsi_trans *trans);
 
 /**
  * gsi_trans_move_polled() - Mark a transaction polled
- * @trans:	Transaction to update
+ * @trans:	Transaction whose state is to be updated
  */
 void gsi_trans_move_polled(struct gsi_trans *trans);
 
@@ -97,8 +97,8 @@ void gsi_channel_doorbell(struct gsi_channel *channel);
 /* gsi_channel_update() - Update knowledge of channel hardware state
  * @channel:	Channel to be updated
  *
- * Consult hardware, move any newly completed transactions to a
- * channel's completed list.
+ * Consult hardware, change the state of any newly-completed transactions
+ * on a channel.
  */
 void gsi_channel_update(struct gsi_channel *channel);
 
diff --git a/drivers/net/ipa/gsi_trans.c b/drivers/net/ipa/gsi_trans.c
index 03e54fc4376a..c791e32161b9 100644
--- a/drivers/net/ipa/gsi_trans.c
+++ b/drivers/net/ipa/gsi_trans.c
@@ -22,37 +22,36 @@
  * DOC: GSI Transactions
  *
  * A GSI transaction abstracts the behavior of a GSI channel by representing
- * everything about a related group of IPA commands in a single structure.
- * (A "command" in this sense is either a data transfer or an IPA immediate
+ * everything about a related group of IPA operations in a single structure.
+ * (A "operation" in this sense is either a data transfer or an IPA immediate
  * command.)  Most details of interaction with the GSI hardware are managed
- * by the GSI transaction core, allowing users to simply describe commands
+ * by the GSI transaction core, allowing users to simply describe operations
  * to be performed.  When a transaction has completed a callback function
  * (dependent on the type of endpoint associated with the channel) allows
  * cleanup of resources associated with the transaction.
  *
- * To perform a command (or set of them), a user of the GSI transaction
+ * To perform an operation (or set of them), a user of the GSI transaction
  * interface allocates a transaction, indicating the number of TREs required
- * (one per command).  If sufficient TREs are available, they are reserved
+ * (one per operation).  If sufficient TREs are available, they are reserved
  * for use in the transaction and the allocation succeeds.  This way
- * exhaustion of the available TREs in a channel ring is detected
- * as early as possible.  All resources required to complete a transaction
- * are allocated at transaction allocation time.
+ * exhaustion of the available TREs in a channel ring is detected as early
+ * as possible.  Any other resources that might be needed to complete a
+ * transaction are also allocated when the transaction is allocated.
  *
- * Commands performed as part of a transaction are represented in an array
- * of Linux scatterlist structures.  This array is allocated with the
- * transaction, and its entries are initialized using standard scatterlist
- * functions (such as sg_set_buf() or skb_to_sgvec()).
+ * Operations performed as part of a transaction are represented in an array
+ * of Linux scatterlist structures, allocated with the transaction.  These
+ * scatterlist structures are initialized by "adding" operations to the
+ * transaction.  If a buffer in an operation must be mapped for DMA, this is
+ * done at the time it is added to the transaction.  It is possible for a
+ * mapping error to occur when an operation is added.  In this case the
+ * transaction should simply be freed; this correctly releases resources
+ * associated with the transaction.
  *
- * Once a transaction's scatterlist structures have been initialized, the
- * transaction is committed.  The caller is responsible for mapping buffers
- * for DMA if necessary, and this should be done *before* allocating
- * the transaction.  Between a successful allocation and commit of a
- * transaction no errors should occur.
- *
- * Committing transfers ownership of the entire transaction to the GSI
- * transaction core.  The GSI transaction code formats the content of
- * the scatterlist array into the channel ring buffer and informs the
- * hardware that new TREs are available to process.
+ * Once all operations have been successfully added to a transaction, the
+ * transaction is committed.  Committing transfers ownership of the entire
+ * transaction to the GSI transaction core.  The GSI transaction code
+ * formats the content of the scatterlist array into the channel ring
+ * buffer and informs the hardware that new TREs are available to process.
  *
  * The last TRE in each transaction is marked to interrupt the AP when the
  * GSI hardware has completed it.  Because transfers described by TREs are
@@ -125,11 +124,10 @@ void gsi_trans_pool_exit(struct gsi_trans_pool *pool)
 	memset(pool, 0, sizeof(*pool));
 }
 
-/* Allocate the requested number of (zeroed) entries from the pool */
-/* Home-grown DMA pool.  This way we can preallocate and use the tre_count
- * to guarantee allocations will succeed.  Even though we specify max_alloc
- * (and it can be more than one), we only allow allocation of a single
- * element from a DMA pool.
+/* Home-grown DMA pool.  This way we can preallocate the pool, and guarantee
+ * allocations will succeed.  The immediate commands in a transaction can
+ * require up to max_alloc elements from the pool.  But we only allow
+ * allocation of a single element from a DMA pool at a time.
  */
 int gsi_trans_pool_init_dma(struct device *dev, struct gsi_trans_pool *pool,
 			    size_t size, u32 count, u32 max_alloc)
@@ -537,8 +535,8 @@ static void gsi_trans_tre_fill(struct gsi_tre *dest_tre, dma_addr_t addr,
  *
  * Formats channel ring TRE entries based on the content of the scatterlist.
  * Maps a transaction pointer to the last ring entry used for the transaction,
- * so it can be recovered when it completes.  Moves the transaction to the
- * pending list.  Finally, updates the channel ring pointer and optionally
+ * so it can be recovered when it completes.  Moves the transaction to
+ * pending state.  Finally, updates the channel ring pointer and optionally
  * rings the doorbell.
  */
 static void __gsi_trans_commit(struct gsi_trans *trans, bool ring_db)
diff --git a/drivers/net/ipa/gsi_trans.h b/drivers/net/ipa/gsi_trans.h
index af8c4c6719d1..18c4ba263679 100644
--- a/drivers/net/ipa/gsi_trans.h
+++ b/drivers/net/ipa/gsi_trans.h
@@ -74,7 +74,7 @@ struct gsi_trans {
 
 /**
  * gsi_trans_pool_init() - Initialize a pool of structures for transactions
- * @pool:	GSI transaction poll pointer
+ * @pool:	GSI transaction pool pointer
  * @size:	Size of elements in the pool
  * @count:	Minimum number of elements in the pool
  * @max_alloc:	Maximum number of elements allocated at a time from pool
diff --git a/drivers/net/ipa/ipa_cmd.c b/drivers/net/ipa/ipa_cmd.c
index f762d7d5f31f..87014bbcf971 100644
--- a/drivers/net/ipa/ipa_cmd.c
+++ b/drivers/net/ipa/ipa_cmd.c
@@ -32,7 +32,7 @@
  * immediate command's opcode.  The payload for a command resides in AP
  * memory and is described by a single scatterlist entry in its transaction.
  * Commands do not require a transaction completion callback, and are
- * (currently) always issued using gsi_trans_commit_wait().
+ * always issued using gsi_trans_commit_wait().
  */
 
 /* Some commands can wait until indicated pipeline stages are clear */
diff --git a/drivers/net/ipa/ipa_data.h b/drivers/net/ipa/ipa_data.h
index e15eb3cd3e33..e239bcca833d 100644
--- a/drivers/net/ipa/ipa_data.h
+++ b/drivers/net/ipa/ipa_data.h
@@ -31,7 +31,7 @@
  * communication path between the IPA and a particular execution environment
  * (EE), such as the AP or Modem.  Each EE has a set of channels associated
  * with it, and each channel has an ID unique for that EE.  For the most part
- * the only GSI channels of concern to this driver belong to the AP
+ * the only GSI channels of concern to this driver belong to the AP.
  *
  * An endpoint is an IPA construct representing a single channel anywhere
  * in the system.  An IPA endpoint ID maps directly to an (EE, channel_id)
diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index 0da02d8d238d..a09f323a7e9f 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -23,8 +23,6 @@
 #include "ipa_gsi.h"
 #include "ipa_power.h"
 
-#define atomic_dec_not_zero(v)	atomic_add_unless((v), -1, 0)
-
 /* Hardware is told about receive buffers once a "batch" has been queued */
 #define IPA_REPLENISH_BATCH	16		/* Must be non-zero */
 
diff --git a/drivers/net/ipa/ipa_reg.h b/drivers/net/ipa/ipa_reg.h
index f81381891a2e..3c768c9d3d10 100644
--- a/drivers/net/ipa/ipa_reg.h
+++ b/drivers/net/ipa/ipa_reg.h
@@ -17,53 +17,38 @@ struct ipa;
  * DOC: IPA Registers
  *
  * IPA registers are located within the "ipa-reg" address space defined by
- * Device Tree.  The offset of each register within that space is specified
- * by symbols defined below.  The address space is mapped to virtual memory
- * space in ipa_mem_init().  All IPA registers are 32 bits wide.
+ * Device Tree.  Each register has a specified offset within that space,
+ * which is mapped into virtual memory space in ipa_mem_init().  Each
+ * has a unique identifer, taken from the ipa_reg_id enumerated type.
+ * All IPA registers are 32 bits wide.
  *
- * Certain register types are duplicated for a number of instances of
- * something.  For example, each IPA endpoint has an set of registers
- * defining its configuration.  The offset to an endpoint's set of registers
- * is computed based on an "base" offset, plus an endpoint's ID multiplied
- * and a "stride" value for the register.  For such registers, the offset is
- * computed by a function-like macro that takes a parameter used in the
- * computation.
+ * Certain "parameterized" register types are duplicated for a number of
+ * instances of something.  For example, each IPA endpoint has an set of
+ * registers defining its configuration.  The offset to an endpoint's set
+ * of registers is computed based on an "base" offset, plus an endpoint's
+ * ID multiplied and a "stride" value for the register.  Similarly, some
+ * registers have an offset that depends on execution environment.  In
+ * this case, the stride is multiplied by a member of the gsi_ee_id
+ * enumerated type.
  *
- * Some register offsets depend on execution environment.  For these an "ee"
- * parameter is supplied to the offset macro.  The "ee" value is a member of
- * the gsi_ee enumerated type.
+ * Each version of IPA implements an array of ipa_reg structures indexed
+ * by register ID.  Each entry in the array specifies the base offset and
+ * (for parameterized registers) a non-zero stride value.  Not all versions
+ * of IPA define all registers.  The offset for a register is returned by
+ * ipa_reg_offset() when the register's ipa_reg structure is supplied;
+ * zero is returned for an undefined register (this should never happen).
  *
- * The offset of a register dependent on endpoint ID is computed by a macro
- * that is supplied a parameter "ep", "txep", or "rxep".  A register with an
- * "ep" parameter is valid for any endpoint; a register with a "txep" or
- * "rxep" parameter is valid only for TX or RX endpoints, respectively.  The
- * "*ep" value is assumed to be less than the maximum valid endpoint ID
- * for the current hardware, and that will not exceed IPA_ENDPOINT_MAX.
- *
- * The offset of registers related to filter and route tables is computed
- * by a macro that is supplied a parameter "er".  The "er" represents an
- * endpoint ID for filters, or a route ID for routes.  For filters, the
- * endpoint ID must be less than IPA_ENDPOINT_MAX, but is further restricted
- * because not all endpoints support filtering.  For routes, the route ID
- * must be less than IPA_ROUTE_MAX.
- *
- * The offset of registers related to resource types is computed by a macro
- * that is supplied a parameter "rt".  The "rt" represents a resource type,
- * which is a member of the ipa_resource_type_src enumerated type for
- * source endpoint resources or the ipa_resource_type_dst enumerated type
- * for destination endpoint resources.
- *
- * Some registers encode multiple fields within them.  For these, each field
- * has a symbol below defining a field mask that encodes both the position
- * and width of the field within its register.
- *
- * In some cases, different versions of IPA hardware use different offset or
- * field mask values.  In such cases an inline_function(ipa) is used rather
- * than a MACRO to define the offset or field mask to use.
- *
- * Finally, some registers hold bitmasks representing endpoints.  In such
- * cases the @available field in the @ipa structure defines the "full" set
- * of valid bits for the register.
+ * Some registers encode multiple fields within them.  Each field in
+ * such a register has a unique identifier (from an enumerated type).
+ * The position and width of the fields in a register are defined by
+ * an array of field masks, indexed by field ID.  Two functions are
+ * used to access register fields; both take an ipa_reg structure as
+ * argument.  To encode a value to be represented in a register field,
+ * the value and field ID are passed to ipa_reg_encode().  To extract
+ * a value encoded in a register field, the field ID is passed to
+ * ipa_reg_decode().  In addition, for single-bit fields, ipa_reg_bit()
+ * can be used to either encode the bit value, or to generate a mask
+ * used to extract the bit value.
  */
 
 /* enum ipa_reg_id - IPA register IDs */
-- 
cgit v1.2.3


From a4388da51ad56366d330144975e50f162a10bd8b Mon Sep 17 00:00:00 2001
From: Alex Elder <elder@linaro.org>
Date: Fri, 30 Sep 2022 17:45:49 -0500
Subject: net: ipa: update copyrights

Some source files state copyright dates that are earlier than the
last modification of the file.  Change the copyright year to 2022 in
all such cases.

Signed-off-by: Alex Elder <elder@linaro.org>
Link: https://lore.kernel.org/r/20220930224549.3503434-1-elder@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ipa/gsi.c           | 2 +-
 drivers/net/ipa/gsi.h           | 2 +-
 drivers/net/ipa/gsi_private.h   | 2 +-
 drivers/net/ipa/gsi_reg.h       | 2 +-
 drivers/net/ipa/gsi_trans.c     | 2 +-
 drivers/net/ipa/gsi_trans.h     | 2 +-
 drivers/net/ipa/ipa.h           | 2 +-
 drivers/net/ipa/ipa_cmd.c       | 2 +-
 drivers/net/ipa/ipa_cmd.h       | 2 +-
 drivers/net/ipa/ipa_data.h      | 2 +-
 drivers/net/ipa/ipa_endpoint.c  | 2 +-
 drivers/net/ipa/ipa_endpoint.h  | 2 +-
 drivers/net/ipa/ipa_interrupt.c | 2 +-
 drivers/net/ipa/ipa_interrupt.h | 2 +-
 drivers/net/ipa/ipa_main.c      | 2 +-
 drivers/net/ipa/ipa_mem.c       | 2 +-
 drivers/net/ipa/ipa_modem.c     | 2 +-
 drivers/net/ipa/ipa_modem.h     | 2 +-
 drivers/net/ipa/ipa_power.c     | 2 +-
 drivers/net/ipa/ipa_power.h     | 2 +-
 drivers/net/ipa/ipa_qmi.c       | 2 +-
 drivers/net/ipa/ipa_qmi.h       | 2 +-
 drivers/net/ipa/ipa_qmi_msg.c   | 2 +-
 drivers/net/ipa/ipa_qmi_msg.h   | 2 +-
 drivers/net/ipa/ipa_reg.c       | 2 +-
 drivers/net/ipa/ipa_reg.h       | 2 +-
 drivers/net/ipa/ipa_resource.c  | 2 +-
 drivers/net/ipa/ipa_smp2p.c     | 2 +-
 drivers/net/ipa/ipa_smp2p.h     | 2 +-
 drivers/net/ipa/ipa_sysfs.c     | 2 +-
 drivers/net/ipa/ipa_sysfs.h     | 2 +-
 drivers/net/ipa/ipa_table.c     | 2 +-
 drivers/net/ipa/ipa_table.h     | 2 +-
 drivers/net/ipa/ipa_uc.c        | 2 +-
 drivers/net/ipa/ipa_uc.h        | 2 +-
 drivers/net/ipa/ipa_version.h   | 2 +-
 36 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ipa/gsi.c b/drivers/net/ipa/gsi.c
index 6faa358c53fe..bea2da1c4c51 100644
--- a/drivers/net/ipa/gsi.c
+++ b/drivers/net/ipa/gsi.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/gsi.h b/drivers/net/ipa/gsi.h
index 0fc25a6ae006..49dcadba4e0b 100644
--- a/drivers/net/ipa/gsi.h
+++ b/drivers/net/ipa/gsi.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _GSI_H_
 #define _GSI_H_
diff --git a/drivers/net/ipa/gsi_private.h b/drivers/net/ipa/gsi_private.h
index 6a73dae764d4..c65f7c5cdc8d 100644
--- a/drivers/net/ipa/gsi_private.h
+++ b/drivers/net/ipa/gsi_private.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _GSI_PRIVATE_H_
 #define _GSI_PRIVATE_H_
diff --git a/drivers/net/ipa/gsi_reg.h b/drivers/net/ipa/gsi_reg.h
index b36fd10a57d6..3763359f208f 100644
--- a/drivers/net/ipa/gsi_reg.h
+++ b/drivers/net/ipa/gsi_reg.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _GSI_REG_H_
 #define _GSI_REG_H_
diff --git a/drivers/net/ipa/gsi_trans.c b/drivers/net/ipa/gsi_trans.c
index c791e32161b9..26b7f683a3e1 100644
--- a/drivers/net/ipa/gsi_trans.c
+++ b/drivers/net/ipa/gsi_trans.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/gsi_trans.h b/drivers/net/ipa/gsi_trans.h
index 18c4ba263679..30c1c2dc77c6 100644
--- a/drivers/net/ipa/gsi_trans.h
+++ b/drivers/net/ipa/gsi_trans.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _GSI_TRANS_H_
 #define _GSI_TRANS_H_
diff --git a/drivers/net/ipa/ipa.h b/drivers/net/ipa/ipa.h
index 349643cf2b44..09ead433ec38 100644
--- a/drivers/net/ipa/ipa.h
+++ b/drivers/net/ipa/ipa.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_H_
 #define _IPA_H_
diff --git a/drivers/net/ipa/ipa_cmd.c b/drivers/net/ipa/ipa_cmd.c
index 87014bbcf971..26c3db9f52b1 100644
--- a/drivers/net/ipa/ipa_cmd.c
+++ b/drivers/net/ipa/ipa_cmd.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2021 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_cmd.h b/drivers/net/ipa/ipa_cmd.h
index 9215ddad1010..8e4243c1f0bb 100644
--- a/drivers/net/ipa/ipa_cmd.h
+++ b/drivers/net/ipa/ipa_cmd.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_CMD_H_
 #define _IPA_CMD_H_
diff --git a/drivers/net/ipa/ipa_data.h b/drivers/net/ipa/ipa_data.h
index e239bcca833d..e5a6ce75c7dd 100644
--- a/drivers/net/ipa/ipa_data.h
+++ b/drivers/net/ipa/ipa_data.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2021 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_DATA_H_
 #define _IPA_DATA_H_
diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c
index a09f323a7e9f..093e11ec7c2d 100644
--- a/drivers/net/ipa/ipa_endpoint.c
+++ b/drivers/net/ipa/ipa_endpoint.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2021 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_endpoint.h b/drivers/net/ipa/ipa_endpoint.h
index 28e0a7386fd7..d8dfa24f5214 100644
--- a/drivers/net/ipa/ipa_endpoint.h
+++ b/drivers/net/ipa/ipa_endpoint.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_ENDPOINT_H_
 #define _IPA_ENDPOINT_H_
diff --git a/drivers/net/ipa/ipa_interrupt.c b/drivers/net/ipa/ipa_interrupt.c
index d0142b17a275..c269432f9c2e 100644
--- a/drivers/net/ipa/ipa_interrupt.c
+++ b/drivers/net/ipa/ipa_interrupt.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2014-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 /* DOC: IPA Interrupts
diff --git a/drivers/net/ipa/ipa_interrupt.h b/drivers/net/ipa/ipa_interrupt.h
index 231390cea52a..f31fd9965fdc 100644
--- a/drivers/net/ipa/ipa_interrupt.h
+++ b/drivers/net/ipa/ipa_interrupt.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_INTERRUPT_H_
 #define _IPA_INTERRUPT_H_
diff --git a/drivers/net/ipa/ipa_main.c b/drivers/net/ipa/ipa_main.c
index a0f6212aa3c3..3461ad3029ab 100644
--- a/drivers/net/ipa/ipa_main.c
+++ b/drivers/net/ipa/ipa_main.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_mem.c b/drivers/net/ipa/ipa_mem.c
index 9abf473be1dd..f84c6830495a 100644
--- a/drivers/net/ipa/ipa_mem.c
+++ b/drivers/net/ipa/ipa_mem.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2021 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_modem.c b/drivers/net/ipa/ipa_modem.c
index c8b1c4d9c507..423422a2a445 100644
--- a/drivers/net/ipa/ipa_modem.c
+++ b/drivers/net/ipa/ipa_modem.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2014-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/errno.h>
diff --git a/drivers/net/ipa/ipa_modem.h b/drivers/net/ipa/ipa_modem.h
index e64ccc2402e9..d85718db9a57 100644
--- a/drivers/net/ipa/ipa_modem.h
+++ b/drivers/net/ipa/ipa_modem.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_MODEM_H_
 #define _IPA_MODEM_H_
diff --git a/drivers/net/ipa/ipa_power.c b/drivers/net/ipa/ipa_power.c
index db5ac7552286..8420f93128a2 100644
--- a/drivers/net/ipa/ipa_power.c
+++ b/drivers/net/ipa/ipa_power.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/clk.h>
diff --git a/drivers/net/ipa/ipa_power.h b/drivers/net/ipa/ipa_power.h
index 6f84f057a209..896f052e51a1 100644
--- a/drivers/net/ipa/ipa_power.h
+++ b/drivers/net/ipa/ipa_power.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_POWER_H_
 #define _IPA_POWER_H_
diff --git a/drivers/net/ipa/ipa_qmi.c b/drivers/net/ipa/ipa_qmi.c
index 6f874f99b910..8295fd4b70d1 100644
--- a/drivers/net/ipa/ipa_qmi.c
+++ b/drivers/net/ipa/ipa_qmi.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2013-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_qmi.h b/drivers/net/ipa/ipa_qmi.h
index 856ef629ccc8..1c236826c17a 100644
--- a/drivers/net/ipa/ipa_qmi.h
+++ b/drivers/net/ipa/ipa_qmi.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_QMI_H_
 #define _IPA_QMI_H_
diff --git a/drivers/net/ipa/ipa_qmi_msg.c b/drivers/net/ipa/ipa_qmi_msg.c
index 75d3fc0092e9..97c0befe8d86 100644
--- a/drivers/net/ipa/ipa_qmi_msg.c
+++ b/drivers/net/ipa/ipa_qmi_msg.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #include <linux/stddef.h>
 #include <linux/soc/qcom/qmi.h>
diff --git a/drivers/net/ipa/ipa_qmi_msg.h b/drivers/net/ipa/ipa_qmi_msg.h
index 9651aa59b596..e29663965f43 100644
--- a/drivers/net/ipa/ipa_qmi_msg.h
+++ b/drivers/net/ipa/ipa_qmi_msg.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_QMI_MSG_H_
 #define _IPA_QMI_MSG_H_
diff --git a/drivers/net/ipa/ipa_reg.c b/drivers/net/ipa/ipa_reg.c
index fb4663bcf14b..22f067741d9b 100644
--- a/drivers/net/ipa/ipa_reg.c
+++ b/drivers/net/ipa/ipa_reg.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 
 #include <linux/io.h>
diff --git a/drivers/net/ipa/ipa_reg.h b/drivers/net/ipa/ipa_reg.h
index 3c768c9d3d10..7bf70f70f63f 100644
--- a/drivers/net/ipa/ipa_reg.h
+++ b/drivers/net/ipa/ipa_reg.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 #ifndef _IPA_REG_H_
 #define _IPA_REG_H_
diff --git a/drivers/net/ipa/ipa_resource.c b/drivers/net/ipa/ipa_resource.c
index 5376b71f4598..a257f0e5e361 100644
--- a/drivers/net/ipa/ipa_resource.c
+++ b/drivers/net/ipa/ipa_resource.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_smp2p.c b/drivers/net/ipa/ipa_smp2p.c
index 211233612039..5620dc271fac 100644
--- a/drivers/net/ipa/ipa_smp2p.c
+++ b/drivers/net/ipa/ipa_smp2p.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_smp2p.h b/drivers/net/ipa/ipa_smp2p.h
index 59cee31a7383..9b969b03d1a4 100644
--- a/drivers/net/ipa/ipa_smp2p.h
+++ b/drivers/net/ipa/ipa_smp2p.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_SMP2P_H_
 #define _IPA_SMP2P_H_
diff --git a/drivers/net/ipa/ipa_sysfs.c b/drivers/net/ipa/ipa_sysfs.c
index c0c8641cdd14..5cbc15a971f9 100644
--- a/drivers/net/ipa/ipa_sysfs.c
+++ b/drivers/net/ipa/ipa_sysfs.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 
-/* Copyright (C) 2021 Linaro Ltd. */
+/* Copyright (C) 2021-2022 Linaro Ltd. */
 
 #include <linux/kernel.h>
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_sysfs.h b/drivers/net/ipa/ipa_sysfs.h
index 4a3ffd1e4e3f..58ba22810bab 100644
--- a/drivers/net/ipa/ipa_sysfs.h
+++ b/drivers/net/ipa/ipa_sysfs.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2021 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_SYSFS_H_
 #define _IPA_SYSFS_H_
diff --git a/drivers/net/ipa/ipa_table.c b/drivers/net/ipa/ipa_table.c
index 02cab1b59f21..510ff2dc8999 100644
--- a/drivers/net/ipa/ipa_table.c
+++ b/drivers/net/ipa/ipa_table.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2021 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_table.h b/drivers/net/ipa/ipa_table.h
index 1538e2e1732f..395189f75d78 100644
--- a/drivers/net/ipa/ipa_table.h
+++ b/drivers/net/ipa/ipa_table.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2021 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_TABLE_H_
 #define _IPA_TABLE_H_
diff --git a/drivers/net/ipa/ipa_uc.c b/drivers/net/ipa/ipa_uc.c
index cf21f1a87a88..f0ee47281015 100644
--- a/drivers/net/ipa/ipa_uc.c
+++ b/drivers/net/ipa/ipa_uc.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2018-2020 Linaro Ltd.
+ * Copyright (C) 2018-2022 Linaro Ltd.
  */
 
 #include <linux/types.h>
diff --git a/drivers/net/ipa/ipa_uc.h b/drivers/net/ipa/ipa_uc.h
index 23847f934d64..8514096e6f36 100644
--- a/drivers/net/ipa/ipa_uc.h
+++ b/drivers/net/ipa/ipa_uc.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_UC_H_
 #define _IPA_UC_H_
diff --git a/drivers/net/ipa/ipa_version.h b/drivers/net/ipa/ipa_version.h
index 58f7b43b4db3..7870e0cc3d7c 100644
--- a/drivers/net/ipa/ipa_version.h
+++ b/drivers/net/ipa/ipa_version.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 
 /* Copyright (c) 2012-2018, The Linux Foundation. All rights reserved.
- * Copyright (C) 2019-2020 Linaro Ltd.
+ * Copyright (C) 2019-2022 Linaro Ltd.
  */
 #ifndef _IPA_VERSION_H_
 #define _IPA_VERSION_H_
-- 
cgit v1.2.3


From 0152dfee235e87660f52a117fc9f70dc55956bb4 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 3 Oct 2022 17:19:27 +0100
Subject: net: mvpp2: fix mvpp2 debugfs leak

When mvpp2 is unloaded, the driver specific debugfs directory is not
removed, which technically leads to a memory leak. However, this
directory is only created when the first device is probed, so the
hardware is present. Removing the module is only something a developer
would to when e.g. testing out changes, so the module would be
reloaded. So this memory leak is minor.

The original attempt in commit fe2c9c61f668 ("net: mvpp2: debugfs: fix
memory leak when using debugfs_lookup()") that was labelled as a memory
leak fix was not, it fixed a refcount leak, but in doing so created a
problem when the module is reloaded - the directory already exists, but
mvpp2_root is NULL, so we lose all debugfs entries. This fix has been
reverted.

This is the alternative fix, where we remove the offending directory
whenever the driver is unloaded.

Fixes: 21da57a23125 ("net: mvpp2: add a debugfs interface for the Header Parser")
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Marcin Wojtas <mw@semihalf.com>
Link: https://lore.kernel.org/r/E1ofOAB-00CzkG-UO@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h         |  1 +
 drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c | 10 ++++++++--
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c    | 13 ++++++++++++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index ad73a488fc5f..11e603686a27 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -1530,6 +1530,7 @@ u32 mvpp2_read(struct mvpp2 *priv, u32 offset);
 void mvpp2_dbgfs_init(struct mvpp2 *priv, const char *name);
 
 void mvpp2_dbgfs_cleanup(struct mvpp2 *priv);
+void mvpp2_dbgfs_exit(void);
 
 void mvpp23_rx_fifo_fc_en(struct mvpp2 *priv, int port, bool en);
 
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c
index 4a3baa7e0142..75e83ea2a926 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_debugfs.c
@@ -691,6 +691,13 @@ static int mvpp2_dbgfs_port_init(struct dentry *parent,
 	return 0;
 }
 
+static struct dentry *mvpp2_root;
+
+void mvpp2_dbgfs_exit(void)
+{
+	debugfs_remove(mvpp2_root);
+}
+
 void mvpp2_dbgfs_cleanup(struct mvpp2 *priv)
 {
 	debugfs_remove_recursive(priv->dbgfs_dir);
@@ -700,10 +707,9 @@ void mvpp2_dbgfs_cleanup(struct mvpp2 *priv)
 
 void mvpp2_dbgfs_init(struct mvpp2 *priv, const char *name)
 {
-	struct dentry *mvpp2_dir, *mvpp2_root;
+	struct dentry *mvpp2_dir;
 	int ret, i;
 
-	mvpp2_root = debugfs_lookup(MVPP2_DRIVER_NAME, NULL);
 	if (!mvpp2_root)
 		mvpp2_root = debugfs_create_dir(MVPP2_DRIVER_NAME, NULL);
 
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index b84128b549b4..eaa51cd7456b 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -7706,7 +7706,18 @@ static struct platform_driver mvpp2_driver = {
 	},
 };
 
-module_platform_driver(mvpp2_driver);
+static int __init mvpp2_driver_init(void)
+{
+	return platform_driver_register(&mvpp2_driver);
+}
+module_init(mvpp2_driver_init);
+
+static void __exit mvpp2_driver_exit(void)
+{
+	platform_driver_unregister(&mvpp2_driver);
+	mvpp2_dbgfs_exit();
+}
+module_exit(mvpp2_driver_exit);
 
 MODULE_DESCRIPTION("Marvell PPv2 Ethernet Driver - www.marvell.com");
 MODULE_AUTHOR("Marcin Wojtas <mw@semihalf.com>");
-- 
cgit v1.2.3


From 082a9edf12fef88400172e7d1b131d65a3ed492e Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:18 -0700
Subject: net/mlx5e: xsk: Flush RQ on XSK activation to save memory

The regular RQ remains open after opening an XSK socket, in order to
guarantee that closing the XSK socket never fails due to an error when
reopening the regular RQ.

To save memory, the regular RQ can be deactivated and flushed, releasing
all pages, when an XSK socket is open.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h          |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c |  9 +++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c     | 14 +++++++++-----
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 6bc6472b98f2..9e6347a67fd2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1036,6 +1036,7 @@ struct mlx5e_rq_param;
 int mlx5e_open_rq(struct mlx5e_params *params, struct mlx5e_rq_param *param,
 		  struct mlx5e_xsk_param *xsk, int node,
 		  struct mlx5e_rq *rq);
+#define MLX5E_RQ_WQES_TIMEOUT 20000 /* msecs */
 int mlx5e_wait_for_min_rx_wqes(struct mlx5e_rq *rq, int wait_time);
 void mlx5e_close_rq(struct mlx5e_rq *rq);
 int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
index 9804ef15a4d6..8b09e2f58a4d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
@@ -126,6 +126,9 @@ static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
 
 	mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, true);
 
+	mlx5e_deactivate_rq(&c->rq);
+	mlx5e_flush_rq(&c->rq, MLX5_RQC_STATE_RDY);
+
 	return 0;
 
 err_remove_pool:
@@ -165,7 +168,13 @@ static int mlx5e_xsk_disable_locked(struct mlx5e_priv *priv, u16 ix)
 		goto remove_pool;
 
 	c = priv->channels.c[ix];
+
+	mlx5e_activate_rq(&c->rq);
+	mlx5e_trigger_napi_icosq(c);
+	mlx5e_wait_for_min_rx_wqes(&c->rq, MLX5E_RQ_WQES_TIMEOUT);
+
 	mlx5e_rx_res_xsk_update(priv->rx_res, &priv->channels, ix, false);
+
 	mlx5e_deactivate_xsk(c);
 	mlx5e_close_xsk(c);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 21fe43406d88..10428ade96c1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2408,10 +2408,11 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c)
 		mlx5e_activate_txqsq(&c->sq[tc]);
 	mlx5e_activate_icosq(&c->icosq);
 	mlx5e_activate_icosq(&c->async_icosq);
-	mlx5e_activate_rq(&c->rq);
 
 	if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
 		mlx5e_activate_xsk(c);
+	else
+		mlx5e_activate_rq(&c->rq);
 
 	mlx5e_trigger_napi_icosq(c);
 }
@@ -2422,8 +2423,9 @@ static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
 
 	if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
 		mlx5e_deactivate_xsk(c);
+	else
+		mlx5e_deactivate_rq(&c->rq);
 
-	mlx5e_deactivate_rq(&c->rq);
 	mlx5e_deactivate_icosq(&c->async_icosq);
 	mlx5e_deactivate_icosq(&c->icosq);
 	for (tc = 0; tc < c->num_tc; tc++)
@@ -2515,8 +2517,6 @@ static void mlx5e_activate_channels(struct mlx5e_channels *chs)
 		mlx5e_ptp_activate_channel(chs->ptp);
 }
 
-#define MLX5E_RQ_WQES_TIMEOUT 20000 /* msecs */
-
 static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs)
 {
 	int err = 0;
@@ -2524,8 +2524,12 @@ static int mlx5e_wait_channels_min_rx_wqes(struct mlx5e_channels *chs)
 
 	for (i = 0; i < chs->num; i++) {
 		int timeout = err ? 0 : MLX5E_RQ_WQES_TIMEOUT;
+		struct mlx5e_channel *c = chs->c[i];
+
+		if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+			continue;
 
-		err |= mlx5e_wait_for_min_rx_wqes(&chs->c[i]->rq, timeout);
+		err |= mlx5e_wait_for_min_rx_wqes(&c->rq, timeout);
 
 		/* Don't wait on the XSK RQ, because the newer xdpsock sample
 		 * doesn't provide any Fill Ring entries at the setup stage.
-- 
cgit v1.2.3


From a2740f529da2dab929e10bf565073f6659c863fc Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:19 -0700
Subject: net/mlx5e: xsk: Set napi_id to support busy polling

xdp_rxq_info_reg should get the actual napi_id, not 0, in order to
support socket busy polling properly.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 10428ade96c1..3ee8295c2115 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -539,7 +539,7 @@ static int mlx5e_init_rxq_rq(struct mlx5e_channel *c, struct mlx5e_params *param
 	if (err)
 		return err;
 
-	return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, 0);
+	return xdp_rxq_info_reg(&rq->xdp_rxq, rq->netdev, rq->ix, c->napi.napi_id);
 }
 
 static int mlx5_rq_shampo_alloc(struct mlx5_core_dev *mdev,
-- 
cgit v1.2.3


From 1ca6492ec964325396d5822a26ff53876e466f71 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:20 -0700
Subject: net/mlx5e: xsk: Include XSK skb_from_cqe callbacks in INDIRECT_CALL

XSK is a performance-critical data path. To avoid an indirect function
call with a retpoline, include XSK callbacks in the INDIRECT_CALL macro,
so that they are called directly in XSK flows.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 36eda4c958a0..5835d86be8d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -1709,9 +1709,10 @@ static void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 		goto free_wqe;
 	}
 
-	skb = INDIRECT_CALL_2(rq->wqe.skb_from_cqe,
+	skb = INDIRECT_CALL_3(rq->wqe.skb_from_cqe,
 			      mlx5e_skb_from_cqe_linear,
 			      mlx5e_skb_from_cqe_nonlinear,
+			      mlx5e_xsk_skb_from_cqe_linear,
 			      rq, wi, cqe_bcnt);
 	if (!skb) {
 		/* probably for XDP */
@@ -2180,9 +2181,10 @@ static void mlx5e_handle_rx_cqe_mpwrq(struct mlx5e_rq *rq, struct mlx5_cqe64 *cq
 
 	cqe_bcnt = mpwrq_get_cqe_byte_cnt(cqe);
 
-	skb = INDIRECT_CALL_2(rq->mpwqe.skb_from_cqe_mpwrq,
+	skb = INDIRECT_CALL_3(rq->mpwqe.skb_from_cqe_mpwrq,
 			      mlx5e_skb_from_cqe_mpwrq_linear,
 			      mlx5e_skb_from_cqe_mpwrq_nonlinear,
+			      mlx5e_xsk_skb_from_cqe_mpwrq_linear,
 			      rq, wi, cqe_bcnt, head_offset, page_idx);
 	if (!skb)
 		goto mpwrq_cqe_out;
-- 
cgit v1.2.3


From cfb4d09c30c9d5b2b4d09766ebff2ec7a0f669da Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:21 -0700
Subject: net/mlx5e: xsk: Improve need_wakeup logic

XSK need_wakeup mechanism allows the driver to stop busy waiting for
buffers when the fill ring is empty, yield to the application and signal
it that the driver needs to be waken up after the application refills
the fill ring.

Add protection against the race condition on the RX (refill) side: if
the application refills buffers after xskrq->post_wqes is called, but
before mlx5e_xsk_update_rx_wakeup, NAPI will exit, skipping taking these
buffers to the hardware WQ, and the application won't wake it up again.

Optimize the whole need_wakeup logic, removing unneeded flows, to
compensate for this new check.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.h    | 14 ---------
 .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.h    | 12 --------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c  | 33 ++++++++++++++--------
 4 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index 84a496a8d72f..087c943bd8e9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -5,7 +5,6 @@
 #define __MLX5_EN_XSK_RX_H__
 
 #include "en.h"
-#include <net/xdp_sock_drv.h>
 
 /* RX data path */
 
@@ -21,17 +20,4 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 					      struct mlx5e_wqe_frag_info *wi,
 					      u32 cqe_bcnt);
 
-static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
-{
-	if (!xsk_uses_need_wakeup(rq->xsk_pool))
-		return alloc_err;
-
-	if (unlikely(alloc_err))
-		xsk_set_rx_need_wakeup(rq->xsk_pool);
-	else
-		xsk_clear_rx_need_wakeup(rq->xsk_pool);
-
-	return false;
-}
-
 #endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
index a05085035f23..9c505158b975 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -5,7 +5,6 @@
 #define __MLX5_EN_XSK_TX_H__
 
 #include "en.h"
-#include <net/xdp_sock_drv.h>
 
 /* TX data path */
 
@@ -13,15 +12,4 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
 
 bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
 
-static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq)
-{
-	if (!xsk_uses_need_wakeup(sq->xsk_pool))
-		return;
-
-	if (sq->pc != sq->cc)
-		xsk_clear_tx_need_wakeup(sq->xsk_pool);
-	else
-		xsk_set_tx_need_wakeup(sq->xsk_pool);
-}
-
 #endif /* __MLX5_EN_XSK_TX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 5835d86be8d8..b61604d87701 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -41,6 +41,7 @@
 #include <net/gro.h>
 #include <net/udp.h>
 #include <net/tcp.h>
+#include <net/xdp_sock_drv.h>
 #include "en.h"
 #include "en/txrx.h"
 #include "en_tc.h"
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 833be29170a1..9a458a5d9853 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -31,6 +31,7 @@
  */
 
 #include <linux/irq.h>
+#include <net/xdp_sock_drv.h>
 #include "en.h"
 #include "en/txrx.h"
 #include "en/xdp.h"
@@ -86,26 +87,36 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
 
 static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
 {
+	bool need_wakeup = xsk_uses_need_wakeup(xskrq->xsk_pool);
 	bool busy_xsk = false, xsk_rx_alloc_err;
 
-	/* Handle the race between the application querying need_wakeup and the
-	 * driver setting it:
-	 * 1. Update need_wakeup both before and after the TX. If it goes to
-	 * "yes", it can only happen with the first update.
-	 * 2. If the application queried need_wakeup before we set it, the
-	 * packets will be transmitted anyway, even w/o a wakeup.
-	 * 3. Give a chance to clear need_wakeup after new packets were queued
-	 * for TX.
+	/* If SQ is empty, there are no TX completions to trigger NAPI, so set
+	 * need_wakeup. Do it before queuing packets for TX to avoid race
+	 * condition with userspace.
 	 */
-	mlx5e_xsk_update_tx_wakeup(xsksq);
+	if (need_wakeup && xsksq->pc == xsksq->cc)
+		xsk_set_tx_need_wakeup(xsksq->xsk_pool);
 	busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
-	mlx5e_xsk_update_tx_wakeup(xsksq);
+	/* If we queued some packets for TX, no need for wakeup anymore. */
+	if (need_wakeup && xsksq->pc != xsksq->cc)
+		xsk_clear_tx_need_wakeup(xsksq->xsk_pool);
 
+	/* If WQ is empty, RX won't trigger NAPI, so set need_wakeup. Do it
+	 * before refilling to avoid race condition with userspace.
+	 */
+	if (need_wakeup && !mlx5e_rqwq_get_cur_sz(xskrq))
+		xsk_set_rx_need_wakeup(xskrq->xsk_pool);
 	xsk_rx_alloc_err = INDIRECT_CALL_2(xskrq->post_wqes,
 					   mlx5e_post_rx_mpwqes,
 					   mlx5e_post_rx_wqes,
 					   xskrq);
-	busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err);
+	/* Ask for wakeup if WQ is not full after refill. */
+	if (!need_wakeup)
+		busy_xsk |= xsk_rx_alloc_err;
+	else if (xsk_rx_alloc_err)
+		xsk_set_rx_need_wakeup(xskrq->xsk_pool);
+	else
+		xsk_clear_rx_need_wakeup(xskrq->xsk_pool);
 
 	return busy_xsk;
 }
-- 
cgit v1.2.3


From 168723c1f8d6e1e823d4c6ad3cf64478cf58330a Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:22 -0700
Subject: net/mlx5e: xsk: Use umr_mode to calculate striding RQ parameters

Instead of passing the unaligned flag, pass an enum that indicates the
UMR mode. The next commit will add the third mode (KLM for certain
configurations of XSK), which will be added to this enum instead of
adding another bool flag everywhere.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |   9 +-
 .../net/ethernet/mellanox/mlx5/core/en/params.c    | 126 ++++++++++++++-------
 .../net/ethernet/mellanox/mlx5/core/en/params.h    |  24 ++--
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    |   4 +-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |   4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  72 ++++++++----
 include/linux/mlx5/driver.h                        |   4 +
 7 files changed, 171 insertions(+), 72 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 9e6347a67fd2..a2d09f30acd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -681,6 +681,11 @@ struct mlx5e_hw_gro_data {
 	int second_ip_id;
 };
 
+enum mlx5e_mpwrq_umr_mode {
+	MLX5E_MPWRQ_UMR_MODE_ALIGNED,
+	MLX5E_MPWRQ_UMR_MODE_UNALIGNED,
+};
+
 struct mlx5e_rq {
 	/* data path */
 	union {
@@ -708,7 +713,7 @@ struct mlx5e_rq {
 			u8                     pages_per_wqe;
 			u8                     umr_wqebbs;
 			u8                     mtts_per_wqe;
-			u8                     unaligned;
+			u8                     umr_mode;
 			struct mlx5e_shampo_hd *shampo;
 		} mpwqe;
 	};
@@ -1008,7 +1013,7 @@ struct mlx5e_profile {
 void mlx5e_build_ptys2ethtool_map(void);
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
-					    bool unaligned);
+					    enum mlx5e_mpwrq_umr_mode umr_mode);
 
 void mlx5e_shampo_dealloc_hd(struct mlx5e_rq *rq, u16 len, u16 start, bool close);
 void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index ac4d70bb21e8..b57855bf7629 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -27,9 +27,48 @@ u8 mlx5e_mpwrq_page_shift(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xs
 	return max(req_page_shift, min_page_shift);
 }
 
-u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned)
+enum mlx5e_mpwrq_umr_mode
+mlx5e_mpwrq_umr_mode(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk)
+{
+	/* Different memory management schemes use different mechanisms to map
+	 * user-mode memory. The stricter guarantees we have, the faster
+	 * mechanisms we use:
+	 * 1. MTT - direct mapping in page granularity.
+	 * 2. KSM - indirect mapping to another MKey to arbitrary addresses, but
+	 *    all mappings have the same size.
+	 */
+	bool unaligned = xsk ? xsk->unaligned : false;
+
+	/* XSK frames can start at arbitrary unaligned locations, but they all
+	 * have the same size which is a power of two. It allows to optimize to
+	 * one KSM per frame.
+	 */
+	if (unaligned)
+		return MLX5E_MPWRQ_UMR_MODE_UNALIGNED;
+
+	/* XSK: frames are naturally aligned, MTT can be used.
+	 * Non-XSK: Allocations happen in units of CPU pages, therefore, the
+	 * mappings are naturally aligned.
+	 */
+	return MLX5E_MPWRQ_UMR_MODE_ALIGNED;
+}
+
+u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e_mpwrq_umr_mode mode)
 {
-	u8 umr_entry_size = unaligned ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
+	switch (mode) {
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
+		return sizeof(struct mlx5_mtt);
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
+		return sizeof(struct mlx5_ksm);
+	}
+	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", mode);
+	return 0;
+}
+
+u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift,
+			  enum mlx5e_mpwrq_umr_mode umr_mode)
+{
+	u8 umr_entry_size = mlx5e_mpwrq_umr_entry_size(umr_mode);
 	u8 max_pages_per_wqe, max_log_mpwqe_size;
 	u16 max_wqe_size;
 
@@ -44,9 +83,10 @@ u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, bool unalig
 	return min_t(u8, max_log_mpwqe_size, MLX5_MPWRQ_MAX_LOG_WQE_SZ);
 }
 
-u8 mlx5e_mpwrq_pages_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned)
+u8 mlx5e_mpwrq_pages_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
+			     enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	u8 log_wqe_sz = mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, unaligned);
+	u8 log_wqe_sz = mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, umr_mode);
 	u8 pages_per_wqe;
 
 	pages_per_wqe = log_wqe_sz > page_shift ? (1 << (log_wqe_sz - page_shift)) : 1;
@@ -59,10 +99,11 @@ u8 mlx5e_mpwrq_pages_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift, bool una
 	return pages_per_wqe;
 }
 
-u16 mlx5e_mpwrq_umr_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned)
+u16 mlx5e_mpwrq_umr_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift,
+			   enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	u8 umr_entry_size = unaligned ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt);
-	u8 pages_per_wqe = mlx5e_mpwrq_pages_per_wqe(mdev, page_shift, unaligned);
+	u8 pages_per_wqe = mlx5e_mpwrq_pages_per_wqe(mdev, page_shift, umr_mode);
+	u8 umr_entry_size = mlx5e_mpwrq_umr_entry_size(umr_mode);
 	u16 umr_wqe_sz;
 
 	umr_wqe_sz = sizeof(struct mlx5e_umr_wqe) +
@@ -73,25 +114,30 @@ u16 mlx5e_mpwrq_umr_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, bool unali
 	return umr_wqe_sz;
 }
 
-u8 mlx5e_mpwrq_umr_wqebbs(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned)
+u8 mlx5e_mpwrq_umr_wqebbs(struct mlx5_core_dev *mdev, u8 page_shift,
+			  enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	return DIV_ROUND_UP(mlx5e_mpwrq_umr_wqe_sz(mdev, page_shift, unaligned),
+	return DIV_ROUND_UP(mlx5e_mpwrq_umr_wqe_sz(mdev, page_shift, umr_mode),
 			    MLX5_SEND_WQE_BB);
 }
 
-u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned)
+u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
+			    enum mlx5e_mpwrq_umr_mode umr_mode)
 {
+	u8 pages_per_wqe = mlx5e_mpwrq_pages_per_wqe(mdev, page_shift, umr_mode);
+
 	/* Add another page as a buffer between WQEs. This page will absorb
 	 * write overflow by the hardware, when receiving packets larger than
 	 * MTU. These oversize packets are dropped by the driver at a later
 	 * stage.
 	 */
-	return MLX5_ALIGN_MTTS(mlx5e_mpwrq_pages_per_wqe(mdev, page_shift, unaligned) + 1);
+	return MLX5_ALIGN_MTTS(pages_per_wqe + 1);
 }
 
-u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev, bool unaligned)
+u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
+				enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	if (unaligned)
+	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)
 		return min(MLX5E_MAX_RQ_NUM_KSMS,
 			   1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
 
@@ -99,18 +145,19 @@ u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev, bool unaligned)
 }
 
 static u8 mlx5e_mpwrq_max_log_rq_size(struct mlx5_core_dev *mdev, u8 page_shift,
-				      bool unaligned)
+				      enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	u8 mtts_per_wqe = mlx5e_mpwrq_mtts_per_wqe(mdev, page_shift, unaligned);
-	u32 max_entries = mlx5e_mpwrq_max_num_entries(mdev, unaligned);
+	u8 mtts_per_wqe = mlx5e_mpwrq_mtts_per_wqe(mdev, page_shift, umr_mode);
+	u32 max_entries = mlx5e_mpwrq_max_num_entries(mdev, umr_mode);
 
 	return ilog2(max_entries / mtts_per_wqe);
 }
 
-u8 mlx5e_mpwrq_max_log_rq_pkts(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned)
+u8 mlx5e_mpwrq_max_log_rq_pkts(struct mlx5_core_dev *mdev, u8 page_shift,
+			       enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	return mlx5e_mpwrq_max_log_rq_size(mdev, page_shift, unaligned) +
-		mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, unaligned) -
+	return mlx5e_mpwrq_max_log_rq_size(mdev, page_shift, umr_mode) +
+		mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, umr_mode) -
 		MLX5E_ORDER2_MAX_PACKET_MTU;
 }
 
@@ -171,10 +218,10 @@ static u8 mlx5e_mpwqe_log_pkts_per_wqe(struct mlx5_core_dev *mdev,
 				       struct mlx5e_xsk_param *xsk)
 {
 	u32 linear_stride_sz = mlx5e_rx_get_linear_stride_sz(mdev, params, xsk, true);
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-	bool unaligned = xsk ? xsk->unaligned : false;
 
-	return mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, unaligned) -
+	return mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, umr_mode) -
 		order_base_2(linear_stride_sz);
 }
 
@@ -200,10 +247,11 @@ bool mlx5e_rx_is_linear_skb(struct mlx5_core_dev *mdev,
 
 static bool mlx5e_verify_rx_mpwqe_strides(struct mlx5_core_dev *mdev,
 					  u8 log_stride_sz, u8 log_num_strides,
-					  u8 page_shift, bool unaligned)
+					  u8 page_shift,
+					  enum mlx5e_mpwrq_umr_mode umr_mode)
 {
 	if (log_stride_sz + log_num_strides !=
-	    mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, unaligned))
+	    mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, umr_mode))
 		return false;
 
 	if (log_stride_sz < MLX5_MPWQE_LOG_STRIDE_SZ_BASE ||
@@ -223,8 +271,8 @@ bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
 				  struct mlx5e_params *params,
 				  struct mlx5e_xsk_param *xsk)
 {
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-	bool unaligned = xsk ? xsk->unaligned : false;
 	u8 log_num_strides;
 	u8 log_stride_sz;
 	u8 log_wqe_sz;
@@ -233,7 +281,7 @@ bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
 		return false;
 
 	log_stride_sz = order_base_2(mlx5e_rx_get_linear_stride_sz(mdev, params, xsk, true));
-	log_wqe_sz = mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, unaligned);
+	log_wqe_sz = mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, umr_mode);
 
 	if (log_wqe_sz < log_stride_sz)
 		return false;
@@ -242,19 +290,19 @@ bool mlx5e_rx_mpwqe_is_linear_skb(struct mlx5_core_dev *mdev,
 
 	return mlx5e_verify_rx_mpwqe_strides(mdev, log_stride_sz,
 					     log_num_strides, page_shift,
-					     unaligned);
+					     umr_mode);
 }
 
 u8 mlx5e_mpwqe_get_log_rq_size(struct mlx5_core_dev *mdev,
 			       struct mlx5e_params *params,
 			       struct mlx5e_xsk_param *xsk)
 {
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 	u8 log_pkts_per_wqe, page_shift, max_log_rq_size;
-	bool unaligned = xsk ? xsk->unaligned : false;
 
 	log_pkts_per_wqe = mlx5e_mpwqe_log_pkts_per_wqe(mdev, params, xsk);
 	page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-	max_log_rq_size = mlx5e_mpwrq_max_log_rq_size(mdev, page_shift, unaligned);
+	max_log_rq_size = mlx5e_mpwrq_max_log_rq_size(mdev, page_shift, umr_mode);
 
 	/* Numbers are unsigned, don't subtract to avoid underflow. */
 	if (params->log_rq_mtu_frames <
@@ -308,10 +356,10 @@ u8 mlx5e_mpwqe_get_log_num_strides(struct mlx5_core_dev *mdev,
 				   struct mlx5e_params *params,
 				   struct mlx5e_xsk_param *xsk)
 {
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-	bool unaligned = xsk ? xsk->unaligned : false;
 
-	return mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, unaligned) -
+	return mlx5e_mpwrq_log_wqe_sz(mdev, page_shift, umr_mode) -
 		mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
 }
 
@@ -460,9 +508,10 @@ bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 
 int mlx5e_mpwrq_validate_regular(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
 {
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, NULL);
 	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, NULL);
 
-	if (!mlx5e_check_fragmented_striding_rq_cap(mdev, page_shift, false))
+	if (!mlx5e_check_fragmented_striding_rq_cap(mdev, page_shift, umr_mode))
 		return -EOPNOTSUPP;
 
 	if (params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params, NULL))
@@ -474,11 +523,12 @@ int mlx5e_mpwrq_validate_regular(struct mlx5_core_dev *mdev, struct mlx5e_params
 int mlx5e_mpwrq_validate_xsk(struct mlx5_core_dev *mdev, struct mlx5e_params *params,
 			     struct mlx5e_xsk_param *xsk)
 {
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
 	bool unaligned = xsk ? xsk->unaligned : false;
 	u16 max_mtu_pkts;
 
-	if (!mlx5e_check_fragmented_striding_rq_cap(mdev, page_shift, xsk->unaligned))
+	if (!mlx5e_check_fragmented_striding_rq_cap(mdev, page_shift, umr_mode))
 		return -EOPNOTSUPP;
 
 	if (!mlx5e_rx_mpwqe_is_linear_skb(mdev, params, xsk))
@@ -781,16 +831,16 @@ int mlx5e_build_rq_param(struct mlx5_core_dev *mdev,
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: {
 		u8 log_wqe_num_of_strides = mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
 		u8 log_wqe_stride_size = mlx5e_mpwqe_get_log_stride_size(mdev, params, xsk);
+		enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 		u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-		bool unaligned = xsk ? xsk->unaligned : false;
 
 		if (!mlx5e_verify_rx_mpwqe_strides(mdev, log_wqe_stride_size,
 						   log_wqe_num_of_strides,
-						   page_shift, unaligned)) {
+						   page_shift, umr_mode)) {
 			mlx5_core_err(mdev,
-				      "Bad RX MPWQE params: log_stride_size %u, log_num_strides %u, unaligned %d\n",
+				      "Bad RX MPWQE params: log_stride_size %u, log_num_strides %u, umr_mode %d\n",
 				      log_wqe_stride_size, log_wqe_num_of_strides,
-				      unaligned);
+				      umr_mode);
 			return -EINVAL;
 		}
 
@@ -974,11 +1024,11 @@ static u32 mlx5e_mpwrq_total_umr_wqebbs(struct mlx5_core_dev *mdev,
 					struct mlx5e_params *params,
 					struct mlx5e_xsk_param *xsk)
 {
+	enum mlx5e_mpwrq_umr_mode umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-	bool unaligned = xsk ? xsk->unaligned : false;
 	u8 umr_wqebbs;
 
-	umr_wqebbs = mlx5e_mpwrq_umr_wqebbs(mdev, page_shift, unaligned);
+	umr_wqebbs = mlx5e_mpwrq_umr_wqebbs(mdev, page_shift, umr_mode);
 
 	return umr_wqebbs * (1 << mlx5e_mpwqe_get_log_rq_size(mdev, params, xsk));
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index a3952afdcbe4..034debd140bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -56,13 +56,23 @@ struct mlx5e_create_sq_param {
 /* Striding RQ dynamic parameters */
 
 u8 mlx5e_mpwrq_page_shift(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk);
-u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned);
-u8 mlx5e_mpwrq_pages_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned);
-u16 mlx5e_mpwrq_umr_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned);
-u8 mlx5e_mpwrq_umr_wqebbs(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned);
-u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned);
-u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev, bool unaligned);
-u8 mlx5e_mpwrq_max_log_rq_pkts(struct mlx5_core_dev *mdev, u8 page_shift, bool unaligned);
+enum mlx5e_mpwrq_umr_mode
+mlx5e_mpwrq_umr_mode(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk);
+u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e_mpwrq_umr_mode mode);
+u8 mlx5e_mpwrq_log_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift,
+			  enum mlx5e_mpwrq_umr_mode umr_mode);
+u8 mlx5e_mpwrq_pages_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
+			     enum mlx5e_mpwrq_umr_mode umr_mode);
+u16 mlx5e_mpwrq_umr_wqe_sz(struct mlx5_core_dev *mdev, u8 page_shift,
+			   enum mlx5e_mpwrq_umr_mode umr_mode);
+u8 mlx5e_mpwrq_umr_wqebbs(struct mlx5_core_dev *mdev, u8 page_shift,
+			  enum mlx5e_mpwrq_umr_mode umr_mode);
+u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
+			    enum mlx5e_mpwrq_umr_mode umr_mode);
+u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
+				enum mlx5e_mpwrq_umr_mode umr_mode);
+u8 mlx5e_mpwrq_max_log_rq_pkts(struct mlx5_core_dev *mdev, u8 page_shift,
+			       enum mlx5e_mpwrq_umr_mode umr_mode);
 
 /* Parameter calculations */
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index aebc1d5a9004..e12a856331b8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -41,7 +41,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
 
-	if (unlikely(rq->mpwqe.unaligned)) {
+	if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
@@ -67,7 +67,7 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		cpu_to_be32((icosq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR);
 
 	offset = ix * rq->mpwqe.mtts_per_wqe;
-	if (likely(!rq->mpwqe.unaligned))
+	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
 		offset = MLX5_ALIGNED_MTTS_OCTW(offset);
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 26f1ac4683e7..24aa25da482b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -314,7 +314,9 @@ void mlx5e_ethtool_get_ringparam(struct mlx5e_priv *priv,
 	/* Limitation for regular RQ. XSK RQ may clamp the queue length in
 	 * mlx5e_mpwqe_get_log_rq_size.
 	 */
-	u8 max_log_mpwrq_pkts = mlx5e_mpwrq_max_log_rq_pkts(priv->mdev, PAGE_SHIFT, false);
+	u8 max_log_mpwrq_pkts = mlx5e_mpwrq_max_log_rq_pkts(priv->mdev,
+							    PAGE_SHIFT,
+							    MLX5E_MPWRQ_UMR_MODE_ALIGNED);
 
 	param->rx_max_pending = 1 << min_t(u8, MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE,
 					   max_log_mpwrq_pkts);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 3ee8295c2115..b5a416ff1603 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -69,7 +69,7 @@
 #include "en/trap.h"
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_shift,
-					    bool unaligned)
+					    enum mlx5e_mpwrq_umr_mode umr_mode)
 {
 	u16 umr_wqebbs, max_wqebbs;
 	bool striding_rq_umr;
@@ -79,7 +79,7 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev, u8 page_
 	if (!striding_rq_umr)
 		return false;
 
-	umr_wqebbs = mlx5e_mpwrq_umr_wqebbs(mdev, page_shift, unaligned);
+	umr_wqebbs = mlx5e_mpwrq_umr_wqebbs(mdev, page_shift, umr_mode);
 	max_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev);
 	/* Sanity check; should never happen, because mlx5e_mpwrq_umr_wqebbs is
 	 * calculated from mlx5e_get_max_sq_aligned_wqebbs.
@@ -203,6 +203,18 @@ static void mlx5e_disable_blocking_events(struct mlx5e_priv *priv)
 	mlx5_blocking_notifier_unregister(priv->mdev, &priv->blocking_events_nb);
 }
 
+static u16 mlx5e_mpwrq_umr_octowords(u32 entries, enum mlx5e_mpwrq_umr_mode umr_mode)
+{
+	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
+		return MLX5_MTT_OCTW(entries);
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
+		return MLX5_KSM_OCTW(entries);
+	}
+	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
+	return 0;
+}
+
 static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
 				       struct mlx5e_icosq *sq,
 				       struct mlx5e_umr_wqe *wqe)
@@ -213,7 +225,7 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
 	u8 ds_cnt;
 
 	ds_cnt = DIV_ROUND_UP(mlx5e_mpwrq_umr_wqe_sz(rq->mdev, rq->mpwqe.page_shift,
-						     rq->mpwqe.unaligned),
+						     rq->mpwqe.umr_mode),
 			      MLX5_SEND_WQE_DS);
 
 	cseg->qpn_ds    = cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
@@ -221,8 +233,7 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
 	cseg->umr_mkey  = rq->mpwqe.umr_mkey_be;
 
 	ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE;
-	octowords = rq->mpwqe.unaligned ? MLX5_KSM_OCTW(rq->mpwqe.pages_per_wqe) :
-					  MLX5_MTT_OCTW(rq->mpwqe.pages_per_wqe);
+	octowords = mlx5e_mpwrq_umr_octowords(rq->mpwqe.pages_per_wqe, rq->mpwqe.umr_mode);
 	ucseg->xlt_octowords = cpu_to_be16(octowords);
 	ucseg->mkey_mask     = cpu_to_be64(MLX5_MKEY_MASK_FREE);
 }
@@ -283,9 +294,23 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int node)
 	return 0;
 }
 
+
+static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
+{
+	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
+		return MLX5_MKC_ACCESS_MODE_MTT;
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
+		return MLX5_MKC_ACCESS_MODE_KSM;
+	}
+	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
+	return 0;
+}
+
 static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 				 u32 npages, u8 page_shift, u32 *umr_mkey,
-				 dma_addr_t filler_addr, bool unaligned)
+				 dma_addr_t filler_addr,
+				 enum mlx5e_mpwrq_umr_mode umr_mode)
 {
 	struct mlx5_mtt *mtt;
 	struct mlx5_ksm *ksm;
@@ -296,14 +321,16 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	int err;
 	int i;
 
-	if (unaligned && !MLX5_CAP_GEN(mdev, fixed_buffer_size)) {
+	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED &&
+	    !MLX5_CAP_GEN(mdev, fixed_buffer_size)) {
 		mlx5_core_warn(mdev, "Unaligned AF_XDP requires fixed_buffer_size capability\n");
 		return -EINVAL;
 	}
 
+	octwords = mlx5e_mpwrq_umr_octowords(npages, umr_mode);
+
 	inlen = MLX5_FLEXIBLE_INLEN(mdev, MLX5_ST_SZ_BYTES(create_mkey_in),
-				    unaligned ? sizeof(*ksm) : sizeof(*mtt),
-				    npages);
+				    MLX5_OCTWORD, octwords);
 	if (inlen < 0)
 		return inlen;
 
@@ -311,16 +338,13 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	if (!in)
 		return -ENOMEM;
 
-	octwords = unaligned ? MLX5_KSM_OCTW(npages) : MLX5_MTT_OCTW(npages);
-
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
 	MLX5_SET(mkc, mkc, free, 1);
 	MLX5_SET(mkc, mkc, umr_en, 1);
 	MLX5_SET(mkc, mkc, lw, 1);
 	MLX5_SET(mkc, mkc, lr, 1);
-	MLX5_SET(mkc, mkc, access_mode_1_0,
-		 unaligned ? MLX5_MKC_ACCESS_MODE_KSM : MLX5_MKC_ACCESS_MODE_MTT);
+	MLX5_SET(mkc, mkc, access_mode_1_0, mlx5e_mpwrq_access_mode(umr_mode));
 	mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
@@ -335,19 +359,22 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	 * the RQ's pool, while the gaps (wqe_overflow) remain mapped
 	 * to the default page.
 	 */
-	if (unaligned) {
+	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 		for (i = 0; i < npages; i++)
 			ksm[i] = (struct mlx5_ksm) {
 				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
 				.va = cpu_to_be64(filler_addr),
 			};
-	} else {
+		break;
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
 		mtt = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 		for (i = 0; i < npages; i++)
 			mtt[i] = (struct mlx5_mtt) {
 				.ptag = cpu_to_be64(filler_addr),
 			};
+		break;
 	}
 
 	err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);
@@ -396,7 +423,7 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
 	u32 umr_mkey;
 	int err;
 
-	max_num_entries = mlx5e_mpwrq_max_num_entries(mdev, rq->mpwqe.unaligned);
+	max_num_entries = mlx5e_mpwrq_max_num_entries(mdev, rq->mpwqe.umr_mode);
 
 	/* Shouldn't overflow, the result is at most MLX5E_MAX_RQ_NUM_MTTS. */
 	if (WARN_ON_ONCE(check_mul_overflow(wq_size, (u32)rq->mpwqe.mtts_per_wqe,
@@ -408,7 +435,7 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
 
 	err = mlx5e_create_umr_mkey(mdev, num_entries, rq->mpwqe.page_shift,
 				    &umr_mkey, rq->wqe_overflow.addr,
-				    rq->mpwqe.unaligned);
+				    rq->mpwqe.umr_mode);
 	rq->mpwqe.umr_mkey_be = cpu_to_be32(umr_mkey);
 	return err;
 }
@@ -644,16 +671,16 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 
 		rq->mpwqe.page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
-		rq->mpwqe.unaligned = xsk ? xsk->unaligned : false;
+		rq->mpwqe.umr_mode = mlx5e_mpwrq_umr_mode(mdev, xsk);
 		rq->mpwqe.pages_per_wqe =
 			mlx5e_mpwrq_pages_per_wqe(mdev, rq->mpwqe.page_shift,
-						  rq->mpwqe.unaligned);
+						  rq->mpwqe.umr_mode);
 		rq->mpwqe.umr_wqebbs =
 			mlx5e_mpwrq_umr_wqebbs(mdev, rq->mpwqe.page_shift,
-					       rq->mpwqe.unaligned);
+					       rq->mpwqe.umr_mode);
 		rq->mpwqe.mtts_per_wqe =
 			mlx5e_mpwrq_mtts_per_wqe(mdev, rq->mpwqe.page_shift,
-						 rq->mpwqe.unaligned);
+						 rq->mpwqe.umr_mode);
 
 		pool_size = rq->mpwqe.pages_per_wqe <<
 			mlx5e_mpwqe_get_log_rq_size(mdev, params, xsk);
@@ -5012,7 +5039,8 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
 	if (!!MLX5_CAP_ETH(mdev, lro_cap) &&
 	    !MLX5_CAP_ETH(mdev, tunnel_lro_vxlan) &&
 	    !MLX5_CAP_ETH(mdev, tunnel_lro_gre) &&
-	    mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT, false))
+	    mlx5e_check_fragmented_striding_rq_cap(mdev, PAGE_SHIFT,
+						   MLX5E_MPWRQ_UMR_MODE_ALIGNED))
 		netdev->vlan_features    |= NETIF_F_LRO;
 
 	netdev->hw_features       = netdev->vlan_features;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index f8ecb33105d3..285f301a6390 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1288,4 +1288,8 @@ static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev)
 	return mlx5_is_roce_on(dev);
 }
 
+enum {
+	MLX5_OCTWORD = 16,
+};
+
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From 9f123f740428e96ef2eae8b5e2876b675b6a4605 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:23 -0700
Subject: net/mlx5e: Improve MTT/KSM alignment

Make mlx5e_mpwrq_mtts_per_wqe take into account that KSM requires
smaller alignment than MTT.

Ensure that there is always an even amount of MTTs in a UMR WQE, so that
complete octwords are formed, and no garbage is mapped.

Drop extra alignment in MLX5_MTT_OCTW that may cause setting too big
ucseg->xlt_octowords, also leading to mapping garbage.

Generalize some calculations by introducing the MLX5_OCTWORD constant.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h        |  6 +-----
 drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 10 +++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c   | 13 +++++--------
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c     |  2 +-
 5 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a2d09f30acd1..93607db1dea4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -109,12 +109,8 @@ struct page_pool;
 #define MLX5_MPWRQ_MAX_PAGES_PER_WQE \
 	rounddown_pow_of_two(MLX5_UMR_MAX_MTT_SPACE / sizeof(struct mlx5_mtt))
 
-#define MLX5_ALIGN_MTTS(mtts)		(ALIGN(mtts, 8))
-#define MLX5_ALIGNED_MTTS_OCTW(mtts)	((mtts) / 2)
-#define MLX5_MTT_OCTW(mtts)		(MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts)))
-#define MLX5_KSM_OCTW(ksms)             (ksms)
 #define MLX5E_MAX_RQ_NUM_MTTS	\
-	(ALIGN_DOWN(U16_MAX, 4) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
+	(ALIGN_DOWN(U16_MAX, 4) * 2) /* Fits into u16 and aligned by WQEBB. */
 #define MLX5E_MAX_RQ_NUM_KSMS (U16_MAX - 1) /* So that num_ksms fits into u16. */
 #define MLX5E_ORDER2_MAX_PACKET_MTU (order_base_2(10 * 1024))
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index b57855bf7629..e8c3b8abf941 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -91,6 +91,13 @@ u8 mlx5e_mpwrq_pages_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
 
 	pages_per_wqe = log_wqe_sz > page_shift ? (1 << (log_wqe_sz - page_shift)) : 1;
 
+	/* Two MTTs are needed to form an octword. The number of MTTs is encoded
+	 * in octwords in a UMR WQE, so we need at least two to avoid mapping
+	 * garbage addresses.
+	 */
+	if (WARN_ON_ONCE(pages_per_wqe < 2 && umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
+		pages_per_wqe = 2;
+
 	/* Sanity check for further calculations to succeed. */
 	BUILD_BUG_ON(MLX5_MPWRQ_MAX_PAGES_PER_WQE > 64);
 	if (WARN_ON_ONCE(pages_per_wqe > MLX5_MPWRQ_MAX_PAGES_PER_WQE))
@@ -131,7 +138,8 @@ u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
 	 * MTU. These oversize packets are dropped by the driver at a later
 	 * stage.
 	 */
-	return MLX5_ALIGN_MTTS(pages_per_wqe + 1);
+	return ALIGN(pages_per_wqe + 1,
+		     MLX5_SEND_WQE_BB / mlx5e_mpwrq_umr_entry_size(umr_mode));
 }
 
 u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index e12a856331b8..4b2df2895505 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -66,9 +66,10 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	umr_wqe->ctrl.opmod_idx_opcode =
 		cpu_to_be32((icosq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_UMR);
 
+	/* Optimized for speed: keep in sync with mlx5e_mpwrq_umr_entry_size. */
 	offset = ix * rq->mpwqe.mtts_per_wqe;
 	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
-		offset = MLX5_ALIGNED_MTTS_OCTW(offset);
+		offset = offset * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b5a416ff1603..2093b6cc6c7c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -205,14 +205,11 @@ static void mlx5e_disable_blocking_events(struct mlx5e_priv *priv)
 
 static u16 mlx5e_mpwrq_umr_octowords(u32 entries, enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	switch (umr_mode) {
-	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
-		return MLX5_MTT_OCTW(entries);
-	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
-		return MLX5_KSM_OCTW(entries);
-	}
-	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
-	return 0;
+	u8 umr_entry_size = mlx5e_mpwrq_umr_entry_size(umr_mode);
+
+	WARN_ON_ONCE(entries * umr_entry_size % MLX5_OCTWORD);
+
+	return entries * umr_entry_size / MLX5_OCTWORD;
 }
 
 static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index b61604d87701..58084650151f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -682,7 +682,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
 			    MLX5_OPCODE_UMR);
 
-	offset = MLX5_ALIGNED_MTTS_OCTW(ix * rq->mpwqe.mtts_per_wqe);
+	offset = (ix * rq->mpwqe.mtts_per_wqe) * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
-- 
cgit v1.2.3


From 139213451046eb6653a058a8922796f29b267b0f Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:24 -0700
Subject: net/mlx5e: xsk: Use KLM to protect frame overrun in unaligned mode

XSK RQs support striding RQ linear mode, but the stride size may be
bigger than the XSK frame size, because:

1. The stride size must be a power of two.

2. The stride size must be equal to the UMR page size. Each XSK frame is
treated as a separate page, because they aren't necessarily adjacent in
physical memory, so the driver can't put more than one stride per page.

3. The minimal MTT page size is 4096 on older firmware.

That means that if XSK frame size is 2048 or not a power of two, the
strides may be bigger than XSK frames. Normally, it's not a problem if
the hardware enforces the MTU. However, traffic between vports skips the
hardware MTU check, and oversized packets may be received.

If an oversized packet is bigger than the XSK frame but not bigger than
the stride, it will cause overwriting of the adjacent UMEM region. If
the packet takes more than one stride, they can be recycled for reuse,
so it's not a problem when the XSK frame size matches the stride size.

Work around the above issue by leveraging KLM to make a more
fine-grained mapping. The beginning of each stride is mapped to the
frame memory, and the padding up to the closest power of two is mapped
to the overflow page that doesn't belong to UMEM. This way, application
data corruption won't happen upon receiving packets bigger than MTU.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  1 +
 .../net/ethernet/mellanox/mlx5/core/en/params.c    | 45 ++++++++++++++++++++--
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    | 27 +++++++++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 27 +++++++++++--
 4 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 93607db1dea4..7c6861d6148d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -680,6 +680,7 @@ struct mlx5e_hw_gro_data {
 enum mlx5e_mpwrq_umr_mode {
 	MLX5E_MPWRQ_UMR_MODE_ALIGNED,
 	MLX5E_MPWRQ_UMR_MODE_UNALIGNED,
+	MLX5E_MPWRQ_UMR_MODE_OVERSIZED,
 };
 
 struct mlx5e_rq {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index e8c3b8abf941..203448ee9594 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -36,8 +36,28 @@ mlx5e_mpwrq_umr_mode(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk)
 	 * 1. MTT - direct mapping in page granularity.
 	 * 2. KSM - indirect mapping to another MKey to arbitrary addresses, but
 	 *    all mappings have the same size.
+	 * 3. KLM - indirect mapping to another MKey to arbitrary addresses, and
+	 *    mappings can have different sizes.
 	 */
+	u8 page_shift = mlx5e_mpwrq_page_shift(mdev, xsk);
 	bool unaligned = xsk ? xsk->unaligned : false;
+	bool oversized = false;
+
+	if (xsk) {
+		oversized = xsk->chunk_size < (1 << page_shift);
+		WARN_ON_ONCE(xsk->chunk_size > (1 << page_shift));
+	}
+
+	/* XSK frame size doesn't match the UMR page size, either because the
+	 * frame size is not a power of two, or it's smaller than the minimal
+	 * page size supported by the firmware.
+	 * It's possible to receive packets bigger than MTU in certain setups.
+	 * To avoid writing over the XSK frame boundary, the top region of each
+	 * stride is mapped to a garbage page, resulting in two mappings of
+	 * different sizes per frame.
+	 */
+	if (oversized)
+		return MLX5E_MPWRQ_UMR_MODE_OVERSIZED;
 
 	/* XSK frames can start at arbitrary unaligned locations, but they all
 	 * have the same size which is a power of two. It allows to optimize to
@@ -60,6 +80,8 @@ u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e_mpwrq_umr_mode mode)
 		return sizeof(struct mlx5_mtt);
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		return sizeof(struct mlx5_ksm);
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		return sizeof(struct mlx5_klm) * 2;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", mode);
 	return 0;
@@ -145,11 +167,21 @@ u8 mlx5e_mpwrq_mtts_per_wqe(struct mlx5_core_dev *mdev, u8 page_shift,
 u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
 				enum mlx5e_mpwrq_umr_mode umr_mode)
 {
-	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)
-		return min(MLX5E_MAX_RQ_NUM_KSMS,
-			   1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
+	/* Same limits apply to KSMs and KLMs. */
+	u32 klm_limit = min(MLX5E_MAX_RQ_NUM_KSMS,
+			    1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size));
 
-	return MLX5E_MAX_RQ_NUM_MTTS;
+	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_ALIGNED:
+		return MLX5E_MAX_RQ_NUM_MTTS;
+	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
+		return klm_limit;
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		/* Each entry is two KLMs. */
+		return klm_limit / 2;
+	}
+	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
+	return 0;
 }
 
 static u8 mlx5e_mpwrq_max_log_rq_size(struct mlx5_core_dev *mdev, u8 page_shift,
@@ -1084,6 +1116,11 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5_core_dev *mdev,
 			xsk.unaligned = true;
 			max_xsk_wqebbs = max(max_xsk_wqebbs,
 				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
+
+			/* XSK unaligned mode, frame size is not equal to stride size. */
+			xsk.chunk_size -= 1;
+			max_xsk_wqebbs = max(max_xsk_wqebbs,
+				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
 		}
 
 		wqebbs += max_xsk_wqebbs;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 4b2df2895505..78d746704345 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -41,7 +41,15 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
 	memcpy(umr_wqe, &rq->mpwqe.umr_wqe, sizeof(struct mlx5e_umr_wqe));
 
-	if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
+	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED)) {
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
+				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			};
+		}
+	} else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED)) {
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
@@ -51,11 +59,22 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 			};
 		}
 	} else {
+		__be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) -
+					      rq->xsk_pool->chunk_size);
+		__be32 frame_size = cpu_to_be32(rq->xsk_pool->chunk_size);
+
 		for (i = 0; i < batch; i++) {
 			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
 
-			umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
-				.ptag = cpu_to_be64(addr | MLX5_EN_WR),
+			umr_wqe->inline_klms[i << 1] = (struct mlx5_klm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr),
+				.bcount = frame_size,
+			};
+			umr_wqe->inline_klms[(i << 1) + 1] = (struct mlx5_klm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(rq->wqe_overflow.addr),
+				.bcount = pad_size,
 			};
 		}
 	}
@@ -70,6 +89,8 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 	offset = ix * rq->mpwqe.mtts_per_wqe;
 	if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_ALIGNED))
 		offset = offset * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
+	else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_OVERSIZED))
+		offset = offset * sizeof(struct mlx5_klm) * 2 / MLX5_OCTWORD;
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2093b6cc6c7c..ae728745379d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -299,6 +299,8 @@ static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
 		return MLX5_MKC_ACCESS_MODE_MTT;
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		return MLX5_MKC_ACCESS_MODE_KSM;
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		return MLX5_MKC_ACCESS_MODE_KLMS;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
 	return 0;
@@ -307,10 +309,12 @@ static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
 static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 				 u32 npages, u8 page_shift, u32 *umr_mkey,
 				 dma_addr_t filler_addr,
-				 enum mlx5e_mpwrq_umr_mode umr_mode)
+				 enum mlx5e_mpwrq_umr_mode umr_mode,
+				 u32 xsk_chunk_size)
 {
 	struct mlx5_mtt *mtt;
 	struct mlx5_ksm *ksm;
+	struct mlx5_klm *klm;
 	u32 octwords;
 	int inlen;
 	void *mkc;
@@ -347,7 +351,8 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
 	MLX5_SET64(mkc, mkc, len, npages << page_shift);
 	MLX5_SET(mkc, mkc, translations_octword_size, octwords);
-	MLX5_SET(mkc, mkc, log_page_size, page_shift);
+	if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
+		MLX5_SET(mkc, mkc, log_page_size, page_shift);
 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, octwords);
 
 	/* Initialize the mkey with all MTTs pointing to a default
@@ -357,6 +362,21 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	 * to the default page.
 	 */
 	switch (umr_mode) {
+	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
+		klm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+		for (i = 0; i < npages; i++) {
+			klm[i << 1] = (struct mlx5_klm) {
+				.va = cpu_to_be64(filler_addr),
+				.bcount = cpu_to_be32(xsk_chunk_size),
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+			};
+			klm[(i << 1) + 1] = (struct mlx5_klm) {
+				.va = cpu_to_be64(filler_addr),
+				.bcount = cpu_to_be32((1 << page_shift) - xsk_chunk_size),
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+			};
+		}
+		break;
 	case MLX5E_MPWRQ_UMR_MODE_UNALIGNED:
 		ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
 		for (i = 0; i < npages; i++)
@@ -415,6 +435,7 @@ static int mlx5e_create_umr_klm_mkey(struct mlx5_core_dev *mdev,
 
 static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq *rq)
 {
+	u32 xsk_chunk_size = rq->xsk_pool ? rq->xsk_pool->chunk_size : 0;
 	u32 wq_size = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
 	u32 num_entries, max_num_entries;
 	u32 umr_mkey;
@@ -432,7 +453,7 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev *mdev, struct mlx5e_rq
 
 	err = mlx5e_create_umr_mkey(mdev, num_entries, rq->mpwqe.page_shift,
 				    &umr_mkey, rq->wqe_overflow.addr,
-				    rq->mpwqe.umr_mode);
+				    rq->mpwqe.umr_mode, xsk_chunk_size);
 	rq->mpwqe.umr_mkey_be = cpu_to_be32(umr_mkey);
 	return err;
 }
-- 
cgit v1.2.3


From c6f0420468fb2e7bbe006ed492608d63a4ac9e28 Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:25 -0700
Subject: net/mlx5e: xsk: Print a warning in slow configurations

On striding RQ, when the XSK frame size doesn't match the MKey page
size, KLM is used for memory mappings, which is a slower mechanism than
MTT or KSM. It may happen in two cases:

1. Frame size is not a power of two (only possible in the unaligned mode
of XSK).

2. Frame size is 2048 bytes, and the firmware doesn't support MKey pages
smaller than 4096 bytes.

Depending on the case, print a warning and recommend to disable striding
RQ or upgrade the firmware.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
index 8b09e2f58a4d..ebada0c5af3c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/pool.c
@@ -99,6 +99,15 @@ static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
 
 	mlx5e_build_xsk_param(pool, &xsk);
 
+	if (priv->channels.params.rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
+	    mlx5e_mpwrq_umr_mode(priv->mdev, &xsk) == MLX5E_MPWRQ_UMR_MODE_OVERSIZED) {
+		const char *recommendation = is_power_of_2(xsk.chunk_size) ?
+			"Upgrade firmware" : "Disable striding RQ";
+
+		mlx5_core_warn(priv->mdev, "Expected slowdown with XSK frame size %u. %s for better performance.\n",
+			       xsk.chunk_size, recommendation);
+	}
+
 	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
 		/* XSK objects will be created on open. */
 		goto validate_closed;
-- 
cgit v1.2.3


From c2c9e31dfa4f23045f72f613c5809d5b030cd27f Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy <maximmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:26 -0700
Subject: net/mlx5e: xsk: Optimize for unaligned mode with 3072-byte frames

When XSK frame size is 3072 (or another power of two multiplied by 3),
KLM mechanism for NIC virtual memory page mapping can be optimized by
replacing it with KSM.

Before this change, two KLM entries were needed to map an XSK frame that
is not a power of two: one entry maps the UMEM memory up to the frame
length, the other maps the rest of the stride to the garbage page.

When the frame length divided by 3 is a power of two, it can be mapped
using 3 KSM entries, and the fourth will map the rest of the stride to
the garbage page. All 4 KSM entries are of the same size, which allows
for a much faster lookup.

Frame size 3072 is useful in certain use cases, because it allows
packing 4 frames into 3 pages. Generally speaking, other frame sizes
equal to PAGE_SIZE minus a power of two can be optimized in a similar
way, but it will require many more KSMs per frame, which slows down UMRs
a little bit, but more importantly may hit the limit for the maximum
number of KSM entries.

Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  1 +
 .../net/ethernet/mellanox/mlx5/core/en/params.c    | 20 ++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    | 25 ++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 18 ++++++++++++++--
 4 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7c6861d6148d..26a23047f1f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -681,6 +681,7 @@ enum mlx5e_mpwrq_umr_mode {
 	MLX5E_MPWRQ_UMR_MODE_ALIGNED,
 	MLX5E_MPWRQ_UMR_MODE_UNALIGNED,
 	MLX5E_MPWRQ_UMR_MODE_OVERSIZED,
+	MLX5E_MPWRQ_UMR_MODE_TRIPLE,
 };
 
 struct mlx5e_rq {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 203448ee9594..29dd3a04c154 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -56,8 +56,16 @@ mlx5e_mpwrq_umr_mode(struct mlx5_core_dev *mdev, struct mlx5e_xsk_param *xsk)
 	 * stride is mapped to a garbage page, resulting in two mappings of
 	 * different sizes per frame.
 	 */
-	if (oversized)
+	if (oversized) {
+		/* An optimization for frame sizes equal to 3 * power_of_two.
+		 * 3 KSMs point to the frame, and one KSM points to the garbage
+		 * page, which works faster than KLM.
+		 */
+		if (xsk->chunk_size % 3 == 0 && is_power_of_2(xsk->chunk_size / 3))
+			return MLX5E_MPWRQ_UMR_MODE_TRIPLE;
+
 		return MLX5E_MPWRQ_UMR_MODE_OVERSIZED;
+	}
 
 	/* XSK frames can start at arbitrary unaligned locations, but they all
 	 * have the same size which is a power of two. It allows to optimize to
@@ -82,6 +90,8 @@ u8 mlx5e_mpwrq_umr_entry_size(enum mlx5e_mpwrq_umr_mode mode)
 		return sizeof(struct mlx5_ksm);
 	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
 		return sizeof(struct mlx5_klm) * 2;
+	case MLX5E_MPWRQ_UMR_MODE_TRIPLE:
+		return sizeof(struct mlx5_ksm) * 4;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", mode);
 	return 0;
@@ -179,6 +189,9 @@ u32 mlx5e_mpwrq_max_num_entries(struct mlx5_core_dev *mdev,
 	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
 		/* Each entry is two KLMs. */
 		return klm_limit / 2;
+	case MLX5E_MPWRQ_UMR_MODE_TRIPLE:
+		/* Each entry is four KSMs. */
+		return klm_limit / 4;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
 	return 0;
@@ -1121,6 +1134,11 @@ static u8 mlx5e_build_icosq_log_wq_sz(struct mlx5_core_dev *mdev,
 			xsk.chunk_size -= 1;
 			max_xsk_wqebbs = max(max_xsk_wqebbs,
 				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
+
+			/* XSK unaligned mode, frame size is a triple power of two. */
+			xsk.chunk_size = (1 << frame_shift) / 4 * 3;
+			max_xsk_wqebbs = max(max_xsk_wqebbs,
+				mlx5e_mpwrq_total_umr_wqebbs(mdev, params, &xsk));
 		}
 
 		wqebbs += max_xsk_wqebbs;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index 78d746704345..c91b54d9ff27 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -58,6 +58,29 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 				.va = cpu_to_be64(addr),
 			};
 		}
+	} else if (likely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)) {
+		u32 mapping_size = 1 << (rq->mpwqe.page_shift - 2);
+
+		for (i = 0; i < batch; i++) {
+			dma_addr_t addr = xsk_buff_xdp_get_frame_dma(wi->alloc_units[i].xsk);
+
+			umr_wqe->inline_ksms[i << 2] = (struct mlx5_ksm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr),
+			};
+			umr_wqe->inline_ksms[(i << 2) + 1] = (struct mlx5_ksm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr + mapping_size),
+			};
+			umr_wqe->inline_ksms[(i << 2) + 2] = (struct mlx5_ksm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(addr + mapping_size * 2),
+			};
+			umr_wqe->inline_ksms[(i << 2) + 3] = (struct mlx5_ksm) {
+				.key = rq->mkey_be,
+				.va = cpu_to_be64(rq->wqe_overflow.addr),
+			};
+		}
 	} else {
 		__be32 pad_size = cpu_to_be32((1 << rq->mpwqe.page_shift) -
 					      rq->xsk_pool->chunk_size);
@@ -91,6 +114,8 @@ int mlx5e_xsk_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 		offset = offset * sizeof(struct mlx5_mtt) / MLX5_OCTWORD;
 	else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_OVERSIZED))
 		offset = offset * sizeof(struct mlx5_klm) * 2 / MLX5_OCTWORD;
+	else if (unlikely(rq->mpwqe.umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE))
+		offset = offset * sizeof(struct mlx5_ksm) * 4 / MLX5_OCTWORD;
 	umr_wqe->uctrl.xlt_offset = cpu_to_be16(offset);
 
 	icosq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ae728745379d..d4f03ff7b0e1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -301,6 +301,8 @@ static u8 mlx5e_mpwrq_access_mode(enum mlx5e_mpwrq_umr_mode umr_mode)
 		return MLX5_MKC_ACCESS_MODE_KSM;
 	case MLX5E_MPWRQ_UMR_MODE_OVERSIZED:
 		return MLX5_MKC_ACCESS_MODE_KLMS;
+	case MLX5E_MPWRQ_UMR_MODE_TRIPLE:
+		return MLX5_MKC_ACCESS_MODE_KSM;
 	}
 	WARN_ONCE(1, "MPWRQ UMR mode %d is not known\n", umr_mode);
 	return 0;
@@ -322,7 +324,8 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	int err;
 	int i;
 
-	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED &&
+	if ((umr_mode == MLX5E_MPWRQ_UMR_MODE_UNALIGNED ||
+	     umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE) &&
 	    !MLX5_CAP_GEN(mdev, fixed_buffer_size)) {
 		mlx5_core_warn(mdev, "Unaligned AF_XDP requires fixed_buffer_size capability\n");
 		return -EINVAL;
@@ -351,7 +354,9 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 	MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
 	MLX5_SET64(mkc, mkc, len, npages << page_shift);
 	MLX5_SET(mkc, mkc, translations_octword_size, octwords);
-	if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
+	if (umr_mode == MLX5E_MPWRQ_UMR_MODE_TRIPLE)
+		MLX5_SET(mkc, mkc, log_page_size, page_shift - 2);
+	else if (umr_mode != MLX5E_MPWRQ_UMR_MODE_OVERSIZED)
 		MLX5_SET(mkc, mkc, log_page_size, page_shift);
 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, octwords);
 
@@ -392,6 +397,15 @@ static int mlx5e_create_umr_mkey(struct mlx5_core_dev *mdev,
 				.ptag = cpu_to_be64(filler_addr),
 			};
 		break;
+	case MLX5E_MPWRQ_UMR_MODE_TRIPLE:
+		ksm = MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
+		for (i = 0; i < npages * 4; i++) {
+			ksm[i] = (struct mlx5_ksm) {
+				.key = cpu_to_be32(mdev->mlx5e_res.hw_objs.mkey),
+				.va = cpu_to_be64(filler_addr),
+			};
+		}
+		break;
 	}
 
 	err = mlx5_core_create_mkey(mdev, umr_mkey, in, inlen);
-- 
cgit v1.2.3


From 16ab85e78439bab1201ff26ba430231d1574b4ae Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:27 -0700
Subject: net/mlx5e: Expose rx_oversize_pkts_buffer counter

Add the rx_oversize_pkts_buffer counter to ethtool statistics.
This counter exposes the number of dropped received packets due to
length which arrived to RQ and exceed software buffer size allocated by
the device for incoming traffic. It might imply that the device MTU is
larger than the software buffers size.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.c | 21 ++++++++++++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/en_stats.h |  4 ++++
 include/linux/mlx5/mlx5_ifc.h                      |  8 ++++++--
 4 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index d4f03ff7b0e1..364f04309149 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3699,7 +3699,8 @@ mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
 	stats->rx_length_errors =
 		PPORT_802_3_GET(pstats, a_in_range_length_errors) +
 		PPORT_802_3_GET(pstats, a_out_of_range_length_field) +
-		PPORT_802_3_GET(pstats, a_frame_too_long_errors);
+		PPORT_802_3_GET(pstats, a_frame_too_long_errors) +
+		VNIC_ENV_GET(&priv->stats.vnic, eth_wqe_too_small);
 	stats->rx_crc_errors =
 		PPORT_802_3_GET(pstats, a_frame_check_sequence_errors);
 	stats->rx_frame_errors = PPORT_802_3_GET(pstats, a_alignment_errors);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
index 575717186912..03c1841970f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.c
@@ -641,17 +641,26 @@ static const struct counter_desc vnic_env_stats_dev_oob_desc[] = {
 		VNIC_ENV_OFF(vport_env.internal_rq_out_of_buffer) },
 };
 
+static const struct counter_desc vnic_env_stats_drop_desc[] = {
+	{ "rx_oversize_pkts_buffer",
+		VNIC_ENV_OFF(vport_env.eth_wqe_too_small) },
+};
+
 #define NUM_VNIC_ENV_STEER_COUNTERS(dev) \
 	(MLX5_CAP_GEN(dev, nic_receive_steering_discard) ? \
 	 ARRAY_SIZE(vnic_env_stats_steer_desc) : 0)
 #define NUM_VNIC_ENV_DEV_OOB_COUNTERS(dev) \
 	(MLX5_CAP_GEN(dev, vnic_env_int_rq_oob) ? \
 	 ARRAY_SIZE(vnic_env_stats_dev_oob_desc) : 0)
+#define NUM_VNIC_ENV_DROP_COUNTERS(dev) \
+	(MLX5_CAP_GEN(dev, eth_wqe_too_small) ? \
+	 ARRAY_SIZE(vnic_env_stats_drop_desc) : 0)
 
 static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(vnic_env)
 {
 	return NUM_VNIC_ENV_STEER_COUNTERS(priv->mdev) +
-		NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev);
+	       NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev) +
+	       NUM_VNIC_ENV_DROP_COUNTERS(priv->mdev);
 }
 
 static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vnic_env)
@@ -665,6 +674,11 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(vnic_env)
 	for (i = 0; i < NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev); i++)
 		strcpy(data + (idx++) * ETH_GSTRING_LEN,
 		       vnic_env_stats_dev_oob_desc[i].format);
+
+	for (i = 0; i < NUM_VNIC_ENV_DROP_COUNTERS(priv->mdev); i++)
+		strcpy(data + (idx++) * ETH_GSTRING_LEN,
+		       vnic_env_stats_drop_desc[i].format);
+
 	return idx;
 }
 
@@ -679,6 +693,11 @@ static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(vnic_env)
 	for (i = 0; i < NUM_VNIC_ENV_DEV_OOB_COUNTERS(priv->mdev); i++)
 		data[idx++] = MLX5E_READ_CTR32_BE(priv->stats.vnic.query_vnic_env_out,
 						  vnic_env_stats_dev_oob_desc, i);
+
+	for (i = 0; i < NUM_VNIC_ENV_DROP_COUNTERS(priv->mdev); i++)
+		data[idx++] = MLX5E_READ_CTR32_BE(priv->stats.vnic.query_vnic_env_out,
+						  vnic_env_stats_drop_desc, i);
+
 	return idx;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
index 99e321bfb744..9f781085be47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_stats.h
@@ -273,6 +273,10 @@ struct mlx5e_qcounter_stats {
 	u32 rx_if_down_packets;
 };
 
+#define VNIC_ENV_GET(vnic_env_stats, c) \
+	MLX5_GET(query_vnic_env_out, (vnic_env_stats)->query_vnic_env_out, \
+		 vport_env.c)
+
 struct mlx5e_vnic_env_stats {
 	__be64 query_vnic_env_out[MLX5_ST_SZ_QW(query_vnic_env_out)];
 };
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 1ad762e22d86..06574d430ff5 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1491,7 +1491,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         reserved_at_120[0xa];
 	u8         log_max_ra_req_dc[0x6];
-	u8         reserved_at_130[0x9];
+	u8         reserved_at_130[0x2];
+	u8         eth_wqe_too_small[0x1];
+	u8         reserved_at_133[0x6];
 	u8         vnic_env_cq_overrun[0x1];
 	u8         log_max_ra_res_dc[0x6];
 
@@ -3537,7 +3539,9 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits {
 
 	u8         cq_overrun[0x20];
 
-	u8         reserved_at_220[0xde0];
+	u8         eth_wqe_too_small[0x20];
+
+	u8         reserved_at_220[0xdc0];
 };
 
 struct mlx5_ifc_traffic_counter_bits {
-- 
cgit v1.2.3


From 9b98d395b85dd042fe83fb696b1ac02e6c93a520 Mon Sep 17 00:00:00 2001
From: Moshe Shemesh <moshe@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:28 -0700
Subject: net/mlx5: Start health poll at earlier stage of driver load

Start health poll at earlier stage, so if fw fatal issue occurred before
or during initialization commands such as init_hca or set_hca_cap the
poll health can detect and indicate that the driver is already in error
state.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 11 ++++++++---
 drivers/net/ethernet/mellanox/mlx5/core/main.c   | 17 ++++++++++-------
 include/linux/mlx5/driver.h                      |  1 +
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 59205ba2ef7b..5bfc54a10621 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -843,9 +843,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 
 	health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
 	add_timer(&health->timer);
-
-	if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
-		queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
 }
 
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
@@ -862,6 +859,14 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
 	del_timer_sync(&health->timer);
 }
 
+void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
+		queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
+}
+
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index daa7442f31c9..0b459d841c3a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1092,7 +1092,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 	mlx5_devcom_unregister_device(dev->priv.devcom);
 }
 
-static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
+static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, u64 timeout)
 {
 	int err;
 
@@ -1130,10 +1130,12 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
 
 	mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP);
 
+	mlx5_start_health_poll(dev);
+
 	err = mlx5_core_enable_hca(dev, 0);
 	if (err) {
 		mlx5_core_err(dev, "enable hca failed\n");
-		goto err_cmd_cleanup;
+		goto stop_health_poll;
 	}
 
 	err = mlx5_core_set_issi(dev);
@@ -1185,8 +1187,7 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
 		mlx5_core_err(dev, "query hca failed\n");
 		goto reclaim_boot_pages;
 	}
-
-	mlx5_start_health_poll(dev);
+	mlx5_start_health_fw_log_up(dev);
 
 	return 0;
 
@@ -1194,6 +1195,8 @@ reclaim_boot_pages:
 	mlx5_reclaim_startup_pages(dev);
 err_disable_hca:
 	mlx5_core_disable_hca(dev, 0);
+stop_health_poll:
+	mlx5_stop_health_poll(dev, boot);
 err_cmd_cleanup:
 	mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
 	mlx5_cmd_cleanup(dev);
@@ -1205,7 +1208,6 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
 {
 	int err;
 
-	mlx5_stop_health_poll(dev, boot);
 	err = mlx5_cmd_teardown_hca(dev);
 	if (err) {
 		mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n");
@@ -1213,6 +1215,7 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
 	}
 	mlx5_reclaim_startup_pages(dev);
 	mlx5_core_disable_hca(dev, 0);
+	mlx5_stop_health_poll(dev, boot);
 	mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
 	mlx5_cmd_cleanup(dev);
 
@@ -1362,7 +1365,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 	mutex_lock(&dev->intf_state_mutex);
 	dev->state = MLX5_DEVICE_STATE_UP;
 
-	err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
+	err = mlx5_function_setup(dev, true, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
 	if (err)
 		goto err_function;
 
@@ -1450,7 +1453,7 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
 		timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
 	else
 		timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
-	err = mlx5_function_setup(dev, timeout);
+	err = mlx5_function_setup(dev, false, timeout);
 	if (err)
 		goto err_function;
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 285f301a6390..a12929bc31b2 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
+void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
 int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
-- 
cgit v1.2.3


From 1330bd9884efc49f5b5ca854cf1185f1bec705d0 Mon Sep 17 00:00:00 2001
From: Maher Sanalla <msanalla@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:29 -0700
Subject: net/mlx5: Set default grace period based on function type

Currently, driver sets the same grace period for fw fatal health reporter
to any type of function.

Since the lower level functions are more vulnerable to fw fatal errors as a
result of parent function closure/reload, set a smaller grace period for
the lower level functions, as follows:

1. For ECPF: 180 seconds.
2. For PF: 60 seconds.
3. For VF/SF: 30 seconds.

Signed-off-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 5bfc54a10621..86ed87d704f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -702,11 +702,25 @@ static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
 		.dump = mlx5_fw_fatal_reporter_dump,
 };
 
-#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
+#define MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD 180000
+#define MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD 60000
+#define MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD 30000
+#define MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD MLX5_FW_REPORTER_VF_GRACEFUL_PERIOD
+
 static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
 	struct devlink *devlink = priv_to_devlink(dev);
+	u64 grace_period;
+
+	if (mlx5_core_is_ecpf(dev)) {
+		grace_period = MLX5_FW_REPORTER_ECPF_GRACEFUL_PERIOD;
+	} else if (mlx5_core_is_pf(dev)) {
+		grace_period = MLX5_FW_REPORTER_PF_GRACEFUL_PERIOD;
+	} else {
+		/* VF or SF */
+		grace_period = MLX5_FW_REPORTER_DEFAULT_GRACEFUL_PERIOD;
+	}
 
 	health->fw_reporter =
 		devlink_health_reporter_create(devlink, &mlx5_fw_reporter_ops,
@@ -718,7 +732,7 @@ static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
 	health->fw_fatal_reporter =
 		devlink_health_reporter_create(devlink,
 					       &mlx5_fw_fatal_reporter_ops,
-					       MLX5_REPORTER_FW_GRACEFUL_PERIOD,
+					       grace_period,
 					       dev);
 	if (IS_ERR(health->fw_fatal_reporter))
 		mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n",
-- 
cgit v1.2.3


From 8c9cc1eb90c13ee3ec2a8a52af4e564a9b161047 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:30 -0700
Subject: net/mlx5: E-Switch, Allow offloading fwd dest flow table with vport

Before this commit a fwd dest flow table resulted in ignoring vport dests
which is incorrect and is supported.
With this commit the dests can be a mix of flow table and vport dests.
There is still a limitation that there cannot be more than one flow table dest.

Signed-off-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Maor Dickman <maord@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch_offloads.c   | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c98c6af21581..4e50df3139c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -483,25 +483,27 @@ esw_setup_dests(struct mlx5_flow_destination *dest,
 	    !(attr->flags & MLX5_ATTR_FLAG_SLOW_PATH)) {
 		esw_setup_sampler_dest(dest, flow_act, attr->sample_attr.sampler_id, *i);
 		(*i)++;
-	} else if (attr->dest_ft) {
-		esw_setup_ft_dest(dest, flow_act, esw, attr, spec, *i);
-		(*i)++;
 	} else if (attr->flags & MLX5_ATTR_FLAG_SLOW_PATH) {
 		esw_setup_slow_path_dest(dest, flow_act, esw, *i);
 		(*i)++;
 	} else if (attr->flags & MLX5_ATTR_FLAG_ACCEPT) {
 		esw_setup_accept_dest(dest, flow_act, chains, *i);
 		(*i)++;
-	} else if (attr->dest_chain) {
-		err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain,
-					   1, 0, *i);
-		(*i)++;
 	} else if (esw_is_indir_table(esw, attr)) {
 		err = esw_setup_indir_table(dest, flow_act, esw, attr, spec, true, i);
 	} else if (esw_is_chain_src_port_rewrite(esw, esw_attr)) {
 		err = esw_setup_chain_src_port_rewrite(dest, flow_act, esw, chains, attr, i);
 	} else {
 		*i = esw_setup_vport_dests(dest, flow_act, esw, esw_attr, *i);
+
+		if (attr->dest_ft) {
+			err = esw_setup_ft_dest(dest, flow_act, esw, attr, spec, *i);
+			(*i)++;
+		} else if (attr->dest_chain) {
+			err = esw_setup_chain_dest(dest, flow_act, chains, attr->dest_chain,
+						   1, 0, *i);
+			(*i)++;
+		}
 	}
 
 	return err;
-- 
cgit v1.2.3


From 909ffe462a18041f064656b796999bb524c72a66 Mon Sep 17 00:00:00 2001
From: Chris Mi <cmi@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:31 -0700
Subject: net/mlx5: E-switch, Don't update group if qos is not enabled

Currently, qos group will be updated and qos will be enabled when
unregistering devlink port. Actually no need to update group if qos
is not enabled.

Add a check to prevent unnecessary enabling and disabling qos for
every port.

Signed-off-by: Chris Mi <cmi@nvidia.com>
Reviewed-by: Dmytro Linkin <dlinkin@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 694c54066955..4f8a24d84a86 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -924,12 +924,16 @@ int mlx5_esw_qos_vport_update_group(struct mlx5_eswitch *esw,
 				    struct mlx5_esw_rate_group *group,
 				    struct netlink_ext_ack *extack)
 {
-	int err;
+	int err = 0;
 
 	mutex_lock(&esw->state_lock);
+	if (!vport->qos.enabled && !group)
+		goto unlock;
+
 	err = esw_qos_vport_enable(esw, vport, 0, 0, extack);
 	if (!err)
 		err = esw_qos_vport_update_group(esw, vport, group, extack);
+unlock:
 	mutex_unlock(&esw->state_lock);
 	return err;
 }
-- 
cgit v1.2.3


From 794131c40850a9c68ba9955aa7749e92b903d73f Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Sat, 1 Oct 2022 21:56:32 -0700
Subject: net/mlx5: E-Switch, Return EBUSY if can't get mode lock

It is to avoid tc retrying during device mode change.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 0872a214d2a3..70a7a61f9708 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -4463,7 +4463,7 @@ int mlx5e_configure_flower(struct net_device *dev, struct mlx5e_priv *priv,
 	int err = 0;
 
 	if (!mlx5_esw_hold(priv->mdev))
-		return -EAGAIN;
+		return -EBUSY;
 
 	mlx5_esw_get(priv->mdev);
 
-- 
cgit v1.2.3


From 899b8cd0d3922f57176f5a7f552ce93d8a5cd90b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Sun, 2 Oct 2022 10:56:50 -0700
Subject: eth: octeon: fix build after netif_napi_add() changes

Guenter reports I missed a netif_napi_add() call
in one of the platform-specific drivers:

drivers/net/ethernet/cavium/octeon/octeon_mgmt.c: In function 'octeon_mgmt_probe':
drivers/net/ethernet/cavium/octeon/octeon_mgmt.c:1399:9: error: too many arguments to function 'netif_napi_add'
 1399 |         netif_napi_add(netdev, &p->napi, octeon_mgmt_napi_poll,
      |         ^~~~~~~~~~~~~~

Reported-by: Guenter Roeck <linux@roeck-us.net>
Fixes: b48b89f9c189 ("net: drop the weight argument from netif_napi_add")
Link: https://lore.kernel.org/r/20221002175650.1491124-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/cavium/octeon/octeon_mgmt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
index 369bfd376d6f..edde0b8fa49c 100644
--- a/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
+++ b/drivers/net/ethernet/cavium/octeon/octeon_mgmt.c
@@ -1396,8 +1396,8 @@ static int octeon_mgmt_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, netdev);
 	p = netdev_priv(netdev);
-	netif_napi_add(netdev, &p->napi, octeon_mgmt_napi_poll,
-		       OCTEON_MGMT_NAPI_WEIGHT);
+	netif_napi_add_weight(netdev, &p->napi, octeon_mgmt_napi_poll,
+			      OCTEON_MGMT_NAPI_WEIGHT);
 
 	p->netdev = netdev;
 	p->dev = &pdev->dev;
-- 
cgit v1.2.3


From 0a23ae23717156f1cebb8dbc3ad68765765d7bd7 Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:09 +0300
Subject: net: marvell: prestera: Add router nexthops ABI

- Add functions to allocate/delete/set nexthop group
  - NOTE: non-ECMP nexthop is nexthop group with allocated size = 1
- Add function to read state of HW nh (if packets going through it)

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera.h   |   5 +
 .../net/ethernet/marvell/prestera/prestera_hw.c    | 130 ++++++++
 .../net/ethernet/marvell/prestera/prestera_hw.h    |  11 +
 .../ethernet/marvell/prestera/prestera_router.c    |  16 +-
 .../ethernet/marvell/prestera/prestera_router_hw.c | 354 ++++++++++++++++++++-
 .../ethernet/marvell/prestera/prestera_router_hw.h |  74 ++++-
 6 files changed, 582 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h
index e5a4381a88b3..903e2e13e687 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera.h
@@ -306,17 +306,22 @@ struct prestera_switch {
 	struct prestera_counter *counter;
 	u8 lag_member_max;
 	u8 lag_max;
+	u32 size_tbl_router_nexthop;
 };
 
 struct prestera_router {
 	struct prestera_switch *sw;
 	struct list_head vr_list;
 	struct list_head rif_entry_list;
+	struct rhashtable nh_neigh_ht;
+	struct rhashtable nexthop_group_ht;
 	struct rhashtable fib_ht;
 	struct rhashtable kern_fib_cache_ht;
 	struct notifier_block inetaddr_nb;
 	struct notifier_block inetaddr_valid_nb;
 	struct notifier_block fib_nb;
+	u8 *nhgrp_hw_state_cache; /* Bitmap cached hw state of nhs */
+	unsigned long nhgrp_hw_cache_kick; /* jiffies */
 };
 
 struct prestera_rxtx_params {
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_hw.c
index 5803a28050e1..fc6f7d2746e8 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_hw.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.c
@@ -10,11 +10,14 @@
 #include "prestera_hw.h"
 #include "prestera_acl.h"
 #include "prestera_counter.h"
+#include "prestera_router_hw.h"
 
 #define PRESTERA_SWITCH_INIT_TIMEOUT_MS (30 * 1000)
 
 #define PRESTERA_MIN_MTU 64
 
+#define PRESTERA_MSG_CHUNK_SIZE 1024
+
 enum prestera_cmd_type_t {
 	PRESTERA_CMD_TYPE_SWITCH_INIT = 0x1,
 	PRESTERA_CMD_TYPE_SWITCH_ATTR_SET = 0x2,
@@ -57,6 +60,10 @@ enum prestera_cmd_type_t {
 	PRESTERA_CMD_TYPE_ROUTER_RIF_DELETE = 0x601,
 	PRESTERA_CMD_TYPE_ROUTER_LPM_ADD = 0x610,
 	PRESTERA_CMD_TYPE_ROUTER_LPM_DELETE = 0x611,
+	PRESTERA_CMD_TYPE_ROUTER_NH_GRP_SET = 0x622,
+	PRESTERA_CMD_TYPE_ROUTER_NH_GRP_BLK_GET = 0x645,
+	PRESTERA_CMD_TYPE_ROUTER_NH_GRP_ADD = 0x623,
+	PRESTERA_CMD_TYPE_ROUTER_NH_GRP_DELETE = 0x624,
 	PRESTERA_CMD_TYPE_ROUTER_VR_CREATE = 0x630,
 	PRESTERA_CMD_TYPE_ROUTER_VR_DELETE = 0x631,
 
@@ -542,6 +549,14 @@ struct prestera_msg_ip_addr {
 	u8 __pad[3];
 };
 
+struct prestera_msg_nh {
+	struct prestera_msg_iface oif;
+	__le32 hw_id;
+	u8 mac[ETH_ALEN];
+	u8 is_active;
+	u8 pad;
+};
+
 struct prestera_msg_rif_req {
 	struct prestera_msg_cmd cmd;
 	struct prestera_msg_iface iif;
@@ -567,6 +582,34 @@ struct prestera_msg_lpm_req {
 	u8 __pad[2];
 };
 
+struct prestera_msg_nh_req {
+	struct prestera_msg_cmd cmd;
+	struct prestera_msg_nh nh[PRESTERA_NHGR_SIZE_MAX];
+	__le32 size;
+	__le32 grp_id;
+};
+
+struct prestera_msg_nh_chunk_req {
+	struct prestera_msg_cmd cmd;
+	__le32 offset;
+};
+
+struct prestera_msg_nh_chunk_resp {
+	struct prestera_msg_ret ret;
+	u8 hw_state[PRESTERA_MSG_CHUNK_SIZE];
+};
+
+struct prestera_msg_nh_grp_req {
+	struct prestera_msg_cmd cmd;
+	__le32 grp_id;
+	__le32 size;
+};
+
+struct prestera_msg_nh_grp_resp {
+	struct prestera_msg_ret ret;
+	__le32 grp_id;
+};
+
 struct prestera_msg_vr_req {
 	struct prestera_msg_cmd cmd;
 	__le16 vr_id;
@@ -729,11 +772,15 @@ static void prestera_hw_build_tests(void)
 	BUILD_BUG_ON(sizeof(struct prestera_msg_flood_domain_ports_reset_req) != 8);
 	BUILD_BUG_ON(sizeof(struct prestera_msg_mdb_create_req) != 16);
 	BUILD_BUG_ON(sizeof(struct prestera_msg_mdb_destroy_req) != 16);
+	BUILD_BUG_ON(sizeof(struct prestera_msg_nh_req) != 124);
+	BUILD_BUG_ON(sizeof(struct prestera_msg_nh_chunk_req) != 8);
+	BUILD_BUG_ON(sizeof(struct prestera_msg_nh_grp_req) != 12);
 
 	/*  structure that are part of req/resp fw messages */
 	BUILD_BUG_ON(sizeof(struct prestera_msg_iface) != 16);
 	BUILD_BUG_ON(sizeof(struct prestera_msg_ip_addr) != 20);
 	BUILD_BUG_ON(sizeof(struct prestera_msg_flood_domain_port) != 12);
+	BUILD_BUG_ON(sizeof(struct prestera_msg_nh) != 28);
 
 	/* check responses */
 	BUILD_BUG_ON(sizeof(struct prestera_msg_common_resp) != 8);
@@ -750,6 +797,8 @@ static void prestera_hw_build_tests(void)
 	BUILD_BUG_ON(sizeof(struct prestera_msg_vr_resp) != 12);
 	BUILD_BUG_ON(sizeof(struct prestera_msg_policer_resp) != 12);
 	BUILD_BUG_ON(sizeof(struct prestera_msg_flood_domain_create_resp) != 12);
+	BUILD_BUG_ON(sizeof(struct prestera_msg_nh_chunk_resp) != 1032);
+	BUILD_BUG_ON(sizeof(struct prestera_msg_nh_grp_resp) != 12);
 
 	/* check events */
 	BUILD_BUG_ON(sizeof(struct prestera_msg_event_port) != 20);
@@ -1027,6 +1076,8 @@ int prestera_hw_switch_init(struct prestera_switch *sw)
 	sw->id = resp.switch_id;
 	sw->lag_member_max = resp.lag_member_max;
 	sw->lag_max = resp.lag_max;
+	sw->size_tbl_router_nexthop =
+		__le32_to_cpu(resp.size_tbl_router_nexthop);
 
 	return 0;
 }
@@ -2037,6 +2088,85 @@ int prestera_hw_lpm_del(struct prestera_switch *sw, u16 vr_id,
 			    sizeof(req));
 }
 
+int prestera_hw_nh_entries_set(struct prestera_switch *sw, int count,
+			       struct prestera_neigh_info *nhs, u32 grp_id)
+{
+	struct prestera_msg_nh_req req = { .size = __cpu_to_le32((u32)count),
+			.grp_id = __cpu_to_le32(grp_id) };
+	int i, err;
+
+	for (i = 0; i < count; i++) {
+		req.nh[i].is_active = nhs[i].connected;
+		memcpy(&req.nh[i].mac, nhs[i].ha, ETH_ALEN);
+		err = prestera_iface_to_msg(&nhs[i].iface, &req.nh[i].oif);
+		if (err)
+			return err;
+	}
+
+	return prestera_cmd(sw, PRESTERA_CMD_TYPE_ROUTER_NH_GRP_SET, &req.cmd,
+			    sizeof(req));
+}
+
+int prestera_hw_nhgrp_blk_get(struct prestera_switch *sw,
+			      u8 *hw_state, u32 buf_size /* Buffer in bytes */)
+{
+	static struct prestera_msg_nh_chunk_resp resp;
+	struct prestera_msg_nh_chunk_req req;
+	u32 buf_offset;
+	int err;
+
+	memset(&hw_state[0], 0, buf_size);
+	buf_offset = 0;
+	while (1) {
+		if (buf_offset >= buf_size)
+			break;
+
+		memset(&req, 0, sizeof(req));
+		req.offset = __cpu_to_le32(buf_offset * 8); /* 8 bits in u8 */
+		err = prestera_cmd_ret(sw,
+				       PRESTERA_CMD_TYPE_ROUTER_NH_GRP_BLK_GET,
+				       &req.cmd, sizeof(req), &resp.ret,
+				       sizeof(resp));
+		if (err)
+			return err;
+
+		memcpy(&hw_state[buf_offset], &resp.hw_state[0],
+		       buf_offset + PRESTERA_MSG_CHUNK_SIZE > buf_size ?
+			buf_size - buf_offset : PRESTERA_MSG_CHUNK_SIZE);
+		buf_offset += PRESTERA_MSG_CHUNK_SIZE;
+	}
+
+	return 0;
+}
+
+int prestera_hw_nh_group_create(struct prestera_switch *sw, u16 nh_count,
+				u32 *grp_id)
+{
+	struct prestera_msg_nh_grp_req req = { .size = __cpu_to_le32((u32)nh_count) };
+	struct prestera_msg_nh_grp_resp resp;
+	int err;
+
+	err = prestera_cmd_ret(sw, PRESTERA_CMD_TYPE_ROUTER_NH_GRP_ADD,
+			       &req.cmd, sizeof(req), &resp.ret, sizeof(resp));
+	if (err)
+		return err;
+
+	*grp_id = __le32_to_cpu(resp.grp_id);
+	return err;
+}
+
+int prestera_hw_nh_group_delete(struct prestera_switch *sw, u16 nh_count,
+				u32 grp_id)
+{
+	struct prestera_msg_nh_grp_req req = {
+	    .grp_id = __cpu_to_le32(grp_id),
+	    .size = __cpu_to_le32(nh_count)
+	};
+
+	return prestera_cmd(sw, PRESTERA_CMD_TYPE_ROUTER_NH_GRP_DELETE,
+			    &req.cmd, sizeof(req));
+}
+
 int prestera_hw_rxtx_init(struct prestera_switch *sw,
 			  struct prestera_rxtx_params *params)
 {
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_hw.h
index 21078a2256b2..0a929279e1ce 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_hw.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_hw.h
@@ -146,6 +146,7 @@ struct prestera_counter_stats;
 struct prestera_iface;
 struct prestera_flood_domain;
 struct prestera_mdb_entry;
+struct prestera_neigh_info;
 
 /* Switch API */
 int prestera_hw_switch_init(struct prestera_switch *sw);
@@ -266,6 +267,16 @@ int prestera_hw_lpm_add(struct prestera_switch *sw, u16 vr_id,
 int prestera_hw_lpm_del(struct prestera_switch *sw, u16 vr_id,
 			__be32 dst, u32 dst_len);
 
+/* NH API */
+int prestera_hw_nh_entries_set(struct prestera_switch *sw, int count,
+			       struct prestera_neigh_info *nhs, u32 grp_id);
+int prestera_hw_nhgrp_blk_get(struct prestera_switch *sw,
+			      u8 *hw_state, u32 buf_size /* Buffer in bytes */);
+int prestera_hw_nh_group_create(struct prestera_switch *sw, u16 nh_count,
+				u32 *grp_id);
+int prestera_hw_nh_group_delete(struct prestera_switch *sw, u16 nh_count,
+				u32 grp_id);
+
 /* Event handlers */
 int prestera_hw_event_handler_register(struct prestera_switch *sw,
 				       enum prestera_event_type type,
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index 58f4e44d5ad7..a8548b9f9cf1 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -191,7 +191,7 @@ static int __prestera_k_arb_f_lpm_set(struct prestera_switch *sw,
 		return 0;
 
 	fib_node = prestera_fib_node_create(sw, &fc->lpm_info.fib_key,
-					    fc->lpm_info.fib_type);
+					    fc->lpm_info.fib_type, NULL);
 
 	if (!fib_node) {
 		dev_err(sw->dev->dev, "fib_node=NULL %pI4n/%d kern_tb_id = %d",
@@ -220,6 +220,8 @@ static int __prestera_k_arb_fc_apply(struct prestera_switch *sw,
 	}
 
 	switch (fc->lpm_info.fib_type) {
+	case PRESTERA_FIB_TYPE_UC_NH:
+		break;
 	case PRESTERA_FIB_TYPE_TRAP:
 		__prestera_k_arb_fib_lpm_offload_set(sw, fc, false,
 						     false, fc->reachable);
@@ -537,7 +539,7 @@ static int __prestera_router_fib_event(struct notifier_block *nb,
 int prestera_router_init(struct prestera_switch *sw)
 {
 	struct prestera_router *router;
-	int err;
+	int err, nhgrp_cache_bytes;
 
 	router = kzalloc(sizeof(*sw->router), GFP_KERNEL);
 	if (!router)
@@ -555,6 +557,13 @@ int prestera_router_init(struct prestera_switch *sw)
 	if (err)
 		goto err_kern_fib_cache_ht_init;
 
+	nhgrp_cache_bytes = sw->size_tbl_router_nexthop / 8 + 1;
+	router->nhgrp_hw_state_cache = kzalloc(nhgrp_cache_bytes, GFP_KERNEL);
+	if (!router->nhgrp_hw_state_cache) {
+		err = -ENOMEM;
+		goto err_nh_state_cache_alloc;
+	}
+
 	router->inetaddr_valid_nb.notifier_call = __prestera_inetaddr_valid_cb;
 	err = register_inetaddr_validator_notifier(&router->inetaddr_valid_nb);
 	if (err)
@@ -578,6 +587,8 @@ err_register_fib_notifier:
 err_register_inetaddr_notifier:
 	unregister_inetaddr_validator_notifier(&router->inetaddr_valid_nb);
 err_register_inetaddr_validator_notifier:
+	kfree(router->nhgrp_hw_state_cache);
+err_nh_state_cache_alloc:
 	rhashtable_destroy(&router->kern_fib_cache_ht);
 err_kern_fib_cache_ht_init:
 	prestera_router_hw_fini(sw);
@@ -591,6 +602,7 @@ void prestera_router_fini(struct prestera_switch *sw)
 	unregister_fib_notifier(&init_net, &sw->router->fib_nb);
 	unregister_inetaddr_notifier(&sw->router->inetaddr_nb);
 	unregister_inetaddr_validator_notifier(&sw->router->inetaddr_valid_nb);
+	kfree(sw->router->nhgrp_hw_state_cache);
 	rhashtable_destroy(&sw->router->kern_fib_cache_ht);
 	prestera_router_hw_fini(sw);
 	kfree(sw->router);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
index 5b0cf3be9a9e..db9d2e9d9904 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
@@ -8,10 +8,16 @@
 #include "prestera_router_hw.h"
 #include "prestera_acl.h"
 
-/*            +--+
- *   +------->|vr|<-+
- *   |        +--+  |
- *   |              |
+/*                                Nexthop is pointed
+ *                                to port (not rif)
+ *                                +-------+
+ *                              +>|nexthop|
+ *                              | +-------+
+ *                              |
+ *            +--+        +-----++
+ *   +------->|vr|<-+   +>|nh_grp|
+ *   |        +--+  |   | +------+
+ *   |              |   |
  * +-+-------+   +--+---+-+
  * |rif_entry|   |fib_node|
  * +---------+   +--------+
@@ -23,6 +29,8 @@
 
 #define PRESTERA_NHGR_UNUSED (0)
 #define PRESTERA_NHGR_DROP (0xFFFFFFFF)
+/* Need to merge it with router_manager */
+#define PRESTERA_NH_ACTIVE_JIFFER_FILTER 3000 /* ms */
 
 static const struct rhashtable_params __prestera_fib_ht_params = {
 	.key_offset  = offsetof(struct prestera_fib_node, key),
@@ -31,10 +39,44 @@ static const struct rhashtable_params __prestera_fib_ht_params = {
 	.automatic_shrinking = true,
 };
 
+static const struct rhashtable_params __prestera_nh_neigh_ht_params = {
+	.key_offset  = offsetof(struct prestera_nh_neigh, key),
+	.key_len     = sizeof(struct prestera_nh_neigh_key),
+	.head_offset = offsetof(struct prestera_nh_neigh, ht_node),
+};
+
+static const struct rhashtable_params __prestera_nexthop_group_ht_params = {
+	.key_offset  = offsetof(struct prestera_nexthop_group, key),
+	.key_len     = sizeof(struct prestera_nexthop_group_key),
+	.head_offset = offsetof(struct prestera_nexthop_group, ht_node),
+};
+
+static int prestera_nexthop_group_set(struct prestera_switch *sw,
+				      struct prestera_nexthop_group *nh_grp);
+static bool
+prestera_nexthop_group_util_hw_state(struct prestera_switch *sw,
+				     struct prestera_nexthop_group *nh_grp);
+
+/* TODO: move to router.h as macros */
+static bool prestera_nh_neigh_key_is_valid(struct prestera_nh_neigh_key *key)
+{
+	return memchr_inv(key, 0, sizeof(*key)) ? true : false;
+}
+
 int prestera_router_hw_init(struct prestera_switch *sw)
 {
 	int err;
 
+	err = rhashtable_init(&sw->router->nh_neigh_ht,
+			      &__prestera_nh_neigh_ht_params);
+	if (err)
+		goto err_nh_neigh_ht_init;
+
+	err = rhashtable_init(&sw->router->nexthop_group_ht,
+			      &__prestera_nexthop_group_ht_params);
+	if (err)
+		goto err_nexthop_grp_ht_init;
+
 	err = rhashtable_init(&sw->router->fib_ht,
 			      &__prestera_fib_ht_params);
 	if (err)
@@ -43,7 +85,13 @@ int prestera_router_hw_init(struct prestera_switch *sw)
 	INIT_LIST_HEAD(&sw->router->vr_list);
 	INIT_LIST_HEAD(&sw->router->rif_entry_list);
 
+	return 0;
+
 err_fib_ht_init:
+	rhashtable_destroy(&sw->router->nexthop_group_ht);
+err_nexthop_grp_ht_init:
+	rhashtable_destroy(&sw->router->nh_neigh_ht);
+err_nh_neigh_ht_init:
 	return 0;
 }
 
@@ -52,6 +100,8 @@ void prestera_router_hw_fini(struct prestera_switch *sw)
 	WARN_ON(!list_empty(&sw->router->vr_list));
 	WARN_ON(!list_empty(&sw->router->rif_entry_list));
 	rhashtable_destroy(&sw->router->fib_ht);
+	rhashtable_destroy(&sw->router->nexthop_group_ht);
+	rhashtable_destroy(&sw->router->nh_neigh_ht);
 }
 
 static struct prestera_vr *__prestera_vr_find(struct prestera_switch *sw,
@@ -232,6 +282,286 @@ err_kzalloc:
 	return NULL;
 }
 
+static void __prestera_nh_neigh_destroy(struct prestera_switch *sw,
+					struct prestera_nh_neigh *neigh)
+{
+	rhashtable_remove_fast(&sw->router->nh_neigh_ht,
+			       &neigh->ht_node,
+			       __prestera_nh_neigh_ht_params);
+	kfree(neigh);
+}
+
+static struct prestera_nh_neigh *
+__prestera_nh_neigh_create(struct prestera_switch *sw,
+			   struct prestera_nh_neigh_key *key)
+{
+	struct prestera_nh_neigh *neigh;
+	int err;
+
+	neigh = kzalloc(sizeof(*neigh), GFP_KERNEL);
+	if (!neigh)
+		goto err_kzalloc;
+
+	memcpy(&neigh->key, key, sizeof(*key));
+	neigh->info.connected = false;
+	INIT_LIST_HEAD(&neigh->nexthop_group_list);
+	err = rhashtable_insert_fast(&sw->router->nh_neigh_ht,
+				     &neigh->ht_node,
+				     __prestera_nh_neigh_ht_params);
+	if (err)
+		goto err_rhashtable_insert;
+
+	return neigh;
+
+err_rhashtable_insert:
+	kfree(neigh);
+err_kzalloc:
+	return NULL;
+}
+
+struct prestera_nh_neigh *
+prestera_nh_neigh_find(struct prestera_switch *sw,
+		       struct prestera_nh_neigh_key *key)
+{
+	struct prestera_nh_neigh *nh_neigh;
+
+	nh_neigh = rhashtable_lookup_fast(&sw->router->nh_neigh_ht,
+					  key, __prestera_nh_neigh_ht_params);
+	return IS_ERR(nh_neigh) ? NULL : nh_neigh;
+}
+
+struct prestera_nh_neigh *
+prestera_nh_neigh_get(struct prestera_switch *sw,
+		      struct prestera_nh_neigh_key *key)
+{
+	struct prestera_nh_neigh *neigh;
+
+	neigh = prestera_nh_neigh_find(sw, key);
+	if (!neigh)
+		return __prestera_nh_neigh_create(sw, key);
+
+	return neigh;
+}
+
+void prestera_nh_neigh_put(struct prestera_switch *sw,
+			   struct prestera_nh_neigh *neigh)
+{
+	if (list_empty(&neigh->nexthop_group_list))
+		__prestera_nh_neigh_destroy(sw, neigh);
+}
+
+/* Updates new prestera_neigh_info */
+int prestera_nh_neigh_set(struct prestera_switch *sw,
+			  struct prestera_nh_neigh *neigh)
+{
+	struct prestera_nh_neigh_head *nh_head;
+	struct prestera_nexthop_group *nh_grp;
+	int err;
+
+	list_for_each_entry(nh_head, &neigh->nexthop_group_list, head) {
+		nh_grp = nh_head->this;
+		err = prestera_nexthop_group_set(sw, nh_grp);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+bool prestera_nh_neigh_util_hw_state(struct prestera_switch *sw,
+				     struct prestera_nh_neigh *nh_neigh)
+{
+	bool state;
+	struct prestera_nh_neigh_head *nh_head, *tmp;
+
+	state = false;
+	list_for_each_entry_safe(nh_head, tmp,
+				 &nh_neigh->nexthop_group_list, head) {
+		state = prestera_nexthop_group_util_hw_state(sw, nh_head->this);
+		if (state)
+			goto out;
+	}
+
+out:
+	return state;
+}
+
+static struct prestera_nexthop_group *
+__prestera_nexthop_group_create(struct prestera_switch *sw,
+				struct prestera_nexthop_group_key *key)
+{
+	struct prestera_nexthop_group *nh_grp;
+	struct prestera_nh_neigh *nh_neigh;
+	int nh_cnt, err, gid;
+
+	nh_grp = kzalloc(sizeof(*nh_grp), GFP_KERNEL);
+	if (!nh_grp)
+		goto err_kzalloc;
+
+	memcpy(&nh_grp->key, key, sizeof(*key));
+	for (nh_cnt = 0; nh_cnt < PRESTERA_NHGR_SIZE_MAX; nh_cnt++) {
+		if (!prestera_nh_neigh_key_is_valid(&nh_grp->key.neigh[nh_cnt]))
+			break;
+
+		nh_neigh = prestera_nh_neigh_get(sw,
+						 &nh_grp->key.neigh[nh_cnt]);
+		if (!nh_neigh)
+			goto err_nh_neigh_get;
+
+		nh_grp->nh_neigh_head[nh_cnt].neigh = nh_neigh;
+		nh_grp->nh_neigh_head[nh_cnt].this = nh_grp;
+		list_add(&nh_grp->nh_neigh_head[nh_cnt].head,
+			 &nh_neigh->nexthop_group_list);
+	}
+
+	err = prestera_hw_nh_group_create(sw, nh_cnt, &nh_grp->grp_id);
+	if (err)
+		goto err_nh_group_create;
+
+	err = prestera_nexthop_group_set(sw, nh_grp);
+	if (err)
+		goto err_nexthop_group_set;
+
+	err = rhashtable_insert_fast(&sw->router->nexthop_group_ht,
+				     &nh_grp->ht_node,
+				     __prestera_nexthop_group_ht_params);
+	if (err)
+		goto err_ht_insert;
+
+	/* reset cache for created group */
+	gid = nh_grp->grp_id;
+	sw->router->nhgrp_hw_state_cache[gid / 8] &= ~BIT(gid % 8);
+
+	return nh_grp;
+
+err_ht_insert:
+err_nexthop_group_set:
+	prestera_hw_nh_group_delete(sw, nh_cnt, nh_grp->grp_id);
+err_nh_group_create:
+err_nh_neigh_get:
+	for (nh_cnt--; nh_cnt >= 0; nh_cnt--) {
+		list_del(&nh_grp->nh_neigh_head[nh_cnt].head);
+		prestera_nh_neigh_put(sw, nh_grp->nh_neigh_head[nh_cnt].neigh);
+	}
+
+	kfree(nh_grp);
+err_kzalloc:
+	return NULL;
+}
+
+static void
+__prestera_nexthop_group_destroy(struct prestera_switch *sw,
+				 struct prestera_nexthop_group *nh_grp)
+{
+	struct prestera_nh_neigh *nh_neigh;
+	int nh_cnt;
+
+	rhashtable_remove_fast(&sw->router->nexthop_group_ht,
+			       &nh_grp->ht_node,
+			       __prestera_nexthop_group_ht_params);
+
+	for (nh_cnt = 0; nh_cnt < PRESTERA_NHGR_SIZE_MAX; nh_cnt++) {
+		nh_neigh = nh_grp->nh_neigh_head[nh_cnt].neigh;
+		if (!nh_neigh)
+			break;
+
+		list_del(&nh_grp->nh_neigh_head[nh_cnt].head);
+		prestera_nh_neigh_put(sw, nh_neigh);
+	}
+
+	prestera_hw_nh_group_delete(sw, nh_cnt, nh_grp->grp_id);
+	kfree(nh_grp);
+}
+
+static struct prestera_nexthop_group *
+__prestera_nexthop_group_find(struct prestera_switch *sw,
+			      struct prestera_nexthop_group_key *key)
+{
+	struct prestera_nexthop_group *nh_grp;
+
+	nh_grp = rhashtable_lookup_fast(&sw->router->nexthop_group_ht,
+					key, __prestera_nexthop_group_ht_params);
+	return IS_ERR(nh_grp) ? NULL : nh_grp;
+}
+
+static struct prestera_nexthop_group *
+prestera_nexthop_group_get(struct prestera_switch *sw,
+			   struct prestera_nexthop_group_key *key)
+{
+	struct prestera_nexthop_group *nh_grp;
+
+	nh_grp = __prestera_nexthop_group_find(sw, key);
+	if (nh_grp) {
+		refcount_inc(&nh_grp->refcount);
+	} else {
+		nh_grp = __prestera_nexthop_group_create(sw, key);
+		if (IS_ERR(nh_grp))
+			return ERR_CAST(nh_grp);
+
+		refcount_set(&nh_grp->refcount, 1);
+	}
+
+	return nh_grp;
+}
+
+static void prestera_nexthop_group_put(struct prestera_switch *sw,
+				       struct prestera_nexthop_group *nh_grp)
+{
+	if (refcount_dec_and_test(&nh_grp->refcount))
+		__prestera_nexthop_group_destroy(sw, nh_grp);
+}
+
+/* Updates with new nh_neigh's info */
+static int prestera_nexthop_group_set(struct prestera_switch *sw,
+				      struct prestera_nexthop_group *nh_grp)
+{
+	struct prestera_neigh_info info[PRESTERA_NHGR_SIZE_MAX];
+	struct prestera_nh_neigh *neigh;
+	int nh_cnt;
+
+	memset(&info[0], 0, sizeof(info));
+	for (nh_cnt = 0; nh_cnt < PRESTERA_NHGR_SIZE_MAX; nh_cnt++) {
+		neigh = nh_grp->nh_neigh_head[nh_cnt].neigh;
+		if (!neigh)
+			break;
+
+		memcpy(&info[nh_cnt], &neigh->info, sizeof(neigh->info));
+	}
+
+	return prestera_hw_nh_entries_set(sw, nh_cnt, &info[0], nh_grp->grp_id);
+}
+
+static bool
+prestera_nexthop_group_util_hw_state(struct prestera_switch *sw,
+				     struct prestera_nexthop_group *nh_grp)
+{
+	int err;
+	u32 buf_size = sw->size_tbl_router_nexthop / 8 + 1;
+	u32 gid = nh_grp->grp_id;
+	u8 *cache = sw->router->nhgrp_hw_state_cache;
+
+	/* Antijitter
+	 * Prevent situation, when we read state of nh_grp twice in short time,
+	 * and state bit is still cleared on second call. So just stuck active
+	 * state for PRESTERA_NH_ACTIVE_JIFFER_FILTER, after last occurred.
+	 */
+	if (!time_before(jiffies, sw->router->nhgrp_hw_cache_kick +
+			msecs_to_jiffies(PRESTERA_NH_ACTIVE_JIFFER_FILTER))) {
+		err = prestera_hw_nhgrp_blk_get(sw, cache, buf_size);
+		if (err) {
+			pr_err("Failed to get hw state nh_grp's");
+			return false;
+		}
+
+		sw->router->nhgrp_hw_cache_kick = jiffies;
+	}
+
+	if (cache[gid / 8] & BIT(gid % 8))
+		return true;
+
+	return false;
+}
+
 struct prestera_fib_node *
 prestera_fib_node_find(struct prestera_switch *sw, struct prestera_fib_key *key)
 {
@@ -251,6 +581,9 @@ static void __prestera_fib_node_destruct(struct prestera_switch *sw,
 	prestera_hw_lpm_del(sw, vr->hw_vr_id, fib_node->key.addr.u.ipv4,
 			    fib_node->key.prefix_len);
 	switch (fib_node->info.type) {
+	case PRESTERA_FIB_TYPE_UC_NH:
+		prestera_nexthop_group_put(sw, fib_node->info.nh_grp);
+		break;
 	case PRESTERA_FIB_TYPE_TRAP:
 		break;
 	case PRESTERA_FIB_TYPE_DROP:
@@ -275,7 +608,8 @@ void prestera_fib_node_destroy(struct prestera_switch *sw,
 struct prestera_fib_node *
 prestera_fib_node_create(struct prestera_switch *sw,
 			 struct prestera_fib_key *key,
-			 enum prestera_fib_type fib_type)
+			 enum prestera_fib_type fib_type,
+			 struct prestera_nexthop_group_key *nh_grp_key)
 {
 	struct prestera_fib_node *fib_node;
 	u32 grp_id;
@@ -302,6 +636,14 @@ prestera_fib_node_create(struct prestera_switch *sw,
 	case PRESTERA_FIB_TYPE_DROP:
 		grp_id = PRESTERA_NHGR_DROP;
 		break;
+	case PRESTERA_FIB_TYPE_UC_NH:
+		fib_node->info.nh_grp = prestera_nexthop_group_get(sw,
+								   nh_grp_key);
+		if (!fib_node->info.nh_grp)
+			goto err_nh_grp_get;
+
+		grp_id = fib_node->info.nh_grp->grp_id;
+		break;
 	default:
 		pr_err("Unsupported fib_type %d", fib_type);
 		goto err_nh_grp_get;
@@ -323,6 +665,8 @@ err_ht_insert:
 	prestera_hw_lpm_del(sw, vr->hw_vr_id, key->addr.u.ipv4,
 			    key->prefix_len);
 err_lpm_add:
+	if (fib_type == PRESTERA_FIB_TYPE_UC_NH)
+		prestera_nexthop_group_put(sw, fib_node->info.nh_grp);
 err_nh_grp_get:
 	prestera_vr_put(sw, vr);
 err_vr_get:
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h
index 67dbb49c8bd4..43bad23f38ec 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h
@@ -33,6 +33,61 @@ struct prestera_ip_addr {
 	} v;
 };
 
+struct prestera_nh_neigh_key {
+	struct prestera_ip_addr addr;
+	/* Seems like rif is obsolete, because there is iface in info ?
+	 * Key can contain functional fields, or fields, which is used to
+	 * filter duplicate objects on logical level (before you pass it to
+	 * HW)... also key can be used to cover hardware restrictions.
+	 * In our case rif - is logical interface (even can be VLAN), which
+	 * is used in combination with IP address (which is also not related to
+	 * hardware nexthop) to provide logical compression of created nexthops.
+	 * You even can imagine, that rif+IPaddr is just cookie.
+	 */
+	/* struct prestera_rif *rif; */
+	/* Use just as cookie, to divide ARP domains (in order with addr) */
+	void *rif;
+};
+
+/* Used for hw call */
+struct prestera_neigh_info {
+	struct prestera_iface iface;
+	unsigned char ha[ETH_ALEN];
+	u8 connected; /* bool. indicate, if mac/oif valid */
+	u8 __pad[1];
+};
+
+/* Used to notify nh about neigh change */
+struct prestera_nh_neigh {
+	struct prestera_nh_neigh_key key;
+	struct prestera_neigh_info info;
+	struct rhash_head ht_node; /* node of prestera_vr */
+	struct list_head nexthop_group_list;
+};
+
+#define PRESTERA_NHGR_SIZE_MAX 4
+
+struct prestera_nexthop_group {
+	struct prestera_nexthop_group_key {
+		struct prestera_nh_neigh_key neigh[PRESTERA_NHGR_SIZE_MAX];
+	} key;
+	/* Store intermediate object here.
+	 * This prevent overhead kzalloc call.
+	 */
+	/* nh_neigh is used only to notify nexthop_group */
+	struct prestera_nh_neigh_head {
+		struct prestera_nexthop_group *this;
+		struct list_head head;
+		/* ptr to neigh is not necessary.
+		 * It used to prevent lookup of nh_neigh by key (n) on destroy
+		 */
+		struct prestera_nh_neigh *neigh;
+	} nh_neigh_head[PRESTERA_NHGR_SIZE_MAX];
+	struct rhash_head ht_node; /* node of prestera_vr */
+	refcount_t refcount;
+	u32 grp_id; /* hw */
+};
+
 struct prestera_fib_key {
 	struct prestera_ip_addr addr;
 	u32 prefix_len;
@@ -44,12 +99,16 @@ struct prestera_fib_info {
 	struct list_head vr_node;
 	enum prestera_fib_type {
 		PRESTERA_FIB_TYPE_INVALID = 0,
+		/* must be pointer to nh_grp id */
+		PRESTERA_FIB_TYPE_UC_NH,
 		/* It can be connected route
 		 * and will be overlapped with neighbours
 		 */
 		PRESTERA_FIB_TYPE_TRAP,
 		PRESTERA_FIB_TYPE_DROP
 	} type;
+	/* Valid only if type = UC_NH*/
+	struct prestera_nexthop_group *nh_grp;
 };
 
 struct prestera_fib_node {
@@ -67,6 +126,18 @@ struct prestera_rif_entry *
 prestera_rif_entry_create(struct prestera_switch *sw,
 			  struct prestera_rif_entry_key *k,
 			  u32 tb_id, const unsigned char *addr);
+struct prestera_nh_neigh *
+prestera_nh_neigh_find(struct prestera_switch *sw,
+		       struct prestera_nh_neigh_key *key);
+struct prestera_nh_neigh *
+prestera_nh_neigh_get(struct prestera_switch *sw,
+		      struct prestera_nh_neigh_key *key);
+void prestera_nh_neigh_put(struct prestera_switch *sw,
+			   struct prestera_nh_neigh *neigh);
+int prestera_nh_neigh_set(struct prestera_switch *sw,
+			  struct prestera_nh_neigh *neigh);
+bool prestera_nh_neigh_util_hw_state(struct prestera_switch *sw,
+				     struct prestera_nh_neigh *nh_neigh);
 struct prestera_fib_node *prestera_fib_node_find(struct prestera_switch *sw,
 						 struct prestera_fib_key *key);
 void prestera_fib_node_destroy(struct prestera_switch *sw,
@@ -74,7 +145,8 @@ void prestera_fib_node_destroy(struct prestera_switch *sw,
 struct prestera_fib_node *
 prestera_fib_node_create(struct prestera_switch *sw,
 			 struct prestera_fib_key *key,
-			 enum prestera_fib_type fib_type);
+			 enum prestera_fib_type fib_type,
+			 struct prestera_nexthop_group_key *nh_grp_key);
 int prestera_router_hw_init(struct prestera_switch *sw);
 void prestera_router_hw_fini(struct prestera_switch *sw);
 
-- 
cgit v1.2.3


From 1e7313e83ef7859ecc7fb9458d094a084d30bc7f Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:10 +0300
Subject: net: marvell: prestera: Add cleanup of allocated fib_nodes

Do explicity cleanup on router_hw_fini, to ensure, that all allocated
objects cleaned. This will be used in cases,
when upper layer (cache) is not mapped to router_hw layer.

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera_router_hw.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
index db9d2e9d9904..4f65df0ae5e8 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.c
@@ -56,6 +56,7 @@ static int prestera_nexthop_group_set(struct prestera_switch *sw,
 static bool
 prestera_nexthop_group_util_hw_state(struct prestera_switch *sw,
 				     struct prestera_nexthop_group *nh_grp);
+static void prestera_fib_node_destroy_ht_cb(void *ptr, void *arg);
 
 /* TODO: move to router.h as macros */
 static bool prestera_nh_neigh_key_is_valid(struct prestera_nh_neigh_key *key)
@@ -97,6 +98,8 @@ err_nh_neigh_ht_init:
 
 void prestera_router_hw_fini(struct prestera_switch *sw)
 {
+	rhashtable_free_and_destroy(&sw->router->fib_ht,
+				    prestera_fib_node_destroy_ht_cb, sw);
 	WARN_ON(!list_empty(&sw->router->vr_list));
 	WARN_ON(!list_empty(&sw->router->rif_entry_list));
 	rhashtable_destroy(&sw->router->fib_ht);
@@ -605,6 +608,15 @@ void prestera_fib_node_destroy(struct prestera_switch *sw,
 	kfree(fib_node);
 }
 
+static void prestera_fib_node_destroy_ht_cb(void *ptr, void *arg)
+{
+	struct prestera_fib_node *node = ptr;
+	struct prestera_switch *sw = arg;
+
+	__prestera_fib_node_destruct(sw, node);
+	kfree(node);
+}
+
 struct prestera_fib_node *
 prestera_fib_node_create(struct prestera_switch *sw,
 			 struct prestera_fib_key *key,
-- 
cgit v1.2.3


From 333fe4d033fa5aac139f66d8f185877f47427c3d Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:11 +0300
Subject: net: marvell: prestera: Add strict cleanup of fib arbiter

This will, ensure, that there is no more, preciously allocated fib_cache
entries left after deinit.
Will be used to free allocated resources of nexthop routes, that points
to "not our" port (e.g. eth0).

Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/marvell/prestera/prestera_router.c    | 42 +++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index a8548b9f9cf1..b4fd8276bbce 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -72,14 +72,21 @@ prestera_kern_fib_cache_find(struct prestera_switch *sw,
 	return fib_cache;
 }
 
+static void
+__prestera_kern_fib_cache_destruct(struct prestera_switch *sw,
+				   struct prestera_kern_fib_cache *fib_cache)
+{
+	fib_info_put(fib_cache->fi);
+}
+
 static void
 prestera_kern_fib_cache_destroy(struct prestera_switch *sw,
 				struct prestera_kern_fib_cache *fib_cache)
 {
-	fib_info_put(fib_cache->fi);
 	rhashtable_remove_fast(&sw->router->kern_fib_cache_ht,
 			       &fib_cache->ht_node,
 			       __prestera_kern_fib_cache_ht_params);
+	__prestera_kern_fib_cache_destruct(sw, fib_cache);
 	kfree(fib_cache);
 }
 
@@ -336,6 +343,36 @@ prestera_k_arb_fib_evt(struct prestera_switch *sw,
 	return 0;
 }
 
+static void __prestera_k_arb_abort_fib_ht_cb(void *ptr, void *arg)
+{
+	struct prestera_kern_fib_cache *fib_cache = ptr;
+	struct prestera_switch *sw = arg;
+
+	__prestera_k_arb_fib_lpm_offload_set(sw, fib_cache,
+					     false, false,
+					     false);
+	/* No need to destroy lpm.
+	 * It will be aborted by destroy_ht
+	 */
+	__prestera_kern_fib_cache_destruct(sw, fib_cache);
+	kfree(fib_cache);
+}
+
+static void prestera_k_arb_abort(struct prestera_switch *sw)
+{
+	/* Function to remove all arbiter entries and related hw objects. */
+	/* Sequence:
+	 *   1) Clear arbiter tables, but don't touch hw
+	 *   2) Clear hw
+	 * We use such approach, because arbiter object is not directly mapped
+	 * to hw. So deletion of one arbiter object may even lead to creation of
+	 * hw object (e.g. in case of overlapped routes).
+	 */
+	rhashtable_free_and_destroy(&sw->router->kern_fib_cache_ht,
+				    __prestera_k_arb_abort_fib_ht_cb,
+				    sw);
+}
+
 static int __prestera_inetaddr_port_event(struct net_device *port_dev,
 					  unsigned long event,
 					  struct netlink_ext_ack *extack)
@@ -602,6 +639,9 @@ void prestera_router_fini(struct prestera_switch *sw)
 	unregister_fib_notifier(&init_net, &sw->router->fib_nb);
 	unregister_inetaddr_notifier(&sw->router->inetaddr_nb);
 	unregister_inetaddr_validator_notifier(&sw->router->inetaddr_valid_nb);
+
+	prestera_k_arb_abort(sw);
+
 	kfree(sw->router->nhgrp_hw_state_cache);
 	rhashtable_destroy(&sw->router->kern_fib_cache_ht);
 	prestera_router_hw_fini(sw);
-- 
cgit v1.2.3


From 90b6f9c098512b72b77028893e42f60316409fb1 Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:12 +0300
Subject: net: marvell: prestera: add delayed wq and flush wq on deinit

Flushing workqueues ensures, that no more pending works, related to just
unregistered or deinitialized notifiers. After that we can free memory.

Delayed wq will be used for neighbours in next patches.

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera.h        |  2 ++
 drivers/net/ethernet/marvell/prestera/prestera_main.c   | 11 +++++++++++
 drivers/net/ethernet/marvell/prestera/prestera_router.c |  1 +
 3 files changed, 14 insertions(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h
index 903e2e13e687..fe0d6001a6b6 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera.h
@@ -367,6 +367,8 @@ int prestera_port_cfg_mac_write(struct prestera_port *port,
 struct prestera_port *prestera_port_dev_lower_find(struct net_device *dev);
 
 void prestera_queue_work(struct work_struct *work);
+void prestera_queue_delayed_work(struct delayed_work *work, unsigned long delay);
+void prestera_queue_drain(void);
 
 int prestera_port_learning_set(struct prestera_port *port, bool learn_enable);
 int prestera_port_uc_flood_set(struct prestera_port *port, bool flood);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c
index 639d3e940a88..24f9d6024745 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_main.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c
@@ -36,6 +36,17 @@ void prestera_queue_work(struct work_struct *work)
 	queue_work(prestera_owq, work);
 }
 
+void prestera_queue_delayed_work(struct delayed_work *work, unsigned long delay)
+{
+	queue_delayed_work(prestera_wq, work, delay);
+}
+
+void prestera_queue_drain(void)
+{
+	drain_workqueue(prestera_wq);
+	drain_workqueue(prestera_owq);
+}
+
 int prestera_port_learning_set(struct prestera_port *port, bool learn)
 {
 	return prestera_hw_port_learning_set(port, learn);
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index b4fd8276bbce..9625c5870847 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -639,6 +639,7 @@ void prestera_router_fini(struct prestera_switch *sw)
 	unregister_fib_notifier(&init_net, &sw->router->fib_nb);
 	unregister_inetaddr_notifier(&sw->router->inetaddr_nb);
 	unregister_inetaddr_validator_notifier(&sw->router->inetaddr_valid_nb);
+	prestera_queue_drain();
 
 	prestera_k_arb_abort(sw);
 
-- 
cgit v1.2.3


From 59b44ea8aa56cec1c6b29019bf00b1f9474a9c51 Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:13 +0300
Subject: net: marvell: prestera: Add length macros for prestera_ip_addr

Add macros to determine IP address length (internal driver types).
This will be used in next patches for nexthops logic.

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera_router_hw.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h
index 43bad23f38ec..9ca97919c863 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router_hw.h
@@ -31,6 +31,8 @@ struct prestera_ip_addr {
 		PRESTERA_IPV4 = 0,
 		PRESTERA_IPV6
 	} v;
+#define PRESTERA_IP_ADDR_PLEN(V) ((V) == PRESTERA_IPV4 ? 32 : \
+				  /* (V) == PRESTERA_IPV6 ? */ 128 /* : 0 */)
 };
 
 struct prestera_nh_neigh_key {
-- 
cgit v1.2.3


From 04f24a1e6de6e9cf9358ed5fe6677bc56dbd11fa Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:14 +0300
Subject: net: marvell: prestera: Add heplers to interact with
 fib_notifier_info

This will be used to implement nexthops related logic in next patches.
Also try to keep ipv4/6 abstraction to be able to reuse helpers for ipv6
in the future.

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../ethernet/marvell/prestera/prestera_router.c    | 99 ++++++++++++++--------
 1 file changed, 65 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index 9625c5870847..607efd481782 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -7,6 +7,7 @@
 #include <net/inet_dscp.h>
 #include <net/switchdev.h>
 #include <linux/rhashtable.h>
+#include <net/nexthop.h>
 
 #include "prestera.h"
 #include "prestera_router_hw.h"
@@ -26,9 +27,10 @@ struct prestera_kern_fib_cache {
 	} lpm_info; /* hold prepared lpm info */
 	/* Indicate if route is not overlapped by another table */
 	struct rhash_head ht_node; /* node of prestera_router */
-	struct fib_info *fi;
-	dscp_t kern_dscp;
-	u8 kern_type;
+	union {
+		struct fib_notifier_info info; /* point to any of 4/6 */
+		struct fib_entry_notifier_info fen4_info;
+	};
 	bool reachable;
 };
 
@@ -51,15 +53,41 @@ static u32 prestera_fix_tb_id(u32 tb_id)
 }
 
 static void
-prestera_util_fen_info2fib_cache_key(struct fib_entry_notifier_info *fen_info,
+prestera_util_fen_info2fib_cache_key(struct fib_notifier_info *info,
 				     struct prestera_kern_fib_cache_key *key)
 {
+	struct fib_entry_notifier_info *fen_info =
+		container_of(info, struct fib_entry_notifier_info, info);
+
 	memset(key, 0, sizeof(*key));
+	key->addr.v = PRESTERA_IPV4;
 	key->addr.u.ipv4 = cpu_to_be32(fen_info->dst);
 	key->prefix_len = fen_info->dst_len;
 	key->kern_tb_id = fen_info->tb_id;
 }
 
+static unsigned char
+prestera_kern_fib_info_type(struct fib_notifier_info *info)
+{
+	struct fib6_entry_notifier_info *fen6_info;
+	struct fib_entry_notifier_info *fen4_info;
+
+	if (info->family == AF_INET) {
+		fen4_info = container_of(info, struct fib_entry_notifier_info,
+					 info);
+		return fen4_info->fi->fib_type;
+	} else if (info->family == AF_INET6) {
+		fen6_info = container_of(info, struct fib6_entry_notifier_info,
+					 info);
+		/* TODO: ECMP in ipv6 is several routes.
+		 * Every route has single nh.
+		 */
+		return fen6_info->rt->fib6_type;
+	}
+
+	return RTN_UNSPEC;
+}
+
 static struct prestera_kern_fib_cache *
 prestera_kern_fib_cache_find(struct prestera_switch *sw,
 			     struct prestera_kern_fib_cache_key *key)
@@ -76,7 +104,7 @@ static void
 __prestera_kern_fib_cache_destruct(struct prestera_switch *sw,
 				   struct prestera_kern_fib_cache *fib_cache)
 {
-	fib_info_put(fib_cache->fi);
+	fib_info_put(fib_cache->fen4_info.fi);
 }
 
 static void
@@ -96,8 +124,10 @@ prestera_kern_fib_cache_destroy(struct prestera_switch *sw,
 static struct prestera_kern_fib_cache *
 prestera_kern_fib_cache_create(struct prestera_switch *sw,
 			       struct prestera_kern_fib_cache_key *key,
-			       struct fib_info *fi, dscp_t dscp, u8 type)
+			       struct fib_notifier_info *info)
 {
+	struct fib_entry_notifier_info *fen_info =
+		container_of(info, struct fib_entry_notifier_info, info);
 	struct prestera_kern_fib_cache *fib_cache;
 	int err;
 
@@ -106,10 +136,8 @@ prestera_kern_fib_cache_create(struct prestera_switch *sw,
 		goto err_kzalloc;
 
 	memcpy(&fib_cache->key, key, sizeof(*key));
-	fib_info_hold(fi);
-	fib_cache->fi = fi;
-	fib_cache->kern_dscp = dscp;
-	fib_cache->kern_type = type;
+	fib_info_hold(fen_info->fi);
+	memcpy(&fib_cache->fen4_info, fen_info, sizeof(*fen_info));
 
 	err = rhashtable_insert_fast(&sw->router->kern_fib_cache_ht,
 				     &fib_cache->ht_node,
@@ -120,7 +148,7 @@ prestera_kern_fib_cache_create(struct prestera_switch *sw,
 	return fib_cache;
 
 err_ht_insert:
-	fib_info_put(fi);
+	fib_info_put(fen_info->fi);
 	kfree(fib_cache);
 err_kzalloc:
 	return NULL;
@@ -133,21 +161,25 @@ __prestera_k_arb_fib_lpm_offload_set(struct prestera_switch *sw,
 {
 	struct fib_rt_info fri;
 
-	if (fc->key.addr.v != PRESTERA_IPV4)
+	switch (fc->key.addr.v) {
+	case PRESTERA_IPV4:
+		fri.fi = fc->fen4_info.fi;
+		fri.tb_id = fc->key.kern_tb_id;
+		fri.dst = fc->key.addr.u.ipv4;
+		fri.dst_len = fc->key.prefix_len;
+		fri.dscp = fc->fen4_info.dscp;
+		fri.type = fc->fen4_info.type;
+		/* flags begin */
+		fri.offload = offload;
+		fri.trap = trap;
+		fri.offload_failed = fail;
+		/* flags end */
+		fib_alias_hw_flags_set(&init_net, &fri);
 		return;
-
-	fri.fi = fc->fi;
-	fri.tb_id = fc->key.kern_tb_id;
-	fri.dst = fc->key.addr.u.ipv4;
-	fri.dst_len = fc->key.prefix_len;
-	fri.dscp = fc->kern_dscp;
-	fri.type = fc->kern_type;
-	/* flags begin */
-	fri.offload = offload;
-	fri.trap = trap;
-	fri.offload_failed = fail;
-	/* flags end */
-	fib_alias_hw_flags_set(&init_net, &fri);
+	case PRESTERA_IPV6:
+		/* TODO */
+		return;
+	}
 }
 
 static int
@@ -156,7 +188,7 @@ __prestera_pr_k_arb_fc_lpm_info_calc(struct prestera_switch *sw,
 {
 	memset(&fc->lpm_info, 0, sizeof(fc->lpm_info));
 
-	switch (fc->fi->fib_type) {
+	switch (prestera_kern_fib_info_type(&fc->info)) {
 	case RTN_UNICAST:
 		fc->lpm_info.fib_type = PRESTERA_FIB_TYPE_TRAP;
 		break;
@@ -283,14 +315,14 @@ __prestera_k_arb_util_fib_overlapped(struct prestera_switch *sw,
 static int
 prestera_k_arb_fib_evt(struct prestera_switch *sw,
 		       bool replace, /* replace or del */
-		       struct fib_entry_notifier_info *fen_info)
+		       struct fib_notifier_info *info)
 {
 	struct prestera_kern_fib_cache *tfib_cache, *bfib_cache; /* top/btm */
 	struct prestera_kern_fib_cache_key fc_key;
 	struct prestera_kern_fib_cache *fib_cache;
 	int err;
 
-	prestera_util_fen_info2fib_cache_key(fen_info, &fc_key);
+	prestera_util_fen_info2fib_cache_key(info, &fc_key);
 	fib_cache = prestera_kern_fib_cache_find(sw, &fc_key);
 	if (fib_cache) {
 		fib_cache->reachable = false;
@@ -313,10 +345,7 @@ prestera_k_arb_fib_evt(struct prestera_switch *sw,
 	}
 
 	if (replace) {
-		fib_cache = prestera_kern_fib_cache_create(sw, &fc_key,
-							   fen_info->fi,
-							   fen_info->dscp,
-							   fen_info->type);
+		fib_cache = prestera_kern_fib_cache_create(sw, &fc_key, info);
 		if (!fib_cache) {
 			dev_err(sw->dev->dev, "fib_cache == NULL");
 			return -ENOENT;
@@ -508,13 +537,15 @@ static void __prestera_router_fib_event_work(struct work_struct *work)
 
 	switch (fib_work->event) {
 	case FIB_EVENT_ENTRY_REPLACE:
-		err = prestera_k_arb_fib_evt(sw, true, &fib_work->fen_info);
+		err = prestera_k_arb_fib_evt(sw, true,
+					     &fib_work->fen_info.info);
 		if (err)
 			goto err_out;
 
 		break;
 	case FIB_EVENT_ENTRY_DEL:
-		err = prestera_k_arb_fib_evt(sw, false, &fib_work->fen_info);
+		err = prestera_k_arb_fib_evt(sw, false,
+					     &fib_work->fen_info.info);
 		if (err)
 			goto err_out;
 
-- 
cgit v1.2.3


From 8b1ef4911a41c18df730eb913129f4259a98dab9 Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:15 +0300
Subject: net: marvell: prestera: add stub handler neighbour events

Actual handler will be added in next patches

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera.h   |  1 +
 .../ethernet/marvell/prestera/prestera_router.c    | 59 ++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h
index fe0d6001a6b6..2f2f80e7e358 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera.h
@@ -320,6 +320,7 @@ struct prestera_router {
 	struct notifier_block inetaddr_nb;
 	struct notifier_block inetaddr_valid_nb;
 	struct notifier_block fib_nb;
+	struct notifier_block netevent_nb;
 	u8 *nhgrp_hw_state_cache; /* Bitmap cached hw state of nhs */
 	unsigned long nhgrp_hw_cache_kick; /* jiffies */
 };
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index 607efd481782..d31dd1fe6633 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -8,6 +8,7 @@
 #include <net/switchdev.h>
 #include <linux/rhashtable.h>
 #include <net/nexthop.h>
+#include <net/netevent.h>
 
 #include "prestera.h"
 #include "prestera_router_hw.h"
@@ -604,6 +605,56 @@ static int __prestera_router_fib_event(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
+struct prestera_netevent_work {
+	struct work_struct work;
+	struct prestera_switch *sw;
+	struct neighbour *n;
+};
+
+static void prestera_router_neigh_event_work(struct work_struct *work)
+{
+	struct prestera_netevent_work *net_work =
+		container_of(work, struct prestera_netevent_work, work);
+	struct neighbour *n = net_work->n;
+
+	/* neigh - its not hw related object. It stored only in kernel. So... */
+	rtnl_lock();
+
+	/* TODO: handler */
+
+	neigh_release(n);
+	rtnl_unlock();
+	kfree(net_work);
+}
+
+static int prestera_router_netevent_event(struct notifier_block *nb,
+					  unsigned long event, void *ptr)
+{
+	struct prestera_netevent_work *net_work;
+	struct prestera_router *router;
+	struct neighbour *n = ptr;
+
+	router = container_of(nb, struct prestera_router, netevent_nb);
+
+	switch (event) {
+	case NETEVENT_NEIGH_UPDATE:
+		if (n->tbl->family != AF_INET)
+			return NOTIFY_DONE;
+
+		net_work = kzalloc(sizeof(*net_work), GFP_ATOMIC);
+		if (WARN_ON(!net_work))
+			return NOTIFY_BAD;
+
+		neigh_clone(n);
+		net_work->n = n;
+		net_work->sw = router->sw;
+		INIT_WORK(&net_work->work, prestera_router_neigh_event_work);
+		prestera_queue_work(&net_work->work);
+	}
+
+	return NOTIFY_DONE;
+}
+
 int prestera_router_init(struct prestera_switch *sw)
 {
 	struct prestera_router *router;
@@ -642,6 +693,11 @@ int prestera_router_init(struct prestera_switch *sw)
 	if (err)
 		goto err_register_inetaddr_notifier;
 
+	router->netevent_nb.notifier_call = prestera_router_netevent_event;
+	err = register_netevent_notifier(&router->netevent_nb);
+	if (err)
+		goto err_register_netevent_notifier;
+
 	router->fib_nb.notifier_call = __prestera_router_fib_event;
 	err = register_fib_notifier(&init_net, &router->fib_nb,
 				    /* TODO: flush fib entries */ NULL, NULL);
@@ -651,6 +707,8 @@ int prestera_router_init(struct prestera_switch *sw)
 	return 0;
 
 err_register_fib_notifier:
+	unregister_netevent_notifier(&router->netevent_nb);
+err_register_netevent_notifier:
 	unregister_inetaddr_notifier(&router->inetaddr_nb);
 err_register_inetaddr_notifier:
 	unregister_inetaddr_validator_notifier(&router->inetaddr_valid_nb);
@@ -668,6 +726,7 @@ err_router_lib_init:
 void prestera_router_fini(struct prestera_switch *sw)
 {
 	unregister_fib_notifier(&init_net, &sw->router->fib_nb);
+	unregister_netevent_notifier(&sw->router->netevent_nb);
 	unregister_inetaddr_notifier(&sw->router->inetaddr_nb);
 	unregister_inetaddr_validator_notifier(&sw->router->inetaddr_valid_nb);
 	prestera_queue_drain();
-- 
cgit v1.2.3


From 396b80cb5cc8006a488ea25ef84fae245dc1b43c Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:16 +0300
Subject: net: marvell: prestera: Add neighbour cache accounting

Move forward and use new PRESTERA_FIB_TYPE_UC_NH to provide basic
nexthop routes support.
Provide deinitialization sequence for all created router objects.

Limitations:
- Only "local" and "main" tables supported
- Only generic interfaces supported for router (no bridges or vlans)

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera.h   |   1 +
 .../ethernet/marvell/prestera/prestera_router.c    | 799 ++++++++++++++++++++-
 2 files changed, 797 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h
index 2f2f80e7e358..540a36069b79 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera.h
@@ -316,6 +316,7 @@ struct prestera_router {
 	struct rhashtable nh_neigh_ht;
 	struct rhashtable nexthop_group_ht;
 	struct rhashtable fib_ht;
+	struct rhashtable kern_neigh_cache_ht;
 	struct rhashtable kern_fib_cache_ht;
 	struct notifier_block inetaddr_nb;
 	struct notifier_block inetaddr_valid_nb;
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index d31dd1fe6633..af7d24390d2e 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -8,11 +8,30 @@
 #include <net/switchdev.h>
 #include <linux/rhashtable.h>
 #include <net/nexthop.h>
+#include <net/arp.h>
+#include <linux/if_vlan.h>
+#include <linux/if_macvlan.h>
 #include <net/netevent.h>
 
 #include "prestera.h"
 #include "prestera_router_hw.h"
 
+struct prestera_kern_neigh_cache_key {
+	struct prestera_ip_addr addr;
+	struct net_device *dev;
+};
+
+struct prestera_kern_neigh_cache {
+	struct prestera_kern_neigh_cache_key key;
+	struct rhash_head ht_node;
+	struct list_head kern_fib_cache_list;
+	/* Hold prepared nh_neigh info if is in_kernel */
+	struct prestera_neigh_info nh_neigh_info;
+	/* Indicate if neighbour is reachable by direct route */
+	bool reachable;
+	/* Lock cache if neigh is present in kernel */
+	bool in_kernel;
+};
 struct prestera_kern_fib_cache_key {
 	struct prestera_ip_addr addr;
 	u32 prefix_len;
@@ -25,9 +44,15 @@ struct prestera_kern_fib_cache {
 	struct {
 		struct prestera_fib_key fib_key;
 		enum prestera_fib_type fib_type;
+		struct prestera_nexthop_group_key nh_grp_key;
 	} lpm_info; /* hold prepared lpm info */
 	/* Indicate if route is not overlapped by another table */
 	struct rhash_head ht_node; /* node of prestera_router */
+	struct prestera_kern_neigh_cache_head {
+		struct prestera_kern_fib_cache *this;
+		struct list_head head;
+		struct prestera_kern_neigh_cache *n_cache;
+	} kern_neigh_cache_head[PRESTERA_NHGR_SIZE_MAX];
 	union {
 		struct fib_notifier_info info; /* point to any of 4/6 */
 		struct fib_entry_notifier_info fen4_info;
@@ -35,6 +60,13 @@ struct prestera_kern_fib_cache {
 	bool reachable;
 };
 
+static const struct rhashtable_params __prestera_kern_neigh_cache_ht_params = {
+	.key_offset  = offsetof(struct prestera_kern_neigh_cache, key),
+	.head_offset = offsetof(struct prestera_kern_neigh_cache, ht_node),
+	.key_len     = sizeof(struct prestera_kern_neigh_cache_key),
+	.automatic_shrinking = true,
+};
+
 static const struct rhashtable_params __prestera_kern_fib_cache_ht_params = {
 	.key_offset  = offsetof(struct prestera_kern_fib_cache, key),
 	.head_offset = offsetof(struct prestera_kern_fib_cache, ht_node),
@@ -67,6 +99,268 @@ prestera_util_fen_info2fib_cache_key(struct fib_notifier_info *info,
 	key->kern_tb_id = fen_info->tb_id;
 }
 
+static int prestera_util_nhc2nc_key(struct prestera_switch *sw,
+				    struct fib_nh_common *nhc,
+				    struct prestera_kern_neigh_cache_key *nk)
+{
+	memset(nk, 0, sizeof(*nk));
+	if (nhc->nhc_gw_family == AF_INET) {
+		nk->addr.v = PRESTERA_IPV4;
+		nk->addr.u.ipv4 = nhc->nhc_gw.ipv4;
+	} else {
+		nk->addr.v = PRESTERA_IPV6;
+		nk->addr.u.ipv6 = nhc->nhc_gw.ipv6;
+	}
+
+	nk->dev = nhc->nhc_dev;
+	return 0;
+}
+
+static void
+prestera_util_nc_key2nh_key(struct prestera_kern_neigh_cache_key *ck,
+			    struct prestera_nh_neigh_key *nk)
+{
+	memset(nk, 0, sizeof(*nk));
+	nk->addr = ck->addr;
+	nk->rif = (void *)ck->dev;
+}
+
+static bool
+prestera_util_nhc_eq_n_cache_key(struct prestera_switch *sw,
+				 struct fib_nh_common *nhc,
+				 struct prestera_kern_neigh_cache_key *nk)
+{
+	struct prestera_kern_neigh_cache_key tk;
+	int err;
+
+	err = prestera_util_nhc2nc_key(sw, nhc, &tk);
+	if (err)
+		return false;
+
+	if (memcmp(&tk, nk, sizeof(tk)))
+		return false;
+
+	return true;
+}
+
+static int
+prestera_util_neigh2nc_key(struct prestera_switch *sw, struct neighbour *n,
+			   struct prestera_kern_neigh_cache_key *key)
+{
+	memset(key, 0, sizeof(*key));
+	if (n->tbl->family == AF_INET) {
+		key->addr.v = PRESTERA_IPV4;
+		key->addr.u.ipv4 = *(__be32 *)n->primary_key;
+	} else {
+		return -ENOENT;
+	}
+
+	key->dev = n->dev;
+
+	return 0;
+}
+
+static bool __prestera_fi_is_direct(struct fib_info *fi)
+{
+	struct fib_nh *fib_nh;
+
+	if (fib_info_num_path(fi) == 1) {
+		fib_nh = fib_info_nh(fi, 0);
+		if (fib_nh->fib_nh_gw_family == AF_UNSPEC)
+			return true;
+	}
+
+	return false;
+}
+
+static bool prestera_fi_is_direct(struct fib_info *fi)
+{
+	if (fi->fib_type != RTN_UNICAST)
+		return false;
+
+	return __prestera_fi_is_direct(fi);
+}
+
+static bool prestera_fi_is_nh(struct fib_info *fi)
+{
+	if (fi->fib_type != RTN_UNICAST)
+		return false;
+
+	return !__prestera_fi_is_direct(fi);
+}
+
+static bool __prestera_fi6_is_direct(struct fib6_info *fi)
+{
+	if (!fi->fib6_nh->nh_common.nhc_gw_family)
+		return true;
+
+	return false;
+}
+
+static bool prestera_fi6_is_direct(struct fib6_info *fi)
+{
+	if (fi->fib6_type != RTN_UNICAST)
+		return false;
+
+	return __prestera_fi6_is_direct(fi);
+}
+
+static bool prestera_fi6_is_nh(struct fib6_info *fi)
+{
+	if (fi->fib6_type != RTN_UNICAST)
+		return false;
+
+	return !__prestera_fi6_is_direct(fi);
+}
+
+static bool prestera_fib_info_is_direct(struct fib_notifier_info *info)
+{
+	struct fib6_entry_notifier_info *fen6_info =
+		container_of(info, struct fib6_entry_notifier_info, info);
+	struct fib_entry_notifier_info *fen_info =
+		container_of(info, struct fib_entry_notifier_info, info);
+
+	if (info->family == AF_INET)
+		return prestera_fi_is_direct(fen_info->fi);
+	else
+		return prestera_fi6_is_direct(fen6_info->rt);
+}
+
+static bool prestera_fib_info_is_nh(struct fib_notifier_info *info)
+{
+	struct fib6_entry_notifier_info *fen6_info =
+		container_of(info, struct fib6_entry_notifier_info, info);
+	struct fib_entry_notifier_info *fen_info =
+		container_of(info, struct fib_entry_notifier_info, info);
+
+	if (info->family == AF_INET)
+		return prestera_fi_is_nh(fen_info->fi);
+	else
+		return prestera_fi6_is_nh(fen6_info->rt);
+}
+
+/* must be called with rcu_read_lock() */
+static int prestera_util_kern_get_route(struct fib_result *res, u32 tb_id,
+					__be32 *addr)
+{
+	struct flowi4 fl4;
+
+	/* TODO: walkthrough appropriate tables in kernel
+	 * to know if the same prefix exists in several tables
+	 */
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = *addr;
+	return fib_lookup(&init_net, &fl4, res, 0 /* FIB_LOOKUP_NOREF */);
+}
+
+static bool
+__prestera_util_kern_n_is_reachable_v4(u32 tb_id, __be32 *addr,
+				       struct net_device *dev)
+{
+	struct fib_nh *fib_nh;
+	struct fib_result res;
+	bool reachable;
+
+	reachable = false;
+
+	if (!prestera_util_kern_get_route(&res, tb_id, addr))
+		if (prestera_fi_is_direct(res.fi)) {
+			fib_nh = fib_info_nh(res.fi, 0);
+			if (dev == fib_nh->fib_nh_dev)
+				reachable = true;
+		}
+
+	return reachable;
+}
+
+/* Check if neigh route is reachable */
+static bool
+prestera_util_kern_n_is_reachable(u32 tb_id,
+				  struct prestera_ip_addr *addr,
+				  struct net_device *dev)
+{
+	if (addr->v == PRESTERA_IPV4)
+		return __prestera_util_kern_n_is_reachable_v4(tb_id,
+							      &addr->u.ipv4,
+							      dev);
+	else
+		return false;
+}
+
+static void prestera_util_kern_set_neigh_offload(struct neighbour *n,
+						 bool offloaded)
+{
+	if (offloaded)
+		n->flags |= NTF_OFFLOADED;
+	else
+		n->flags &= ~NTF_OFFLOADED;
+}
+
+static void
+prestera_util_kern_set_nh_offload(struct fib_nh_common *nhc, bool offloaded, bool trap)
+{
+		if (offloaded)
+			nhc->nhc_flags |= RTNH_F_OFFLOAD;
+		else
+			nhc->nhc_flags &= ~RTNH_F_OFFLOAD;
+
+		if (trap)
+			nhc->nhc_flags |= RTNH_F_TRAP;
+		else
+			nhc->nhc_flags &= ~RTNH_F_TRAP;
+}
+
+static struct fib_nh_common *
+prestera_kern_fib_info_nhc(struct fib_notifier_info *info, int n)
+{
+	struct fib6_entry_notifier_info *fen6_info;
+	struct fib_entry_notifier_info *fen4_info;
+	struct fib6_info *iter;
+
+	if (info->family == AF_INET) {
+		fen4_info = container_of(info, struct fib_entry_notifier_info,
+					 info);
+		return &fib_info_nh(fen4_info->fi, n)->nh_common;
+	} else if (info->family == AF_INET6) {
+		fen6_info = container_of(info, struct fib6_entry_notifier_info,
+					 info);
+		if (!n)
+			return &fen6_info->rt->fib6_nh->nh_common;
+
+		list_for_each_entry(iter, &fen6_info->rt->fib6_siblings,
+				    fib6_siblings) {
+			if (!--n)
+				return &iter->fib6_nh->nh_common;
+		}
+	}
+
+	/* if family is incorrect - than upper functions has BUG */
+	/* if doesn't find requested index - there is alsi bug, because
+	 * valid index must be produced by nhs, which checks list length
+	 */
+	WARN(1, "Invalid parameters passed to %s n=%d i=%p",
+	     __func__, n, info);
+	return NULL;
+}
+
+static int prestera_kern_fib_info_nhs(struct fib_notifier_info *info)
+{
+	struct fib6_entry_notifier_info *fen6_info;
+	struct fib_entry_notifier_info *fen4_info;
+
+	if (info->family == AF_INET) {
+		fen4_info = container_of(info, struct fib_entry_notifier_info,
+					 info);
+		return fib_info_num_path(fen4_info->fi);
+	} else if (info->family == AF_INET6) {
+		fen6_info = container_of(info, struct fib6_entry_notifier_info,
+					 info);
+		return fen6_info->rt->fib6_nsiblings + 1;
+	}
+
+	return 0;
+}
+
 static unsigned char
 prestera_kern_fib_info_type(struct fib_notifier_info *info)
 {
@@ -89,6 +383,153 @@ prestera_kern_fib_info_type(struct fib_notifier_info *info)
 	return RTN_UNSPEC;
 }
 
+/* Decided, that uc_nh route with key==nh is obviously neighbour route */
+static bool
+prestera_fib_node_util_is_neighbour(struct prestera_fib_node *fib_node)
+{
+	if (fib_node->info.type != PRESTERA_FIB_TYPE_UC_NH)
+		return false;
+
+	if (fib_node->info.nh_grp->nh_neigh_head[1].neigh)
+		return false;
+
+	if (!fib_node->info.nh_grp->nh_neigh_head[0].neigh)
+		return false;
+
+	if (memcmp(&fib_node->info.nh_grp->nh_neigh_head[0].neigh->key.addr,
+		   &fib_node->key.addr, sizeof(struct prestera_ip_addr)))
+		return false;
+
+	return true;
+}
+
+static int prestera_dev_if_type(const struct net_device *dev)
+{
+	struct macvlan_dev *vlan;
+
+	if (is_vlan_dev(dev) &&
+	    netif_is_bridge_master(vlan_dev_real_dev(dev))) {
+		return PRESTERA_IF_VID_E;
+	} else if (netif_is_bridge_master(dev)) {
+		return PRESTERA_IF_VID_E;
+	} else if (netif_is_lag_master(dev)) {
+		return PRESTERA_IF_LAG_E;
+	} else if (netif_is_macvlan(dev)) {
+		vlan = netdev_priv(dev);
+		return prestera_dev_if_type(vlan->lowerdev);
+	} else {
+		return PRESTERA_IF_PORT_E;
+	}
+}
+
+static int
+prestera_neigh_iface_init(struct prestera_switch *sw,
+			  struct prestera_iface *iface,
+			  struct neighbour *n)
+{
+	struct prestera_port *port;
+
+	iface->vlan_id = 0; /* TODO: vlan egress */
+	iface->type = prestera_dev_if_type(n->dev);
+	if (iface->type != PRESTERA_IF_PORT_E)
+		return -EINVAL;
+
+	if (!prestera_netdev_check(n->dev))
+		return -EINVAL;
+
+	port = netdev_priv(n->dev);
+	iface->dev_port.hw_dev_num = port->dev_id;
+	iface->dev_port.port_num = port->hw_id;
+
+	return 0;
+}
+
+static struct prestera_kern_neigh_cache *
+prestera_kern_neigh_cache_find(struct prestera_switch *sw,
+			       struct prestera_kern_neigh_cache_key *key)
+{
+	struct prestera_kern_neigh_cache *n_cache;
+
+	n_cache =
+	 rhashtable_lookup_fast(&sw->router->kern_neigh_cache_ht, key,
+				__prestera_kern_neigh_cache_ht_params);
+	return IS_ERR(n_cache) ? NULL : n_cache;
+}
+
+static void
+__prestera_kern_neigh_cache_destruct(struct prestera_switch *sw,
+				     struct prestera_kern_neigh_cache *n_cache)
+{
+	dev_put(n_cache->key.dev);
+}
+
+static void
+__prestera_kern_neigh_cache_destroy(struct prestera_switch *sw,
+				    struct prestera_kern_neigh_cache *n_cache)
+{
+	rhashtable_remove_fast(&sw->router->kern_neigh_cache_ht,
+			       &n_cache->ht_node,
+			       __prestera_kern_neigh_cache_ht_params);
+	__prestera_kern_neigh_cache_destruct(sw, n_cache);
+	kfree(n_cache);
+}
+
+static struct prestera_kern_neigh_cache *
+__prestera_kern_neigh_cache_create(struct prestera_switch *sw,
+				   struct prestera_kern_neigh_cache_key *key)
+{
+	struct prestera_kern_neigh_cache *n_cache;
+	int err;
+
+	n_cache = kzalloc(sizeof(*n_cache), GFP_KERNEL);
+	if (!n_cache)
+		goto err_kzalloc;
+
+	memcpy(&n_cache->key, key, sizeof(*key));
+	dev_hold(n_cache->key.dev);
+
+	INIT_LIST_HEAD(&n_cache->kern_fib_cache_list);
+	err = rhashtable_insert_fast(&sw->router->kern_neigh_cache_ht,
+				     &n_cache->ht_node,
+				     __prestera_kern_neigh_cache_ht_params);
+	if (err)
+		goto err_ht_insert;
+
+	return n_cache;
+
+err_ht_insert:
+	dev_put(n_cache->key.dev);
+	kfree(n_cache);
+err_kzalloc:
+	return NULL;
+}
+
+static struct prestera_kern_neigh_cache *
+prestera_kern_neigh_cache_get(struct prestera_switch *sw,
+			      struct prestera_kern_neigh_cache_key *key)
+{
+	struct prestera_kern_neigh_cache *n_cache;
+
+	n_cache = prestera_kern_neigh_cache_find(sw, key);
+	if (!n_cache)
+		n_cache = __prestera_kern_neigh_cache_create(sw, key);
+
+	return n_cache;
+}
+
+static struct prestera_kern_neigh_cache *
+prestera_kern_neigh_cache_put(struct prestera_switch *sw,
+			      struct prestera_kern_neigh_cache *n_cache)
+{
+	if (!n_cache->in_kernel &&
+	    list_empty(&n_cache->kern_fib_cache_list)) {
+		__prestera_kern_neigh_cache_destroy(sw, n_cache);
+		return NULL;
+	}
+
+	return n_cache;
+}
+
 static struct prestera_kern_fib_cache *
 prestera_kern_fib_cache_find(struct prestera_switch *sw,
 			     struct prestera_kern_fib_cache_key *key)
@@ -105,6 +546,17 @@ static void
 __prestera_kern_fib_cache_destruct(struct prestera_switch *sw,
 				   struct prestera_kern_fib_cache *fib_cache)
 {
+	struct prestera_kern_neigh_cache *n_cache;
+	int i;
+
+	for (i = 0; i < PRESTERA_NHGR_SIZE_MAX; i++) {
+		n_cache = fib_cache->kern_neigh_cache_head[i].n_cache;
+		if (n_cache) {
+			list_del(&fib_cache->kern_neigh_cache_head[i].head);
+			prestera_kern_neigh_cache_put(sw, n_cache);
+		}
+	}
+
 	fib_info_put(fib_cache->fen4_info.fi);
 }
 
@@ -119,6 +571,41 @@ prestera_kern_fib_cache_destroy(struct prestera_switch *sw,
 	kfree(fib_cache);
 }
 
+static int
+__prestera_kern_fib_cache_create_nhs(struct prestera_switch *sw,
+				     struct prestera_kern_fib_cache *fc)
+{
+	struct prestera_kern_neigh_cache_key nc_key;
+	struct prestera_kern_neigh_cache *n_cache;
+	struct fib_nh_common *nhc;
+	int i, nhs, err;
+
+	if (!prestera_fib_info_is_nh(&fc->info))
+		return 0;
+
+	nhs = prestera_kern_fib_info_nhs(&fc->info);
+	if (nhs > PRESTERA_NHGR_SIZE_MAX)
+		return 0;
+
+	for (i = 0; i < nhs; i++) {
+		nhc = prestera_kern_fib_info_nhc(&fc->fen4_info.info, i);
+		err = prestera_util_nhc2nc_key(sw, nhc, &nc_key);
+		if (err)
+			return 0;
+
+		n_cache = prestera_kern_neigh_cache_get(sw, &nc_key);
+		if (!n_cache)
+			return 0;
+
+		fc->kern_neigh_cache_head[i].this = fc;
+		fc->kern_neigh_cache_head[i].n_cache = n_cache;
+		list_add(&fc->kern_neigh_cache_head[i].head,
+			 &n_cache->kern_fib_cache_list);
+	}
+
+	return 0;
+}
+
 /* Operations on fi (offload, etc) must be wrapped in utils.
  * This function just create storage.
  */
@@ -146,6 +633,12 @@ prestera_kern_fib_cache_create(struct prestera_switch *sw,
 	if (err)
 		goto err_ht_insert;
 
+	/* Handle nexthops */
+	err = __prestera_kern_fib_cache_create_nhs(sw, fib_cache);
+	if (err)
+		goto out; /* Not critical */
+
+out:
 	return fib_cache;
 
 err_ht_insert:
@@ -155,6 +648,46 @@ err_kzalloc:
 	return NULL;
 }
 
+static void
+__prestera_k_arb_fib_nh_offload_set(struct prestera_switch *sw,
+				    struct prestera_kern_fib_cache *fibc,
+				    struct prestera_kern_neigh_cache *nc,
+				    bool offloaded, bool trap)
+{
+	struct fib_nh_common *nhc;
+	int i, nhs;
+
+	nhs = prestera_kern_fib_info_nhs(&fibc->info);
+	for (i = 0; i < nhs; i++) {
+		nhc = prestera_kern_fib_info_nhc(&fibc->info, i);
+		if (!nc) {
+			prestera_util_kern_set_nh_offload(nhc, offloaded, trap);
+			continue;
+		}
+
+		if (prestera_util_nhc_eq_n_cache_key(sw, nhc, &nc->key)) {
+			prestera_util_kern_set_nh_offload(nhc, offloaded, trap);
+			break;
+		}
+	}
+}
+
+static void
+__prestera_k_arb_n_offload_set(struct prestera_switch *sw,
+			       struct prestera_kern_neigh_cache *nc,
+			       bool offloaded)
+{
+	struct neighbour *n;
+
+	n = neigh_lookup(&arp_tbl, &nc->key.addr.u.ipv4,
+			 nc->key.dev);
+	if (!n)
+		return;
+
+	prestera_util_kern_set_neigh_offload(n, offloaded);
+	neigh_release(n);
+}
+
 static void
 __prestera_k_arb_fib_lpm_offload_set(struct prestera_switch *sw,
 				     struct prestera_kern_fib_cache *fc,
@@ -183,15 +716,187 @@ __prestera_k_arb_fib_lpm_offload_set(struct prestera_switch *sw,
 	}
 }
 
+static void
+__prestera_k_arb_n_lpm_set(struct prestera_switch *sw,
+			   struct prestera_kern_neigh_cache *n_cache,
+			   bool enabled)
+{
+	struct prestera_nexthop_group_key nh_grp_key;
+	struct prestera_kern_fib_cache_key fc_key;
+	struct prestera_kern_fib_cache *fib_cache;
+	struct prestera_fib_node *fib_node;
+	struct prestera_fib_key fib_key;
+
+	/* Exception for fc with prefix 32: LPM entry is already used by fib */
+	memset(&fc_key, 0, sizeof(fc_key));
+	fc_key.addr = n_cache->key.addr;
+	fc_key.prefix_len = PRESTERA_IP_ADDR_PLEN(n_cache->key.addr.v);
+	/* But better to use tb_id of route, which pointed to this neighbour. */
+	/* We take it from rif, because rif inconsistent.
+	 * Must be separated in_rif and out_rif.
+	 * Also note: for each fib pointed to this neigh should be separated
+	 *            neigh lpm entry (for each ingress vr)
+	 */
+	fc_key.kern_tb_id = l3mdev_fib_table(n_cache->key.dev);
+	fib_cache = prestera_kern_fib_cache_find(sw, &fc_key);
+	memset(&fib_key, 0, sizeof(fib_key));
+	fib_key.addr = n_cache->key.addr;
+	fib_key.prefix_len = PRESTERA_IP_ADDR_PLEN(n_cache->key.addr.v);
+	fib_key.tb_id = prestera_fix_tb_id(fc_key.kern_tb_id);
+	fib_node = prestera_fib_node_find(sw, &fib_key);
+	if (!fib_cache || !fib_cache->reachable) {
+		if (!enabled && fib_node) {
+			if (prestera_fib_node_util_is_neighbour(fib_node))
+				prestera_fib_node_destroy(sw, fib_node);
+			return;
+		}
+	}
+
+	if (enabled && !fib_node) {
+		memset(&nh_grp_key, 0, sizeof(nh_grp_key));
+		prestera_util_nc_key2nh_key(&n_cache->key,
+					    &nh_grp_key.neigh[0]);
+		fib_node = prestera_fib_node_create(sw, &fib_key,
+						    PRESTERA_FIB_TYPE_UC_NH,
+						    &nh_grp_key);
+		if (!fib_node)
+			pr_err("%s failed ip=%pI4n", "prestera_fib_node_create",
+			       &fib_key.addr.u.ipv4);
+		return;
+	}
+}
+
+static void
+__prestera_k_arb_nc_kern_fib_fetch(struct prestera_switch *sw,
+				   struct prestera_kern_neigh_cache *nc)
+{
+	if (prestera_util_kern_n_is_reachable(l3mdev_fib_table(nc->key.dev),
+					      &nc->key.addr, nc->key.dev))
+		nc->reachable = true;
+	else
+		nc->reachable = false;
+}
+
+/* Kernel neighbour -> neigh_cache info */
+static void
+__prestera_k_arb_nc_kern_n_fetch(struct prestera_switch *sw,
+				 struct prestera_kern_neigh_cache *nc)
+{
+	struct neighbour *n;
+	int err;
+
+	memset(&nc->nh_neigh_info, 0, sizeof(nc->nh_neigh_info));
+	n = neigh_lookup(&arp_tbl, &nc->key.addr.u.ipv4, nc->key.dev);
+	if (!n)
+		goto out;
+
+	read_lock_bh(&n->lock);
+	if (n->nud_state & NUD_VALID && !n->dead) {
+		err = prestera_neigh_iface_init(sw, &nc->nh_neigh_info.iface,
+						n);
+		if (err)
+			goto n_read_out;
+
+		memcpy(&nc->nh_neigh_info.ha[0], &n->ha[0], ETH_ALEN);
+		nc->nh_neigh_info.connected = true;
+	}
+n_read_out:
+	read_unlock_bh(&n->lock);
+out:
+	nc->in_kernel = nc->nh_neigh_info.connected;
+	if (n)
+		neigh_release(n);
+}
+
+/* neigh_cache info -> lpm update */
+static void
+__prestera_k_arb_nc_apply(struct prestera_switch *sw,
+			  struct prestera_kern_neigh_cache *nc)
+{
+	struct prestera_kern_neigh_cache_head *nhead;
+	struct prestera_nh_neigh_key nh_key;
+	struct prestera_nh_neigh *nh_neigh;
+	int err;
+
+	__prestera_k_arb_n_lpm_set(sw, nc, nc->reachable && nc->in_kernel);
+	__prestera_k_arb_n_offload_set(sw, nc, nc->reachable && nc->in_kernel);
+
+	prestera_util_nc_key2nh_key(&nc->key, &nh_key);
+	nh_neigh = prestera_nh_neigh_find(sw, &nh_key);
+	if (!nh_neigh)
+		goto out;
+
+	/* Do hw update only if something changed to prevent nh flap */
+	if (memcmp(&nc->nh_neigh_info, &nh_neigh->info,
+		   sizeof(nh_neigh->info))) {
+		memcpy(&nh_neigh->info, &nc->nh_neigh_info,
+		       sizeof(nh_neigh->info));
+		err = prestera_nh_neigh_set(sw, nh_neigh);
+		if (err) {
+			pr_err("%s failed with err=%d ip=%pI4n mac=%pM",
+			       "prestera_nh_neigh_set", err,
+			       &nh_neigh->key.addr.u.ipv4,
+			       &nh_neigh->info.ha[0]);
+			goto out;
+		}
+	}
+
+out:
+	list_for_each_entry(nhead, &nc->kern_fib_cache_list, head) {
+		__prestera_k_arb_fib_nh_offload_set(sw, nhead->this, nc,
+						    nc->in_kernel,
+						    !nc->in_kernel);
+	}
+}
+
 static int
 __prestera_pr_k_arb_fc_lpm_info_calc(struct prestera_switch *sw,
 				     struct prestera_kern_fib_cache *fc)
 {
+	struct fib_nh_common *nhc;
+	int nh_cnt;
+
 	memset(&fc->lpm_info, 0, sizeof(fc->lpm_info));
 
 	switch (prestera_kern_fib_info_type(&fc->info)) {
 	case RTN_UNICAST:
-		fc->lpm_info.fib_type = PRESTERA_FIB_TYPE_TRAP;
+		if (prestera_fib_info_is_direct(&fc->info) &&
+		    fc->key.prefix_len ==
+			PRESTERA_IP_ADDR_PLEN(fc->key.addr.v)) {
+			/* This is special case.
+			 * When prefix is 32. Than we will have conflict in lpm
+			 * for direct route - once TRAP added, there is no
+			 * place for neighbour entry. So represent direct route
+			 * with prefix 32, as NH. So neighbour will be resolved
+			 * as nexthop of this route.
+			 */
+			nhc = prestera_kern_fib_info_nhc(&fc->info, 0);
+			fc->lpm_info.fib_type = PRESTERA_FIB_TYPE_UC_NH;
+			fc->lpm_info.nh_grp_key.neigh[0].addr =
+				fc->key.addr;
+			fc->lpm_info.nh_grp_key.neigh[0].rif =
+				nhc->nhc_dev;
+
+			break;
+		}
+
+		/* We can also get nh_grp_key from fi. This will be correct to
+		 * because cache not always represent, what actually written to
+		 * lpm. But we use nh cache, as well for now (for this case).
+		 */
+		for (nh_cnt = 0; nh_cnt < PRESTERA_NHGR_SIZE_MAX; nh_cnt++) {
+			if (!fc->kern_neigh_cache_head[nh_cnt].n_cache)
+				break;
+
+			fc->lpm_info.nh_grp_key.neigh[nh_cnt].addr =
+				fc->kern_neigh_cache_head[nh_cnt].n_cache->key.addr;
+			fc->lpm_info.nh_grp_key.neigh[nh_cnt].rif =
+				fc->kern_neigh_cache_head[nh_cnt].n_cache->key.dev;
+		}
+
+		fc->lpm_info.fib_type = nh_cnt ?
+					PRESTERA_FIB_TYPE_UC_NH :
+					PRESTERA_FIB_TYPE_TRAP;
 		break;
 	/* Unsupported. Leave it for kernel: */
 	case RTN_BROADCAST:
@@ -231,7 +936,8 @@ static int __prestera_k_arb_f_lpm_set(struct prestera_switch *sw,
 		return 0;
 
 	fib_node = prestera_fib_node_create(sw, &fc->lpm_info.fib_key,
-					    fc->lpm_info.fib_type, NULL);
+					    fc->lpm_info.fib_type,
+					    &fc->lpm_info.nh_grp_key);
 
 	if (!fib_node) {
 		dev_err(sw->dev->dev, "fib_node=NULL %pI4n/%d kern_tb_id = %d",
@@ -261,6 +967,8 @@ static int __prestera_k_arb_fc_apply(struct prestera_switch *sw,
 
 	switch (fc->lpm_info.fib_type) {
 	case PRESTERA_FIB_TYPE_UC_NH:
+		__prestera_k_arb_fib_lpm_offload_set(sw, fc, false,
+						     fc->reachable, false);
 		break;
 	case PRESTERA_FIB_TYPE_TRAP:
 		__prestera_k_arb_fib_lpm_offload_set(sw, fc, false,
@@ -313,6 +1021,57 @@ __prestera_k_arb_util_fib_overlapped(struct prestera_switch *sw,
 	return rfc;
 }
 
+/* Propagate kernel event to hw */
+static void prestera_k_arb_n_evt(struct prestera_switch *sw,
+				 struct neighbour *n)
+{
+	struct prestera_kern_neigh_cache_key n_key;
+	struct prestera_kern_neigh_cache *n_cache;
+	int err;
+
+	err = prestera_util_neigh2nc_key(sw, n, &n_key);
+	if (err)
+		return;
+
+	n_cache = prestera_kern_neigh_cache_find(sw, &n_key);
+	if (!n_cache) {
+		n_cache = prestera_kern_neigh_cache_get(sw, &n_key);
+		if (!n_cache)
+			return;
+		__prestera_k_arb_nc_kern_fib_fetch(sw, n_cache);
+	}
+
+	__prestera_k_arb_nc_kern_n_fetch(sw, n_cache);
+	__prestera_k_arb_nc_apply(sw, n_cache);
+
+	prestera_kern_neigh_cache_put(sw, n_cache);
+}
+
+static void __prestera_k_arb_fib_evt2nc(struct prestera_switch *sw)
+{
+	struct prestera_kern_neigh_cache *n_cache;
+	struct rhashtable_iter iter;
+
+	rhashtable_walk_enter(&sw->router->kern_neigh_cache_ht, &iter);
+	rhashtable_walk_start(&iter);
+	while (1) {
+		n_cache = rhashtable_walk_next(&iter);
+
+		if (!n_cache)
+			break;
+
+		if (IS_ERR(n_cache))
+			continue;
+
+		rhashtable_walk_stop(&iter);
+		__prestera_k_arb_nc_kern_fib_fetch(sw, n_cache);
+		__prestera_k_arb_nc_apply(sw, n_cache);
+		rhashtable_walk_start(&iter);
+	}
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+}
+
 static int
 prestera_k_arb_fib_evt(struct prestera_switch *sw,
 		       bool replace, /* replace or del */
@@ -370,9 +1129,30 @@ prestera_k_arb_fib_evt(struct prestera_switch *sw,
 			dev_err(sw->dev->dev, "Applying fib_cache failed");
 	}
 
+	/* Update all neighs to resolve overlapped and apply related */
+	__prestera_k_arb_fib_evt2nc(sw);
+
 	return 0;
 }
 
+static void __prestera_k_arb_abort_neigh_ht_cb(void *ptr, void *arg)
+{
+	struct prestera_kern_neigh_cache *n_cache = ptr;
+	struct prestera_switch *sw = arg;
+
+	if (!list_empty(&n_cache->kern_fib_cache_list)) {
+		WARN_ON(1); /* BUG */
+		return;
+	}
+	__prestera_k_arb_n_offload_set(sw, n_cache, false);
+	n_cache->in_kernel = false;
+	/* No need to destroy lpm.
+	 * It will be aborted by destroy_ht
+	 */
+	__prestera_kern_neigh_cache_destruct(sw, n_cache);
+	kfree(n_cache);
+}
+
 static void __prestera_k_arb_abort_fib_ht_cb(void *ptr, void *arg)
 {
 	struct prestera_kern_fib_cache *fib_cache = ptr;
@@ -381,6 +1161,8 @@ static void __prestera_k_arb_abort_fib_ht_cb(void *ptr, void *arg)
 	__prestera_k_arb_fib_lpm_offload_set(sw, fib_cache,
 					     false, false,
 					     false);
+	__prestera_k_arb_fib_nh_offload_set(sw, fib_cache, NULL,
+					    false, false);
 	/* No need to destroy lpm.
 	 * It will be aborted by destroy_ht
 	 */
@@ -401,6 +1183,9 @@ static void prestera_k_arb_abort(struct prestera_switch *sw)
 	rhashtable_free_and_destroy(&sw->router->kern_fib_cache_ht,
 				    __prestera_k_arb_abort_fib_ht_cb,
 				    sw);
+	rhashtable_free_and_destroy(&sw->router->kern_neigh_cache_ht,
+				    __prestera_k_arb_abort_neigh_ht_cb,
+				    sw);
 }
 
 static int __prestera_inetaddr_port_event(struct net_device *port_dev,
@@ -615,12 +1400,13 @@ static void prestera_router_neigh_event_work(struct work_struct *work)
 {
 	struct prestera_netevent_work *net_work =
 		container_of(work, struct prestera_netevent_work, work);
+	struct prestera_switch *sw = net_work->sw;
 	struct neighbour *n = net_work->n;
 
 	/* neigh - its not hw related object. It stored only in kernel. So... */
 	rtnl_lock();
 
-	/* TODO: handler */
+	prestera_k_arb_n_evt(sw, n);
 
 	neigh_release(n);
 	rtnl_unlock();
@@ -676,6 +1462,11 @@ int prestera_router_init(struct prestera_switch *sw)
 	if (err)
 		goto err_kern_fib_cache_ht_init;
 
+	err = rhashtable_init(&router->kern_neigh_cache_ht,
+			      &__prestera_kern_neigh_cache_ht_params);
+	if (err)
+		goto err_kern_neigh_cache_ht_init;
+
 	nhgrp_cache_bytes = sw->size_tbl_router_nexthop / 8 + 1;
 	router->nhgrp_hw_state_cache = kzalloc(nhgrp_cache_bytes, GFP_KERNEL);
 	if (!router->nhgrp_hw_state_cache) {
@@ -715,6 +1506,8 @@ err_register_inetaddr_notifier:
 err_register_inetaddr_validator_notifier:
 	kfree(router->nhgrp_hw_state_cache);
 err_nh_state_cache_alloc:
+	rhashtable_destroy(&router->kern_neigh_cache_ht);
+err_kern_neigh_cache_ht_init:
 	rhashtable_destroy(&router->kern_fib_cache_ht);
 err_kern_fib_cache_ht_init:
 	prestera_router_hw_fini(sw);
-- 
cgit v1.2.3


From ae15ed6e40c9623e3a295a76cb164b4d3564a20b Mon Sep 17 00:00:00 2001
From: Yevhen Orlov <yevhen.orlov@plvision.eu>
Date: Sat, 1 Oct 2022 12:34:17 +0300
Subject: net: marvell: prestera: Propagate nh state from hw to kernel

We poll nexthops in HW and call for each active nexthop appropriate
neighbour.

Also we provide implicity neighbour resolving.
For example, user have added nexthop route:
  # ip route add 5.5.5.5 via 1.1.1.2
But neighbour 1.1.1.2 doesn't exist. In this case we will try to call
neigh_event_send, even if there is no traffic.
This is useful, when you have add route, which will be used after some
time but with a lot of traffic (burst). So, we has prepared, offloaded
route in advance.

Co-developed-by: Taras Chornyi <tchornyi@marvell.com>
Signed-off-by: Taras Chornyi <tchornyi@marvell.com>
Co-developed-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
Signed-off-by: Yevhen Orlov <yevhen.orlov@plvision.eu>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/marvell/prestera/prestera.h   |   3 +
 .../ethernet/marvell/prestera/prestera_router.c    | 111 +++++++++++++++++++++
 2 files changed, 114 insertions(+)

diff --git a/drivers/net/ethernet/marvell/prestera/prestera.h b/drivers/net/ethernet/marvell/prestera/prestera.h
index 540a36069b79..35554ee805cd 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera.h
+++ b/drivers/net/ethernet/marvell/prestera/prestera.h
@@ -324,6 +324,9 @@ struct prestera_router {
 	struct notifier_block netevent_nb;
 	u8 *nhgrp_hw_state_cache; /* Bitmap cached hw state of nhs */
 	unsigned long nhgrp_hw_cache_kick; /* jiffies */
+	struct {
+		struct delayed_work dw;
+	} neighs_update;
 };
 
 struct prestera_rxtx_params {
diff --git a/drivers/net/ethernet/marvell/prestera/prestera_router.c b/drivers/net/ethernet/marvell/prestera/prestera_router.c
index af7d24390d2e..4046be0e86ff 100644
--- a/drivers/net/ethernet/marvell/prestera/prestera_router.c
+++ b/drivers/net/ethernet/marvell/prestera/prestera_router.c
@@ -16,6 +16,9 @@
 #include "prestera.h"
 #include "prestera_router_hw.h"
 
+#define PRESTERA_IMPLICITY_RESOLVE_DEAD_NEIGH
+#define PRESTERA_NH_PROBE_INTERVAL 5000 /* ms */
+
 struct prestera_kern_neigh_cache_key {
 	struct prestera_ip_addr addr;
 	struct net_device *dev;
@@ -32,6 +35,7 @@ struct prestera_kern_neigh_cache {
 	/* Lock cache if neigh is present in kernel */
 	bool in_kernel;
 };
+
 struct prestera_kern_fib_cache_key {
 	struct prestera_ip_addr addr;
 	u32 prefix_len;
@@ -1021,6 +1025,78 @@ __prestera_k_arb_util_fib_overlapped(struct prestera_switch *sw,
 	return rfc;
 }
 
+static void __prestera_k_arb_hw_state_upd(struct prestera_switch *sw,
+					  struct prestera_kern_neigh_cache *nc)
+{
+	struct prestera_nh_neigh_key nh_key;
+	struct prestera_nh_neigh *nh_neigh;
+	struct neighbour *n;
+	bool hw_active;
+
+	prestera_util_nc_key2nh_key(&nc->key, &nh_key);
+	nh_neigh = prestera_nh_neigh_find(sw, &nh_key);
+	if (!nh_neigh) {
+		pr_err("Cannot find nh_neigh for cached %pI4n",
+		       &nc->key.addr.u.ipv4);
+		return;
+	}
+
+	hw_active = prestera_nh_neigh_util_hw_state(sw, nh_neigh);
+
+#ifdef PRESTERA_IMPLICITY_RESOLVE_DEAD_NEIGH
+	if (!hw_active && nc->in_kernel)
+		goto out;
+#else /* PRESTERA_IMPLICITY_RESOLVE_DEAD_NEIGH */
+	if (!hw_active)
+		goto out;
+#endif /* PRESTERA_IMPLICITY_RESOLVE_DEAD_NEIGH */
+
+	if (nc->key.addr.v == PRESTERA_IPV4) {
+		n = neigh_lookup(&arp_tbl, &nc->key.addr.u.ipv4,
+				 nc->key.dev);
+		if (!n)
+			n = neigh_create(&arp_tbl, &nc->key.addr.u.ipv4,
+					 nc->key.dev);
+	} else {
+		n = NULL;
+	}
+
+	if (!IS_ERR(n) && n) {
+		neigh_event_send(n, NULL);
+		neigh_release(n);
+	} else {
+		pr_err("Cannot create neighbour %pI4n", &nc->key.addr.u.ipv4);
+	}
+
+out:
+	return;
+}
+
+/* Propagate hw state to kernel */
+static void prestera_k_arb_hw_evt(struct prestera_switch *sw)
+{
+	struct prestera_kern_neigh_cache *n_cache;
+	struct rhashtable_iter iter;
+
+	rhashtable_walk_enter(&sw->router->kern_neigh_cache_ht, &iter);
+	rhashtable_walk_start(&iter);
+	while (1) {
+		n_cache = rhashtable_walk_next(&iter);
+
+		if (!n_cache)
+			break;
+
+		if (IS_ERR(n_cache))
+			continue;
+
+		rhashtable_walk_stop(&iter);
+		__prestera_k_arb_hw_state_upd(sw, n_cache);
+		rhashtable_walk_start(&iter);
+	}
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+}
+
 /* Propagate kernel event to hw */
 static void prestera_k_arb_n_evt(struct prestera_switch *sw,
 				 struct neighbour *n)
@@ -1441,6 +1517,34 @@ static int prestera_router_netevent_event(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
+static void prestera_router_update_neighs_work(struct work_struct *work)
+{
+	struct prestera_router *router;
+
+	router = container_of(work, struct prestera_router,
+			      neighs_update.dw.work);
+	rtnl_lock();
+
+	prestera_k_arb_hw_evt(router->sw);
+
+	rtnl_unlock();
+	prestera_queue_delayed_work(&router->neighs_update.dw,
+				    msecs_to_jiffies(PRESTERA_NH_PROBE_INTERVAL));
+}
+
+static int prestera_neigh_work_init(struct prestera_switch *sw)
+{
+	INIT_DELAYED_WORK(&sw->router->neighs_update.dw,
+			  prestera_router_update_neighs_work);
+	prestera_queue_delayed_work(&sw->router->neighs_update.dw, 0);
+	return 0;
+}
+
+static void prestera_neigh_work_fini(struct prestera_switch *sw)
+{
+	cancel_delayed_work_sync(&sw->router->neighs_update.dw);
+}
+
 int prestera_router_init(struct prestera_switch *sw)
 {
 	struct prestera_router *router;
@@ -1474,6 +1578,10 @@ int prestera_router_init(struct prestera_switch *sw)
 		goto err_nh_state_cache_alloc;
 	}
 
+	err = prestera_neigh_work_init(sw);
+	if (err)
+		goto err_neigh_work_init;
+
 	router->inetaddr_valid_nb.notifier_call = __prestera_inetaddr_valid_cb;
 	err = register_inetaddr_validator_notifier(&router->inetaddr_valid_nb);
 	if (err)
@@ -1504,6 +1612,8 @@ err_register_netevent_notifier:
 err_register_inetaddr_notifier:
 	unregister_inetaddr_validator_notifier(&router->inetaddr_valid_nb);
 err_register_inetaddr_validator_notifier:
+	prestera_neigh_work_fini(sw);
+err_neigh_work_init:
 	kfree(router->nhgrp_hw_state_cache);
 err_nh_state_cache_alloc:
 	rhashtable_destroy(&router->kern_neigh_cache_ht);
@@ -1522,6 +1632,7 @@ void prestera_router_fini(struct prestera_switch *sw)
 	unregister_netevent_notifier(&sw->router->netevent_nb);
 	unregister_inetaddr_notifier(&sw->router->inetaddr_nb);
 	unregister_inetaddr_validator_notifier(&sw->router->inetaddr_valid_nb);
+	prestera_neigh_work_fini(sw);
 	prestera_queue_drain();
 
 	prestera_k_arb_abort(sw);
-- 
cgit v1.2.3


From 4f526fef91b24197d489ff86789744c67f475bb4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 3 Oct 2022 20:26:08 -0400
Subject: [brown paperbag] fix coredump breakage

Let me count the ways in which I'd screwed up:

* when emitting a page, handling of gaps in coredump should happen
before fetching the current file position.
* fix for a problem that occurs on rather uncommon setups (and hadn't
been observed in the wild) had been sent very late in the cycle.
* ... with badly insufficient testing, introducing an easily
reproducible breakage.  Without giving it time to soak in -next.

Fucked-up-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: "J. R. Okajima" <hooanon05g@gmail.com>
Tested-by: "J. R. Okajima" <hooanon05g@gmail.com>
Fixes: 06bbaa6dc53c "[coredump] don't use __kernel_write() on kmap_local_page()"
Cc: stable@kernel.org	# v6.0-only
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coredump.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/coredump.c b/fs/coredump.c
index 1ab4f5b76a1e..3538f3a63965 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -841,7 +841,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	};
 	struct iov_iter iter;
 	struct file *file = cprm->file;
-	loff_t pos = file->f_pos;
+	loff_t pos;
 	ssize_t n;
 
 	if (cprm->to_skip) {
@@ -853,6 +853,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 		return 0;
 	if (dump_interrupted())
 		return 0;
+	pos = file->f_pos;
 	iov_iter_bvec(&iter, WRITE, &bvec, 1, PAGE_SIZE);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
-- 
cgit v1.2.3


From e9554b31aff011c4d0f11a4692d2d45c92cb508d Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:51:56 +0200
Subject: dt-bindings: net: phy: add PoDL PSE property

Add property to reference node representing a PoDL Power Sourcing Equipment.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/devicetree/bindings/net/ethernet-phy.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
index ed1415a4381f..ad808e9ce5b9 100644
--- a/Documentation/devicetree/bindings/net/ethernet-phy.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
@@ -144,6 +144,12 @@ properties:
       Mark the corresponding energy efficient ethernet mode as
       broken and request the ethernet to stop advertising it.
 
+  pses:
+    $ref: /schemas/types.yaml#/definitions/phandle-array
+    maxItems: 1
+    description:
+      Specifies a reference to a node representing a Power Sourcing Equipment.
+
   phy-is-integrated:
     $ref: /schemas/types.yaml#/definitions/flag
     description:
-- 
cgit v1.2.3


From 3114b075eb2531dea31a961944309485d6a53040 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:51:57 +0200
Subject: net: add framework to support Ethernet PSE and PDs devices

This framework was create with intention to provide support for Ethernet PSE
(Power Sourcing Equipment) and PDs (Powered Device).

At current step this patch implements generic PSE support for PoDL (Power over
Data Lines 802.3bu) specification with reserving name space for PD devices as
well.

This framework can be extended to support 802.3af and 802.3at "Power via the
Media Dependent Interface" (or PoE/Power over Ethernet)

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/Kconfig           |   2 +
 drivers/net/Makefile          |   1 +
 drivers/net/pse-pd/Kconfig    |  11 ++
 drivers/net/pse-pd/Makefile   |   4 +
 drivers/net/pse-pd/pse_core.c | 256 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/pse-pd/pse.h    |  67 +++++++++++
 6 files changed, 341 insertions(+)
 create mode 100644 drivers/net/pse-pd/Kconfig
 create mode 100644 drivers/net/pse-pd/Makefile
 create mode 100644 drivers/net/pse-pd/pse_core.c
 create mode 100644 include/linux/pse-pd/pse.h

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 94c889802566..15d4a38b1351 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -500,6 +500,8 @@ config NET_SB1000
 
 source "drivers/net/phy/Kconfig"
 
+source "drivers/net/pse-pd/Kconfig"
+
 source "drivers/net/can/Kconfig"
 
 source "drivers/net/mctp/Kconfig"
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 3f1192d3c52d..6ce076462dbf 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NET) += loopback.o
 obj-$(CONFIG_NETDEV_LEGACY_INIT) += Space.o
 obj-$(CONFIG_NETCONSOLE) += netconsole.o
 obj-y += phy/
+obj-y += pse-pd/
 obj-y += mdio/
 obj-y += pcs/
 obj-$(CONFIG_RIONET) += rionet.o
diff --git a/drivers/net/pse-pd/Kconfig b/drivers/net/pse-pd/Kconfig
new file mode 100644
index 000000000000..49c7f0bcff52
--- /dev/null
+++ b/drivers/net/pse-pd/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Ethernet Power Sourcing Equipment drivers
+#
+
+menuconfig PSE_CONTROLLER
+	bool "Ethernet Power Sourcing Equipment Support"
+	help
+	  Generic Power Sourcing Equipment Controller support.
+
+	  If unsure, say no.
diff --git a/drivers/net/pse-pd/Makefile b/drivers/net/pse-pd/Makefile
new file mode 100644
index 000000000000..cfa780c7801d
--- /dev/null
+++ b/drivers/net/pse-pd/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for Linux PSE drivers
+
+obj-$(CONFIG_PSE_CONTROLLER) += pse_core.o
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
new file mode 100644
index 000000000000..f431159fcc0b
--- /dev/null
+++ b/drivers/net/pse-pd/pse_core.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Framework for Ethernet Power Sourcing Equipment
+//
+// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+//
+
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/pse-pd/pse.h>
+
+static DEFINE_MUTEX(pse_list_mutex);
+static LIST_HEAD(pse_controller_list);
+
+/**
+ * struct pse_control - a PSE control
+ * @pcdev: a pointer to the PSE controller device
+ *         this PSE control belongs to
+ * @list: list entry for the pcdev's PSE controller list
+ * @id: ID of the PSE line in the PSE controller device
+ * @refcnt: Number of gets of this pse_control
+ */
+struct pse_control {
+	struct pse_controller_dev *pcdev;
+	struct list_head list;
+	unsigned int id;
+	struct kref refcnt;
+};
+
+/**
+ * of_pse_zero_xlate - dummy function for controllers with one only control
+ * @pcdev: a pointer to the PSE controller device
+ * @pse_spec: PSE line specifier as found in the device tree
+ *
+ * This static translation function is used by default if of_xlate in
+ * :c:type:`pse_controller_dev` is not set. It is useful for all PSE
+ * controllers with #pse-cells = <0>.
+ */
+static int of_pse_zero_xlate(struct pse_controller_dev *pcdev,
+			     const struct of_phandle_args *pse_spec)
+{
+	return 0;
+}
+
+/**
+ * of_pse_simple_xlate - translate pse_spec to the PSE line number
+ * @pcdev: a pointer to the PSE controller device
+ * @pse_spec: PSE line specifier as found in the device tree
+ *
+ * This static translation function is used by default if of_xlate in
+ * :c:type:`pse_controller_dev` is not set. It is useful for all PSE
+ * controllers with 1:1 mapping, where PSE lines can be indexed by number
+ * without gaps.
+ */
+static int of_pse_simple_xlate(struct pse_controller_dev *pcdev,
+			       const struct of_phandle_args *pse_spec)
+{
+	if (pse_spec->args[0] >= pcdev->nr_lines)
+		return -EINVAL;
+
+	return pse_spec->args[0];
+}
+
+/**
+ * pse_controller_register - register a PSE controller device
+ * @pcdev: a pointer to the initialized PSE controller device
+ */
+int pse_controller_register(struct pse_controller_dev *pcdev)
+{
+	if (!pcdev->of_xlate) {
+		if (pcdev->of_pse_n_cells == 0)
+			pcdev->of_xlate = of_pse_zero_xlate;
+		else if (pcdev->of_pse_n_cells == 1)
+			pcdev->of_xlate = of_pse_simple_xlate;
+	}
+
+	mutex_init(&pcdev->lock);
+	INIT_LIST_HEAD(&pcdev->pse_control_head);
+
+	mutex_lock(&pse_list_mutex);
+	list_add(&pcdev->list, &pse_controller_list);
+	mutex_unlock(&pse_list_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pse_controller_register);
+
+/**
+ * pse_controller_unregister - unregister a PSE controller device
+ * @pcdev: a pointer to the PSE controller device
+ */
+void pse_controller_unregister(struct pse_controller_dev *pcdev)
+{
+	mutex_lock(&pse_list_mutex);
+	list_del(&pcdev->list);
+	mutex_unlock(&pse_list_mutex);
+}
+EXPORT_SYMBOL_GPL(pse_controller_unregister);
+
+static void devm_pse_controller_release(struct device *dev, void *res)
+{
+	pse_controller_unregister(*(struct pse_controller_dev **)res);
+}
+
+/**
+ * devm_pse_controller_register - resource managed pse_controller_register()
+ * @dev: device that is registering this PSE controller
+ * @pcdev: a pointer to the initialized PSE controller device
+ *
+ * Managed pse_controller_register(). For PSE controllers registered by
+ * this function, pse_controller_unregister() is automatically called on
+ * driver detach. See pse_controller_register() for more information.
+ */
+int devm_pse_controller_register(struct device *dev,
+				 struct pse_controller_dev *pcdev)
+{
+	struct pse_controller_dev **pcdevp;
+	int ret;
+
+	pcdevp = devres_alloc(devm_pse_controller_release, sizeof(*pcdevp),
+			      GFP_KERNEL);
+	if (!pcdevp)
+		return -ENOMEM;
+
+	ret = pse_controller_register(pcdev);
+	if (ret) {
+		devres_free(pcdevp);
+		return ret;
+	}
+
+	*pcdevp = pcdev;
+	devres_add(dev, pcdevp);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_pse_controller_register);
+
+/* PSE control section */
+
+static void __pse_control_release(struct kref *kref)
+{
+	struct pse_control *psec = container_of(kref, struct pse_control,
+						  refcnt);
+
+	lockdep_assert_held(&pse_list_mutex);
+
+	module_put(psec->pcdev->owner);
+
+	list_del(&psec->list);
+	kfree(psec);
+}
+
+static void __pse_control_put_internal(struct pse_control *psec)
+{
+	lockdep_assert_held(&pse_list_mutex);
+
+	kref_put(&psec->refcnt, __pse_control_release);
+}
+
+/**
+ * pse_control_put - free the PSE control
+ * @psec: PSE control pointer
+ */
+void pse_control_put(struct pse_control *psec)
+{
+	if (IS_ERR_OR_NULL(psec))
+		return;
+
+	mutex_lock(&pse_list_mutex);
+	__pse_control_put_internal(psec);
+	mutex_unlock(&pse_list_mutex);
+}
+EXPORT_SYMBOL_GPL(pse_control_put);
+
+static struct pse_control *
+pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index)
+{
+	struct pse_control *psec;
+
+	lockdep_assert_held(&pse_list_mutex);
+
+	list_for_each_entry(psec, &pcdev->pse_control_head, list) {
+		if (psec->id == index) {
+			kref_get(&psec->refcnt);
+			return psec;
+		}
+	}
+
+	psec = kzalloc(sizeof(*psec), GFP_KERNEL);
+	if (!psec)
+		return ERR_PTR(-ENOMEM);
+
+	if (!try_module_get(pcdev->owner)) {
+		kfree(psec);
+		return ERR_PTR(-ENODEV);
+	}
+
+	psec->pcdev = pcdev;
+	list_add(&psec->list, &pcdev->pse_control_head);
+	psec->id = index;
+	kref_init(&psec->refcnt);
+
+	return psec;
+}
+
+struct pse_control *
+of_pse_control_get(struct device_node *node)
+{
+	struct pse_controller_dev *r, *pcdev;
+	struct of_phandle_args args;
+	struct pse_control *psec;
+	int psec_id;
+	int ret;
+
+	if (!node)
+		return ERR_PTR(-EINVAL);
+
+	ret = of_parse_phandle_with_args(node, "pses", "#pse-cells", 0, &args);
+	if (ret)
+		return ERR_PTR(ret);
+
+	mutex_lock(&pse_list_mutex);
+	pcdev = NULL;
+	list_for_each_entry(r, &pse_controller_list, list) {
+		if (args.np == r->dev->of_node) {
+			pcdev = r;
+			break;
+		}
+	}
+
+	if (!pcdev) {
+		psec = ERR_PTR(-EPROBE_DEFER);
+		goto out;
+	}
+
+	if (WARN_ON(args.args_count != pcdev->of_pse_n_cells)) {
+		psec = ERR_PTR(-EINVAL);
+		goto out;
+	}
+
+	psec_id = pcdev->of_xlate(pcdev, &args);
+	if (psec_id < 0) {
+		psec = ERR_PTR(psec_id);
+		goto out;
+	}
+
+	/* pse_list_mutex also protects the pcdev's pse_control list */
+	psec = pse_control_get_internal(pcdev, psec_id);
+
+out:
+	mutex_unlock(&pse_list_mutex);
+	of_node_put(args.np);
+
+	return psec;
+}
+EXPORT_SYMBOL_GPL(of_pse_control_get);
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
new file mode 100644
index 000000000000..3ba787a48b15
--- /dev/null
+++ b/include/linux/pse-pd/pse.h
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+ */
+#ifndef _LINUX_PSE_CONTROLLER_H
+#define _LINUX_PSE_CONTROLLER_H
+
+#include <linux/ethtool.h>
+#include <linux/list.h>
+#include <uapi/linux/ethtool.h>
+
+struct module;
+struct device_node;
+struct of_phandle_args;
+struct pse_control;
+
+/**
+ * struct pse_controller_dev - PSE controller entity that might
+ *                             provide multiple PSE controls
+ * @ops: a pointer to device specific struct pse_controller_ops
+ * @owner: kernel module of the PSE controller driver
+ * @list: internal list of PSE controller devices
+ * @pse_control_head: head of internal list of requested PSE controls
+ * @dev: corresponding driver model device struct
+ * @of_pse_n_cells: number of cells in PSE line specifiers
+ * @of_xlate: translation function to translate from specifier as found in the
+ *            device tree to id as given to the PSE control ops
+ * @nr_lines: number of PSE controls in this controller device
+ * @lock: Mutex for serialization access to the PSE controller
+ */
+struct pse_controller_dev {
+	const struct pse_controller_ops *ops;
+	struct module *owner;
+	struct list_head list;
+	struct list_head pse_control_head;
+	struct device *dev;
+	int of_pse_n_cells;
+	int (*of_xlate)(struct pse_controller_dev *pcdev,
+			const struct of_phandle_args *pse_spec);
+	unsigned int nr_lines;
+	struct mutex lock;
+};
+
+#if IS_ENABLED(CONFIG_PSE_CONTROLLER)
+int pse_controller_register(struct pse_controller_dev *pcdev);
+void pse_controller_unregister(struct pse_controller_dev *pcdev);
+struct device;
+int devm_pse_controller_register(struct device *dev,
+				 struct pse_controller_dev *pcdev);
+
+struct pse_control *of_pse_control_get(struct device_node *node);
+void pse_control_put(struct pse_control *psec);
+
+#else
+
+static inline struct pse_control *of_pse_control_get(struct device_node *node)
+{
+	return ERR_PTR(-ENOENT);
+}
+
+static inline void pse_control_put(struct pse_control *psec)
+{
+}
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From cfaa202a73eafaf91a3d0a86b5e5df006562f5c0 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:51:58 +0200
Subject: net: mdiobus: fwnode_mdiobus_register_phy() rework error handling

Rework error handling as preparation for PSE patch. This patch should
make it easier to extend this function.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio/fwnode_mdio.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c
index 1c1584fca632..7ff7349a27a2 100644
--- a/drivers/net/mdio/fwnode_mdio.c
+++ b/drivers/net/mdio/fwnode_mdio.c
@@ -110,8 +110,8 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 	else
 		phy = phy_device_create(bus, addr, phy_id, 0, NULL);
 	if (IS_ERR(phy)) {
-		unregister_mii_timestamper(mii_ts);
-		return PTR_ERR(phy);
+		rc = PTR_ERR(phy);
+		goto clean_mii_ts;
 	}
 
 	if (is_acpi_node(child)) {
@@ -125,17 +125,13 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 		/* All data is now stored in the phy struct, so register it */
 		rc = phy_device_register(phy);
 		if (rc) {
-			phy_device_free(phy);
 			fwnode_handle_put(phy->mdio.dev.fwnode);
-			return rc;
+			goto clean_phy;
 		}
 	} else if (is_of_node(child)) {
 		rc = fwnode_mdiobus_phy_device_register(bus, phy, child, addr);
-		if (rc) {
-			unregister_mii_timestamper(mii_ts);
-			phy_device_free(phy);
-			return rc;
-		}
+		if (rc)
+			goto clean_phy;
 	}
 
 	/* phy->mii_ts may already be defined by the PHY driver. A
@@ -145,5 +141,12 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 	if (mii_ts)
 		phy->mii_ts = mii_ts;
 	return 0;
+
+clean_phy:
+	phy_device_free(phy);
+clean_mii_ts:
+	unregister_mii_timestamper(mii_ts);
+
+	return rc;
 }
 EXPORT_SYMBOL(fwnode_mdiobus_register_phy);
-- 
cgit v1.2.3


From 5e82147de1cbd758bb280908daa39d95ed467538 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:51:59 +0200
Subject: net: mdiobus: search for PSE nodes by parsing PHY nodes.

Some PHYs can be linked with PSE (Power Sourcing Equipment), so search
for related nodes and attach it to the phydev.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/mdio/fwnode_mdio.c | 37 +++++++++++++++++++++++++++++++++++--
 drivers/net/phy/phy_device.c   |  2 ++
 include/linux/phy.h            |  2 ++
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mdio/fwnode_mdio.c b/drivers/net/mdio/fwnode_mdio.c
index 7ff7349a27a2..689e728345ce 100644
--- a/drivers/net/mdio/fwnode_mdio.c
+++ b/drivers/net/mdio/fwnode_mdio.c
@@ -10,10 +10,31 @@
 #include <linux/fwnode_mdio.h>
 #include <linux/of.h>
 #include <linux/phy.h>
+#include <linux/pse-pd/pse.h>
 
 MODULE_AUTHOR("Calvin Johnson <calvin.johnson@oss.nxp.com>");
 MODULE_LICENSE("GPL");
 
+static struct pse_control *
+fwnode_find_pse_control(struct fwnode_handle *fwnode)
+{
+	struct pse_control *psec;
+	struct device_node *np;
+
+	if (!IS_ENABLED(CONFIG_PSE_CONTROLLER))
+		return NULL;
+
+	np = to_of_node(fwnode);
+	if (!np)
+		return NULL;
+
+	psec = of_pse_control_get(np);
+	if (PTR_ERR(psec) == -ENOENT)
+		return NULL;
+
+	return psec;
+}
+
 static struct mii_timestamper *
 fwnode_find_mii_timestamper(struct fwnode_handle *fwnode)
 {
@@ -91,14 +112,21 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 				struct fwnode_handle *child, u32 addr)
 {
 	struct mii_timestamper *mii_ts = NULL;
+	struct pse_control *psec = NULL;
 	struct phy_device *phy;
 	bool is_c45 = false;
 	u32 phy_id;
 	int rc;
 
+	psec = fwnode_find_pse_control(child);
+	if (IS_ERR(psec))
+		return PTR_ERR(psec);
+
 	mii_ts = fwnode_find_mii_timestamper(child);
-	if (IS_ERR(mii_ts))
-		return PTR_ERR(mii_ts);
+	if (IS_ERR(mii_ts)) {
+		rc = PTR_ERR(mii_ts);
+		goto clean_pse;
+	}
 
 	rc = fwnode_property_match_string(child, "compatible",
 					  "ethernet-phy-ieee802.3-c45");
@@ -134,18 +162,23 @@ int fwnode_mdiobus_register_phy(struct mii_bus *bus,
 			goto clean_phy;
 	}
 
+	phy->psec = psec;
+
 	/* phy->mii_ts may already be defined by the PHY driver. A
 	 * mii_timestamper probed via the device tree will still have
 	 * precedence.
 	 */
 	if (mii_ts)
 		phy->mii_ts = mii_ts;
+
 	return 0;
 
 clean_phy:
 	phy_device_free(phy);
 clean_mii_ts:
 	unregister_mii_timestamper(mii_ts);
+clean_pse:
+	pse_control_put(psec);
 
 	return rc;
 }
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index a4f5f151014a..57849ac0384e 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -26,6 +26,7 @@
 #include <linux/netdevice.h>
 #include <linux/phy.h>
 #include <linux/phy_led_triggers.h>
+#include <linux/pse-pd/pse.h>
 #include <linux/property.h>
 #include <linux/sfp.h>
 #include <linux/skbuff.h>
@@ -991,6 +992,7 @@ EXPORT_SYMBOL(phy_device_register);
 void phy_device_remove(struct phy_device *phydev)
 {
 	unregister_mii_timestamper(phydev->mii_ts);
+	pse_control_put(phydev->psec);
 
 	device_del(&phydev->mdio.dev);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index d65fc76fe0ae..ddf66198f751 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -597,6 +597,7 @@ struct macsec_ops;
  * @master_slave_get: Current master/slave advertisement
  * @master_slave_state: Current master/slave configuration
  * @mii_ts: Pointer to time stamper callbacks
+ * @psec: Pointer to Power Sourcing Equipment control struct
  * @lock:  Mutex for serialization access to PHY
  * @state_queue: Work queue for state machine
  * @shared: Pointer to private data shared by phys in one package
@@ -715,6 +716,7 @@ struct phy_device {
 	struct phylink *phylink;
 	struct net_device *attached_dev;
 	struct mii_timestamper *mii_ts;
+	struct pse_control *psec;
 
 	u8 mdix;
 	u8 mdix_ctrl;
-- 
cgit v1.2.3


From 18ff0bcda6d1dd3d53b4ce3f03e61bf1a648f960 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:52:00 +0200
Subject: ethtool: add interface to interact with Ethernet Power Equipment

Add interface to support Power Sourcing Equipment. At current step it
provides generic way to address all variants of PSE devices as defined
in IEEE 802.3-2018 but support only objects specified for IEEE 802.3-2018 104.4
PoDL Power Sourcing Equipment (PSE).

Currently supported and mandatory objects are:
IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus
IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl

This is minimal interface needed to control PSE on each separate
ethernet port but it provides not all mandatory objects specified in
IEEE 802.3-2018.

Since "PoDL PSE" and "PSE" have similar names, but some different values
I decide to not merge them and keep separate naming schema. This should
allow as to be as close to IEEE 802.3 spec as possible and avoid name
conflicts in the future.

This implementation is connected to PHYs instead of MACs because PSE
auto classification can potentially interfere with PHY auto negotiation.
So, may be some extra PHY related initialization will be needed.

With WIP version of ethtools interaction with PSE capable link looks
as following:

$ ip l
...
5: t1l1@eth0: <BROADCAST,MULTICAST> ..
...

$ ethtool --show-pse t1l1
PSE attributs for t1l1:
PoDL PSE Admin State: disabled
PoDL PSE Power Detection Status: disabled

$ ethtool --set-pse t1l1 podl-pse-admin-control enable
$ ethtool --show-pse t1l1
PSE attributs for t1l1:
PoDL PSE Admin State: enabled
PoDL PSE Power Detection Status: delivering power

Signed-off-by: kernel test robot <lkp@intel.com>
Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Bagas Sanjaya <bagasdotme@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ethtool-netlink.rst |  59 +++++++++
 drivers/net/pse-pd/pse_core.c                |  58 +++++++++
 include/linux/pse-pd/pse.h                   |  62 +++++++++
 include/uapi/linux/ethtool.h                 |  45 +++++++
 include/uapi/linux/ethtool_netlink.h         |  16 +++
 net/ethtool/Makefile                         |   3 +-
 net/ethtool/common.h                         |   1 +
 net/ethtool/netlink.c                        |  17 +++
 net/ethtool/netlink.h                        |   4 +
 net/ethtool/pse-pd.c                         | 185 +++++++++++++++++++++++++++
 10 files changed, 449 insertions(+), 1 deletion(-)
 create mode 100644 net/ethtool/pse-pd.c

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 09fb1d5ba67f..d578b8bcd8a4 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -220,6 +220,8 @@ Userspace to kernel:
   ``ETHTOOL_MSG_PHC_VCLOCKS_GET``       get PHC virtual clocks info
   ``ETHTOOL_MSG_MODULE_SET``            set transceiver module parameters
   ``ETHTOOL_MSG_MODULE_GET``            get transceiver module parameters
+  ``ETHTOOL_MSG_PSE_SET``               set PSE parameters
+  ``ETHTOOL_MSG_PSE_GET``               get PSE parameters
   ===================================== =================================
 
 Kernel to userspace:
@@ -260,6 +262,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_STATS_GET_REPLY``          standard statistics
   ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY``    PHC virtual clocks info
   ``ETHTOOL_MSG_MODULE_GET_REPLY``         transceiver module parameters
+  ``ETHTOOL_MSG_PSE_GET_REPLY``            PSE parameters
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1627,6 +1630,62 @@ For SFF-8636 modules, low power mode is forced by the host according to table
 For CMIS modules, low power mode is forced by the host according to table 6-12
 in revision 5.0 of the specification.
 
+PSE_GET
+=======
+
+Gets PSE attributes.
+
+Request contents:
+
+  =====================================  ======  ==========================
+  ``ETHTOOL_A_PSE_HEADER``               nested  request header
+  =====================================  ======  ==========================
+
+Kernel response contents:
+
+  ======================================  ======  =============================
+  ``ETHTOOL_A_PSE_HEADER``                nested  reply header
+  ``ETHTOOL_A_PODL_PSE_ADMIN_STATE``         u32  Operational state of the PoDL
+                                                  PSE functions
+  ``ETHTOOL_A_PODL_PSE_PW_D_STATUS``         u32  power detection status of the
+                                                  PoDL PSE.
+  ======================================  ======  =============================
+
+When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` attribute identifies
+the operational state of the PoDL PSE functions.  The operational state of the
+PSE function can be changed using the ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL``
+action. This option is corresponding to ``IEEE 802.3-2018`` 30.15.1.1.2
+aPoDLPSEAdminState. Possible values are:
+
+.. kernel-doc:: include/uapi/linux/ethtool.h
+    :identifiers: ethtool_podl_pse_admin_state
+
+When set, the optional ``ETHTOOL_A_PODL_PSE_PW_D_STATUS`` attribute identifies
+the power detection status of the PoDL PSE.  The status depend on internal PSE
+state machine and automatic PD classification support. This option is
+corresponding to ``IEEE 802.3-2018`` 30.15.1.1.3 aPoDLPSEPowerDetectionStatus.
+Possible values are:
+
+.. kernel-doc:: include/uapi/linux/ethtool.h
+    :identifiers: ethtool_podl_pse_pw_d_status
+
+PSE_SET
+=======
+
+Sets PSE parameters.
+
+Request contents:
+
+  ======================================  ======  =============================
+  ``ETHTOOL_A_PSE_HEADER``                nested  request header
+  ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL``       u32  Control PoDL PSE Admin state
+  ======================================  ======  =============================
+
+When set, the optional ``ETHTOOL_A_PODL_PSE_ADMIN_CONTROL`` attribute is used
+to control PoDL PSE Admin functions. This option is implementing
+``IEEE 802.3-2018`` 30.15.1.2.1 acPoDLPSEAdminControl. See
+``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` for supported values.
+
 Request translation
 ===================
 
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index f431159fcc0b..146b81f08a89 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -254,3 +254,61 @@ out:
 	return psec;
 }
 EXPORT_SYMBOL_GPL(of_pse_control_get);
+
+/**
+ * pse_ethtool_get_status - get status of PSE control
+ * @psec: PSE control pointer
+ * @extack: extack for reporting useful error messages
+ * @status: struct to store PSE status
+ */
+int pse_ethtool_get_status(struct pse_control *psec,
+			   struct netlink_ext_ack *extack,
+			   struct pse_control_status *status)
+{
+	const struct pse_controller_ops *ops;
+	int err;
+
+	ops = psec->pcdev->ops;
+
+	if (!ops->ethtool_get_status) {
+		NL_SET_ERR_MSG(extack,
+			       "PSE driver does not support status report");
+		return -EOPNOTSUPP;
+	}
+
+	mutex_lock(&psec->pcdev->lock);
+	err = ops->ethtool_get_status(psec->pcdev, psec->id, extack, status);
+	mutex_unlock(&psec->pcdev->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pse_ethtool_get_status);
+
+/**
+ * pse_ethtool_set_config - set PSE control configuration
+ * @psec: PSE control pointer
+ * @extack: extack for reporting useful error messages
+ * @config: Configuration of the test to run
+ */
+int pse_ethtool_set_config(struct pse_control *psec,
+			   struct netlink_ext_ack *extack,
+			   const struct pse_control_config *config)
+{
+	const struct pse_controller_ops *ops;
+	int err;
+
+	ops = psec->pcdev->ops;
+
+	if (!ops->ethtool_set_config) {
+		NL_SET_ERR_MSG(extack,
+			       "PSE driver does not configuration");
+		return -EOPNOTSUPP;
+	}
+
+	mutex_lock(&psec->pcdev->lock);
+	err = ops->ethtool_set_config(psec->pcdev, psec->id, extack, config);
+	mutex_unlock(&psec->pcdev->lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(pse_ethtool_set_config);
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index 3ba787a48b15..fd1a916eeeba 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -9,6 +9,47 @@
 #include <linux/list.h>
 #include <uapi/linux/ethtool.h>
 
+struct phy_device;
+struct pse_controller_dev;
+
+/**
+ * struct pse_control_config - PSE control/channel configuration.
+ *
+ * @admin_cotrol: set PoDL PSE admin control as described in
+ *	IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl
+ */
+struct pse_control_config {
+	enum ethtool_podl_pse_admin_state admin_cotrol;
+};
+
+/**
+ * struct pse_control_status - PSE control/channel status.
+ *
+ * @podl_admin_state: operational state of the PoDL PSE
+ *	functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
+ * @podl_pw_status: power detection status of the PoDL PSE.
+ *	IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus:
+ */
+struct pse_control_status {
+	enum ethtool_podl_pse_admin_state podl_admin_state;
+	enum ethtool_podl_pse_pw_d_status podl_pw_status;
+};
+
+/**
+ * struct pse_controller_ops - PSE controller driver callbacks
+ *
+ * @ethtool_get_status: get PSE control status for ethtool interface
+ * @ethtool_set_config: set PSE control configuration over ethtool interface
+ */
+struct pse_controller_ops {
+	int (*ethtool_get_status)(struct pse_controller_dev *pcdev,
+		unsigned long id, struct netlink_ext_ack *extack,
+		struct pse_control_status *status);
+	int (*ethtool_set_config)(struct pse_controller_dev *pcdev,
+		unsigned long id, struct netlink_ext_ack *extack,
+		const struct pse_control_config *config);
+};
+
 struct module;
 struct device_node;
 struct of_phandle_args;
@@ -51,6 +92,13 @@ int devm_pse_controller_register(struct device *dev,
 struct pse_control *of_pse_control_get(struct device_node *node);
 void pse_control_put(struct pse_control *psec);
 
+int pse_ethtool_get_status(struct pse_control *psec,
+			   struct netlink_ext_ack *extack,
+			   struct pse_control_status *status);
+int pse_ethtool_set_config(struct pse_control *psec,
+			   struct netlink_ext_ack *extack,
+			   const struct pse_control_config *config);
+
 #else
 
 static inline struct pse_control *of_pse_control_get(struct device_node *node)
@@ -62,6 +110,20 @@ static inline void pse_control_put(struct pse_control *psec)
 {
 }
 
+int pse_ethtool_get_status(struct pse_control *psec,
+			   struct netlink_ext_ack *extack,
+			   struct pse_control_status *status)
+{
+	return -ENOTSUPP;
+}
+
+int pse_ethtool_set_config(struct pse_control *psec,
+			   struct netlink_ext_ack *extack,
+			   const struct pse_control_config *config)
+{
+	return -ENOTSUPP;
+}
+
 #endif
 
 #endif
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index fe9893d1485d..dc2aa3d75b39 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -736,6 +736,51 @@ enum ethtool_module_power_mode {
 	ETHTOOL_MODULE_POWER_MODE_HIGH,
 };
 
+/**
+ * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE
+ *	functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
+ * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are
+ * 	unknown
+ * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled
+ * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled
+ */
+enum ethtool_podl_pse_admin_state {
+	ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1,
+	ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED,
+	ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED,
+};
+
+/**
+ * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE.
+ *	IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus:
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is
+ *	asserted true when the PoDL PSE state diagram variable mr_pse_enable is
+ *	false"
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is
+ *	asserted true when either of the PSE state diagram variables
+ *	pi_detecting or pi_classifying is true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower”
+ *	is asserted true when the PoDL PSE state diagram variable pi_powered is
+ *	true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted
+ *	true when the PoDL PSE state diagram variable pi_sleeping is true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true
+ *	when the logical combination of the PoDL PSE state diagram variables
+ *	pi_prebiased*!pi_sleeping is true."
+ * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted
+ *	true when the PoDL PSE state diagram variable overload_held is true."
+ */
+enum ethtool_podl_pse_pw_d_status {
+	ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1,
+	ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED,
+	ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING,
+	ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING,
+	ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP,
+	ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE,
+	ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR,
+};
+
 /**
  * struct ethtool_gstrings - string set for data tagging
  * @cmd: Command number = %ETHTOOL_GSTRINGS
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 408a664fad59..bb57084ac524 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -49,6 +49,8 @@ enum {
 	ETHTOOL_MSG_PHC_VCLOCKS_GET,
 	ETHTOOL_MSG_MODULE_GET,
 	ETHTOOL_MSG_MODULE_SET,
+	ETHTOOL_MSG_PSE_GET,
+	ETHTOOL_MSG_PSE_SET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -94,6 +96,7 @@ enum {
 	ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
 	ETHTOOL_MSG_MODULE_GET_REPLY,
 	ETHTOOL_MSG_MODULE_NTF,
+	ETHTOOL_MSG_PSE_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -863,6 +866,19 @@ enum {
 	ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1)
 };
 
+/* Power Sourcing Equipment */
+enum {
+	ETHTOOL_A_PSE_UNSPEC,
+	ETHTOOL_A_PSE_HEADER,			/* nest - _A_HEADER_* */
+	ETHTOOL_A_PODL_PSE_ADMIN_STATE,		/* u32 */
+	ETHTOOL_A_PODL_PSE_ADMIN_CONTROL,	/* u32 */
+	ETHTOOL_A_PODL_PSE_PW_D_STATUS,		/* u32 */
+
+	/* add new constants above here */
+	__ETHTOOL_A_PSE_CNT,
+	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
+};
+
 /* generic netlink info */
 #define ETHTOOL_GENL_NAME "ethtool"
 #define ETHTOOL_GENL_VERSION 1
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index b76432e70e6b..72ab0944262a 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o
+		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \
+		   pse-pd.o
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index 2dc2b80aea5f..c1779657e074 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -46,6 +46,7 @@ int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max);
 int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info);
 
 extern const struct ethtool_phy_ops *ethtool_phy_ops;
+extern const struct ethtool_pse_ops *ethtool_pse_ops;
 
 int ethtool_get_module_info_call(struct net_device *dev,
 				 struct ethtool_modinfo *modinfo);
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index f4e41a6e0163..1a4c11356c96 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -286,6 +286,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_STATS_GET]		= &ethnl_stats_request_ops,
 	[ETHTOOL_MSG_PHC_VCLOCKS_GET]	= &ethnl_phc_vclocks_request_ops,
 	[ETHTOOL_MSG_MODULE_GET]	= &ethnl_module_request_ops,
+	[ETHTOOL_MSG_PSE_GET]		= &ethnl_pse_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -1023,6 +1024,22 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_module_set_policy,
 		.maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_PSE_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_pse_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1,
+	},
+	{
+		.cmd	= ETHTOOL_MSG_PSE_SET,
+		.flags	= GENL_UNS_ADMIN_PERM,
+		.doit	= ethnl_set_pse,
+		.policy = ethnl_pse_set_policy,
+		.maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index c0d587611854..1bfd374f9718 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
 extern const struct ethnl_request_ops ethnl_stats_request_ops;
 extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
 extern const struct ethnl_request_ops ethnl_module_request_ops;
+extern const struct ethnl_request_ops ethnl_pse_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -383,6 +384,8 @@ extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1
 extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1];
 extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1];
 extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1];
+extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1];
+extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
@@ -402,6 +405,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb);
 int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
 int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_module(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info);
 
 extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
 extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
new file mode 100644
index 000000000000..5a471e115b66
--- /dev/null
+++ b/net/ethtool/pse-pd.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// ethtool interface for for Ethernet PSE (Power Sourcing Equipment)
+// and PD (Powered Device)
+//
+// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+//
+
+#include "common.h"
+#include "linux/pse-pd/pse.h"
+#include "netlink.h"
+#include <linux/ethtool_netlink.h>
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+
+struct pse_req_info {
+	struct ethnl_req_info base;
+};
+
+struct pse_reply_data {
+	struct ethnl_reply_data	base;
+	struct pse_control_status status;
+};
+
+#define PSE_REPDATA(__reply_base) \
+	container_of(__reply_base, struct pse_reply_data, base)
+
+/* PSE_GET */
+
+const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = {
+	[ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int pse_get_pse_attributes(struct net_device *dev,
+				  struct netlink_ext_ack *extack,
+				  struct pse_reply_data *data)
+{
+	struct phy_device *phydev = dev->phydev;
+
+	if (!phydev) {
+		NL_SET_ERR_MSG(extack, "No PHY is attached");
+		return -EOPNOTSUPP;
+	}
+
+	if (!phydev->psec) {
+		NL_SET_ERR_MSG(extack, "No PSE is attached");
+		return -EOPNOTSUPP;
+	}
+
+	memset(&data->status, 0, sizeof(data->status));
+
+	return pse_ethtool_get_status(phydev->psec, extack, &data->status);
+}
+
+static int pse_prepare_data(const struct ethnl_req_info *req_base,
+			       struct ethnl_reply_data *reply_base,
+			       struct genl_info *info)
+{
+	struct pse_reply_data *data = PSE_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	ret = pse_get_pse_attributes(dev, info->extack, data);
+
+	ethnl_ops_complete(dev);
+
+	return ret;
+}
+
+static int pse_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct pse_reply_data *data = PSE_REPDATA(reply_base);
+	const struct pse_control_status *st = &data->status;
+	int len = 0;
+
+	if (st->podl_admin_state > 0)
+		len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */
+	if (st->podl_pw_status > 0)
+		len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */
+
+	return len;
+}
+
+static int pse_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct pse_reply_data *data = PSE_REPDATA(reply_base);
+	const struct pse_control_status *st = &data->status;
+
+	if (st->podl_admin_state > 0 &&
+	    nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE,
+			st->podl_admin_state))
+		return -EMSGSIZE;
+
+	if (st->podl_pw_status > 0 &&
+	    nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS,
+			st->podl_pw_status))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+const struct ethnl_request_ops ethnl_pse_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_PSE_GET,
+	.reply_cmd		= ETHTOOL_MSG_PSE_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_PSE_HEADER,
+	.req_info_size		= sizeof(struct pse_req_info),
+	.reply_data_size	= sizeof(struct pse_reply_data),
+
+	.prepare_data		= pse_prepare_data,
+	.reply_size		= pse_reply_size,
+	.fill_reply		= pse_fill_reply,
+};
+
+/* PSE_SET */
+
+const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = {
+	[ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] =
+		NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED,
+				 ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED),
+};
+
+static int pse_set_pse_config(struct net_device *dev,
+			      struct netlink_ext_ack *extack,
+			      struct nlattr **tb)
+{
+	struct phy_device *phydev = dev->phydev;
+	struct pse_control_config config = {};
+
+	/* Optional attribute. Do not return error if not set. */
+	if (!tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL])
+		return 0;
+
+	/* this values are already validated by the ethnl_pse_set_policy */
+	config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]);
+
+	if (!phydev) {
+		NL_SET_ERR_MSG(extack, "No PHY is attached");
+		return -EOPNOTSUPP;
+	}
+
+	if (!phydev->psec) {
+		NL_SET_ERR_MSG(extack, "No PSE is attached");
+		return -EOPNOTSUPP;
+	}
+
+	return pse_ethtool_set_config(phydev->psec, extack, &config);
+}
+
+int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ethnl_req_info req_info = {};
+	struct nlattr **tb = info->attrs;
+	struct net_device *dev;
+	int ret;
+
+	ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_PSE_HEADER],
+					 genl_info_net(info), info->extack,
+					 true);
+	if (ret < 0)
+		return ret;
+
+	dev = req_info.dev;
+
+	rtnl_lock();
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto out_rtnl;
+
+	ret = pse_set_pse_config(dev, info->extack, tb);
+	ethnl_ops_complete(dev);
+out_rtnl:
+	rtnl_unlock();
+
+	ethnl_parse_header_dev_put(&req_info);
+
+	return ret;
+}
-- 
cgit v1.2.3


From f05dfdaf567aaa482e6e4474bbf5993c5ffffc49 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:52:01 +0200
Subject: dt-bindings: net: pse-dt: add bindings for regulator based PoDL PSE
 controller

Add bindings for the regulator based Ethernet PoDL PSE controller and
generic bindings for all PSE controllers.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../bindings/net/pse-pd/podl-pse-regulator.yaml    | 40 ++++++++++++++++++++++
 .../bindings/net/pse-pd/pse-controller.yaml        | 33 ++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/pse-pd/podl-pse-regulator.yaml
 create mode 100644 Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml

diff --git a/Documentation/devicetree/bindings/net/pse-pd/podl-pse-regulator.yaml b/Documentation/devicetree/bindings/net/pse-pd/podl-pse-regulator.yaml
new file mode 100644
index 000000000000..c6b1c188abf7
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/pse-pd/podl-pse-regulator.yaml
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/pse-pd/podl-pse-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Regulator based Power Sourcing Equipment
+
+maintainers:
+  - Oleksij Rempel <o.rempel@pengutronix.de>
+
+description: Regulator based PoDL PSE controller. The device must be referenced
+  by the PHY node to control power injection to the Ethernet cable.
+
+allOf:
+  - $ref: "pse-controller.yaml#"
+
+properties:
+  compatible:
+    const: podl-pse-regulator
+
+  '#pse-cells':
+    const: 0
+
+  pse-supply:
+    description: Power supply for the PSE controller
+
+additionalProperties: false
+
+required:
+  - compatible
+  - pse-supply
+
+examples:
+  - |
+    ethernet-pse {
+      compatible = "podl-pse-regulator";
+      pse-supply = <&reg_t1l1>;
+      #pse-cells = <0>;
+    };
diff --git a/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml
new file mode 100644
index 000000000000..b110abb42597
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/pse-pd/pse-controller.yaml
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/pse-pd/pse-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Power Sourcing Equipment (PSE).
+
+description: Binding for the Power Sourcing Equipment (PSE) as defined in the
+  IEEE 802.3 specification. It is designed for hardware which is delivering
+  power over twisted pair/ethernet cable. The ethernet-pse nodes should be
+  used to describe PSE controller and referenced by the ethernet-phy node.
+
+maintainers:
+  - Oleksij Rempel <o.rempel@pengutronix.de>
+
+properties:
+  $nodename:
+    pattern: "^ethernet-pse(@.*)?$"
+
+  "#pse-cells":
+    description:
+      Used to uniquely identify a PSE instance within an IC. Will be
+      0 on PSE nodes with only a single output and at least 1 on nodes
+      controlling several outputs.
+    enum: [0, 1]
+
+required:
+  - "#pse-cells"
+
+additionalProperties: true
+
+...
-- 
cgit v1.2.3


From 66741b4e94ca7bb162063fa930c286619e719bce Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Mon, 3 Oct 2022 08:52:02 +0200
Subject: net: pse-pd: add regulator based PSE driver

Add generic, regulator based PSE driver to support simple Power Sourcing
Equipment without automatic classification support.

This driver was tested on 10Bast-T1L switch with regulator based PoDL PSE.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/Kconfig         |  11 +++
 drivers/net/pse-pd/Makefile        |   2 +
 drivers/net/pse-pd/pse_regulator.c | 147 +++++++++++++++++++++++++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 drivers/net/pse-pd/pse_regulator.c

diff --git a/drivers/net/pse-pd/Kconfig b/drivers/net/pse-pd/Kconfig
index 49c7f0bcff52..73d163704068 100644
--- a/drivers/net/pse-pd/Kconfig
+++ b/drivers/net/pse-pd/Kconfig
@@ -9,3 +9,14 @@ menuconfig PSE_CONTROLLER
 	  Generic Power Sourcing Equipment Controller support.
 
 	  If unsure, say no.
+
+if PSE_CONTROLLER
+
+config PSE_REGULATOR
+	tristate "Regulator based PSE controller"
+	help
+	  This module provides support for simple regulator based Ethernet Power
+	  Sourcing Equipment without automatic classification support. For
+	  example for basic implementation of PoDL (802.3bu) specification.
+
+endif
diff --git a/drivers/net/pse-pd/Makefile b/drivers/net/pse-pd/Makefile
index cfa780c7801d..1b8aa4c70f0b 100644
--- a/drivers/net/pse-pd/Makefile
+++ b/drivers/net/pse-pd/Makefile
@@ -2,3 +2,5 @@
 # Makefile for Linux PSE drivers
 
 obj-$(CONFIG_PSE_CONTROLLER) += pse_core.o
+
+obj-$(CONFIG_PSE_REGULATOR) += pse_regulator.o
diff --git a/drivers/net/pse-pd/pse_regulator.c b/drivers/net/pse-pd/pse_regulator.c
new file mode 100644
index 000000000000..e2bf8306ca90
--- /dev/null
+++ b/drivers/net/pse-pd/pse_regulator.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// Driver for the regulator based Ethernet Power Sourcing Equipment, without
+// auto classification support.
+//
+// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+//
+
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pse-pd/pse.h>
+#include <linux/regulator/consumer.h>
+
+struct pse_reg_priv {
+	struct pse_controller_dev pcdev;
+	struct regulator *ps; /*power source */
+	enum ethtool_podl_pse_admin_state admin_state;
+};
+
+static struct pse_reg_priv *to_pse_reg(struct pse_controller_dev *pcdev)
+{
+	return container_of(pcdev, struct pse_reg_priv, pcdev);
+}
+
+static int
+pse_reg_ethtool_set_config(struct pse_controller_dev *pcdev, unsigned long id,
+			   struct netlink_ext_ack *extack,
+			   const struct pse_control_config *config)
+{
+	struct pse_reg_priv *priv = to_pse_reg(pcdev);
+	int ret;
+
+	if (priv->admin_state == config->admin_cotrol)
+		return 0;
+
+	switch (config->admin_cotrol) {
+	case ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED:
+		ret = regulator_enable(priv->ps);
+		break;
+	case ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED:
+		ret = regulator_disable(priv->ps);
+		break;
+	default:
+		dev_err(pcdev->dev, "Unknown admin state %i\n",
+			config->admin_cotrol);
+		ret = -ENOTSUPP;
+	}
+
+	if (ret)
+		return ret;
+
+	priv->admin_state = config->admin_cotrol;
+
+	return 0;
+}
+
+static int
+pse_reg_ethtool_get_status(struct pse_controller_dev *pcdev, unsigned long id,
+			   struct netlink_ext_ack *extack,
+			   struct pse_control_status *status)
+{
+	struct pse_reg_priv *priv = to_pse_reg(pcdev);
+	int ret;
+
+	ret = regulator_is_enabled(priv->ps);
+	if (ret < 0)
+		return ret;
+
+	if (!ret)
+		status->podl_pw_status = ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED;
+	else
+		status->podl_pw_status =
+			ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING;
+
+	status->podl_admin_state = priv->admin_state;
+
+	return 0;
+}
+
+static const struct pse_controller_ops pse_reg_ops = {
+	.ethtool_get_status = pse_reg_ethtool_get_status,
+	.ethtool_set_config = pse_reg_ethtool_set_config,
+};
+
+static int
+pse_reg_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct pse_reg_priv *priv;
+	int ret;
+
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	if (!pdev->dev.of_node)
+		return -ENOENT;
+
+	priv->ps = devm_regulator_get_exclusive(dev, "pse");
+	if (IS_ERR(priv->ps))
+		return dev_err_probe(dev, PTR_ERR(priv->ps),
+				     "failed to get PSE regulator.\n");
+
+	platform_set_drvdata(pdev, priv);
+
+	ret = regulator_is_enabled(priv->ps);
+	if (ret < 0)
+		return ret;
+
+	if (ret)
+		priv->admin_state = ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED;
+	else
+		priv->admin_state = ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED;
+
+	priv->pcdev.owner = THIS_MODULE;
+	priv->pcdev.ops = &pse_reg_ops;
+	priv->pcdev.dev = dev;
+	ret = devm_pse_controller_register(dev, &priv->pcdev);
+	if (ret) {
+		dev_err(dev, "failed to register PSE controller (%pe)\n",
+			ERR_PTR(ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+static const __maybe_unused struct of_device_id pse_reg_of_match[] = {
+	{ .compatible = "podl-pse-regulator", },
+	{ },
+};
+MODULE_DEVICE_TABLE(of, pse_reg_of_match);
+
+static struct platform_driver pse_reg_driver = {
+	.probe		= pse_reg_probe,
+	.driver		= {
+		.name		= "PSE regulator",
+		.of_match_table = of_match_ptr(pse_reg_of_match),
+	},
+};
+module_platform_driver(pse_reg_driver);
+
+MODULE_AUTHOR("Oleksij Rempel <kernel@pengutronix.de>");
+MODULE_DESCRIPTION("regulator based Ethernet Power Sourcing Equipment");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:pse-regulator");
-- 
cgit v1.2.3


From 2a4187f4406ec3236f8b9d0d5150d2bf8d021b68 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 3 Oct 2022 20:14:13 +0200
Subject: once: rename _SLOW to _SLEEPABLE

The _SLOW designation wasn't really descriptive of anything. This is
meant to be called from process context when it's possible to sleep. So
name this more aptly _SLEEPABLE, which better fits its intended use.

Fixes: 62c07983bef9 ("once: add DO_ONCE_SLOW() for sleepable contexts")
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20221003181413.1221968-1-Jason@zx2c4.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/once.h       | 38 +++++++++++++++++++-------------------
 lib/once.c                 | 10 +++++-----
 net/ipv4/inet_hashtables.c |  4 ++--
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/linux/once.h b/include/linux/once.h
index 176ab75b42df..bc714d414448 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -13,9 +13,9 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 		    unsigned long *flags, struct module *mod);
 
 /* Variant for process contexts only. */
-bool __do_once_slow_start(bool *done);
-void __do_once_slow_done(bool *done, struct static_key_true *once_key,
-			 struct module *mod);
+bool __do_once_sleepable_start(bool *done);
+void __do_once_sleepable_done(bool *done, struct static_key_true *once_key,
+			      struct module *mod);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
  * a function call such as initialization of random seeds, etc, only
@@ -61,26 +61,26 @@ void __do_once_slow_done(bool *done, struct static_key_true *once_key,
 	})
 
 /* Variant of DO_ONCE() for process/sleepable contexts. */
-#define DO_ONCE_SLOW(func, ...)						     \
-	({								     \
-		bool ___ret = false;					     \
-		static bool __section(".data.once") ___done = false;	     \
-		static DEFINE_STATIC_KEY_TRUE(___once_key);		     \
-		if (static_branch_unlikely(&___once_key)) {		     \
-			___ret = __do_once_slow_start(&___done);	     \
-			if (unlikely(___ret)) {				     \
-				func(__VA_ARGS__);			     \
-				__do_once_slow_done(&___done, &___once_key,  \
-						    THIS_MODULE);	     \
-			}						     \
-		}							     \
-		___ret;							     \
+#define DO_ONCE_SLEEPABLE(func, ...)						\
+	({									\
+		bool ___ret = false;						\
+		static bool __section(".data.once") ___done = false;		\
+		static DEFINE_STATIC_KEY_TRUE(___once_key);			\
+		if (static_branch_unlikely(&___once_key)) {			\
+			___ret = __do_once_sleepable_start(&___done);		\
+			if (unlikely(___ret)) {					\
+				func(__VA_ARGS__);				\
+				__do_once_sleepable_done(&___done, &___once_key,\
+						    THIS_MODULE);		\
+			}							\
+		}								\
+		___ret;								\
 	})
 
 #define get_random_once(buf, nbytes)					     \
 	DO_ONCE(get_random_bytes, (buf), (nbytes))
 
-#define get_random_slow_once(buf, nbytes)				     \
-	DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes))
+#define get_random_sleepable_once(buf, nbytes)				     \
+	DO_ONCE_SLEEPABLE(get_random_bytes, (buf), (nbytes))
 
 #endif /* _LINUX_ONCE_H */
diff --git a/lib/once.c b/lib/once.c
index 351f66aad310..2c306f0e891e 100644
--- a/lib/once.c
+++ b/lib/once.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(__do_once_done);
 
 static DEFINE_MUTEX(once_mutex);
 
-bool __do_once_slow_start(bool *done)
+bool __do_once_sleepable_start(bool *done)
 	__acquires(once_mutex)
 {
 	mutex_lock(&once_mutex);
@@ -77,7 +77,7 @@ bool __do_once_slow_start(bool *done)
 		mutex_unlock(&once_mutex);
 		/* Keep sparse happy by restoring an even lock count on
 		 * this mutex. In case we return here, we don't call into
-		 * __do_once_done but return early in the DO_ONCE_SLOW() macro.
+		 * __do_once_done but return early in the DO_ONCE_SLEEPABLE() macro.
 		 */
 		__acquire(once_mutex);
 		return false;
@@ -85,9 +85,9 @@ bool __do_once_slow_start(bool *done)
 
 	return true;
 }
-EXPORT_SYMBOL(__do_once_slow_start);
+EXPORT_SYMBOL(__do_once_sleepable_start);
 
-void __do_once_slow_done(bool *done, struct static_key_true *once_key,
+void __do_once_sleepable_done(bool *done, struct static_key_true *once_key,
 			 struct module *mod)
 	__releases(once_mutex)
 {
@@ -95,4 +95,4 @@ void __do_once_slow_done(bool *done, struct static_key_true *once_key,
 	mutex_unlock(&once_mutex);
 	once_disable_jump(once_key, mod);
 }
-EXPORT_SYMBOL(__do_once_slow_done);
+EXPORT_SYMBOL(__do_once_sleepable_done);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index dc1c5629cd0d..a0ad34e4f044 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -958,8 +958,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	if (likely(remaining > 1))
 		remaining &= ~1U;
 
-	get_random_slow_once(table_perturb,
-			     INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
+	get_random_sleepable_once(table_perturb,
+				  INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
 	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
 
 	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
-- 
cgit v1.2.3


From 681bf011b9b5989c6e9db6beb64494918aab9a43 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 3 Oct 2022 21:03:27 -0700
Subject: eth: pse: add missing static inlines

build bot reports missing 'static inline' qualifiers in the header.

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment")
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://lore.kernel.org/r/20221004040327.2034878-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/pse-pd/pse.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index fd1a916eeeba..fb724c65c77b 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -110,16 +110,16 @@ static inline void pse_control_put(struct pse_control *psec)
 {
 }
 
-int pse_ethtool_get_status(struct pse_control *psec,
-			   struct netlink_ext_ack *extack,
-			   struct pse_control_status *status)
+static inline int pse_ethtool_get_status(struct pse_control *psec,
+					 struct netlink_ext_ack *extack,
+					 struct pse_control_status *status)
 {
 	return -ENOTSUPP;
 }
 
-int pse_ethtool_set_config(struct pse_control *psec,
-			   struct netlink_ext_ack *extack,
-			   const struct pse_control_config *config)
+static inline int pse_ethtool_set_config(struct pse_control *psec,
+					 struct netlink_ext_ack *extack,
+					 const struct pse_control_config *config)
 {
 	return -ENOTSUPP;
 }
-- 
cgit v1.2.3


From 6b430f72b2bc14fd0ac922dda92eaa51c82e15a4 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 27 Sep 2022 11:38:23 +0200
Subject: wifi: mt76: fix rate reporting / throughput regression on mt7915 and
 newer

mt7915 and newer need to report the rate_info that's stored in wcid->rate,
since they don't fill info->status.rates.

Cc: Jonas Jelonek <jelonek.jonas@gmail.com>
Reported-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Link: https://lore.kernel.org/all/CABXGCsP0znm9pS-MiKtyxTXR7XiyFVqen0qzNpicGHDZKCzbwg@mail.gmail.com/
Fixes: 44fa75f207d8 ("mac80211: extend current rate control tx status API")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Tested-by: Mikhail Gavrilov <mikhail.v.gavrilov@gmail.com>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://lore.kernel.org/r/20220927093823.6007-1-nbd@nbd.name
---
 drivers/net/wireless/mediatek/mt76/tx.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c
index e67cc7909bce..6c054850363f 100644
--- a/drivers/net/wireless/mediatek/mt76/tx.c
+++ b/drivers/net/wireless/mediatek/mt76/tx.c
@@ -60,14 +60,20 @@ mt76_tx_status_unlock(struct mt76_dev *dev, struct sk_buff_head *list)
 			.skb = skb,
 			.info = IEEE80211_SKB_CB(skb),
 		};
+		struct ieee80211_rate_status rs = {};
 		struct mt76_tx_cb *cb = mt76_tx_skb_cb(skb);
 		struct mt76_wcid *wcid;
 
 		wcid = rcu_dereference(dev->wcid[cb->wcid]);
 		if (wcid) {
 			status.sta = wcid_to_sta(wcid);
-			status.rates = NULL;
-			status.n_rates = 0;
+			if (status.sta && (wcid->rate.flags || wcid->rate.legacy)) {
+				rs.rate_idx = wcid->rate;
+				status.rates = &rs;
+				status.n_rates = 1;
+			} else {
+				status.n_rates = 0;
+			}
 		}
 
 		hw = mt76_tx_status_get_hw(dev, skb);
-- 
cgit v1.2.3


From f5369dcf5c0a76260cd301bd5c25d59c451d62c1 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Oct 2022 11:05:09 +0200
Subject: wifi: mac80211: do not drop packets smaller than the LLC-SNAP header
 on fast-rx

Since STP TCN frames are only 7 bytes, the pskb_may_pull call returns an error.
Instead of dropping those packets, bump them back to the slow path for proper
processing.

Fixes: 49ddf8e6e234 ("mac80211: add fast-rx path")
Reported-by: Chad Monroe <chad.monroe@smartrg.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index bd215fe3c796..333adad47482 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4708,7 +4708,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
 
 	if (!(status->rx_flags & IEEE80211_RX_AMSDU)) {
 		if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
-			goto drop;
+			return false;
 
 		payload = (void *)(skb->data + snap_offs);
 
-- 
cgit v1.2.3


From b650009fcb701ea99aa133bbe18dbfc5305ddf1a Mon Sep 17 00:00:00 2001
From: James Prestwood <prestwoj@gmail.com>
Date: Wed, 28 Sep 2022 15:49:10 -0700
Subject: wifi: mac80211: fix probe req HE capabilities access

When building the probe request IEs HE support is checked for
the 6GHz band (wiphy->bands[NL80211_BAND_6GHZ]). If supported
the HE capability IE should be included according to the spec.
The problem is the 16-bit capability is obtained from the
band object (sband) that was passed in, not the 6GHz band
object (sband6). If the sband object doesn't support HE it will
result in a warning.

Fixes: 7d29bc50b30e ("mac80211: always include HE 6GHz capability in probe request")
Signed-off-by: James Prestwood <prestwoj@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index bf7461c41bef..1e929b82deef 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2046,7 +2046,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata,
 		if (he_cap) {
 			enum nl80211_iftype iftype =
 				ieee80211_vif_type_p2p(&sdata->vif);
-			__le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype);
+			__le16 cap = ieee80211_get_he_6ghz_capa(sband6, iftype);
 
 			pos = ieee80211_write_he_6ghz_cap(pos, cap, end);
 		}
-- 
cgit v1.2.3


From 092197f1f47f8359b46ea62445d87561949b577d Mon Sep 17 00:00:00 2001
From: James Prestwood <prestwoj@gmail.com>
Date: Thu, 15 Sep 2022 12:55:53 -0700
Subject: wifi: mac80211: remove/avoid misleading prints

At some point a few kernel debug prints started appearing which
indicated something was sending invalid IEs:

"bad VHT capabilities, disabling VHT"
"Invalid HE elem, Disable HE"

Turns out these were being printed because the local hardware
supported HE/VHT but the peer/AP did not. Bad/invalid indicates,
to me at least, that the IE is in some way malformed, not missing.

For the HE print (ieee80211_verify_peer_he_mcs_support) it will
now silently fail if the HE capability element is missing (still
prints if the element size is wrong).

For the VHT print, it has been removed completely and will silently
set the DISABLE_VHT flag which is consistent with how DISABLE_HT
is set.

Signed-off-by: James Prestwood <prestwoj@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 54b8d5065bbd..d8484cd870de 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4409,8 +4409,11 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata,
 	he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY,
 					     ies->data, ies->len);
 
+	if (!he_cap_elem)
+		return false;
+
 	/* invalid HE IE */
-	if (!he_cap_elem || he_cap_elem->datalen < 1 + sizeof(*he_cap)) {
+	if (he_cap_elem->datalen < 1 + sizeof(*he_cap)) {
 		sdata_info(sdata,
 			   "Invalid HE elem, Disable HE\n");
 		return false;
@@ -4676,8 +4679,6 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
 		}
 
 		if (!elems->vht_cap_elem) {
-			sdata_info(sdata,
-				   "bad VHT capabilities, disabling VHT\n");
 			*conn_flags |= IEEE80211_CONN_DISABLE_VHT;
 			vht_oper = NULL;
 		}
-- 
cgit v1.2.3


From ceb3d688f92231e9d9e663c56a1c8bee90140bad Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Mon, 12 Sep 2022 18:07:16 +0300
Subject: wifi: mac80211: unlock on error in
 ieee80211_can_powered_addr_change()

Unlock before returning -EOPNOTSUPP.

Fixes: 3c06e91b40db ("wifi: mac80211: Support POWERED_ADDR_CHANGE feature")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 572254366a0f..b15afa77b87c 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -243,7 +243,7 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata
 		 */
 		break;
 	default:
-		return -EOPNOTSUPP;
+		ret = -EOPNOTSUPP;
 	}
 
 unlock:
-- 
cgit v1.2.3


From 3bf9e30e493356912f9cb600f59b51133680639e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 1 Oct 2022 12:01:13 +0200
Subject: wifi: mac80211: fix decap offload for stations on AP_VLAN interfaces

Since AP_VLAN interfaces are not passed to the driver, check offload_flags
on the bss vif instead.

Reported-by: Howard Hsu <howard-yh.hsu@mediatek.com>
Fixes: 80a915ec4427 ("mac80211: add rx decapsulation offload support")
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 333adad47482..589521717c35 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4352,6 +4352,7 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 		.vif_type = sdata->vif.type,
 		.control_port_protocol = sdata->control_port_protocol,
 	}, *old, *new = NULL;
+	u32 offload_flags;
 	bool set_offload = false;
 	bool assign = false;
 	bool offload;
@@ -4467,10 +4468,10 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 	if (assign)
 		new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL);
 
-	offload = assign &&
-		  (sdata->vif.offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED);
+	offload_flags = get_bss_sdata(sdata)->vif.offload_flags;
+	offload = offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED;
 
-	if (offload)
+	if (assign && offload)
 		set_offload = !test_and_set_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD);
 	else
 		set_offload = test_and_clear_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD);
-- 
cgit v1.2.3


From c95014e1d05b5acfd9e6fbe5d1f048b07c6902ff Mon Sep 17 00:00:00 2001
From: Alexander Wetzel <alexander@wetzel-home.de>
Date: Tue, 20 Sep 2022 17:55:41 +0200
Subject: wifi: mac80211: netdev compatible TX stop for iTXQ drivers

Properly handle TX stop for internal queues (iTXQs) within mac80211.

mac80211 must not stop netdev queues when using mac80211 iTXQs.
For these drivers the netdev interface is created with IFF_NO_QUEUE.

While netdev still drops frames for IFF_NO_QUEUE interfaces when we stop
the netdev queues, it also prints a warning when this happens:
Assuming the mac80211 interface is called wlan0 we would get
"Virtual device wlan0 asks to queue packet!" when netdev has to drop a
frame.

This patch is keeping the harmless netdev queue starts for iTXQ drivers.

Signed-off-by: Alexander Wetzel <alexander@wetzel-home.de>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c |  6 +++---
 net/mac80211/tx.c    | 10 ++++++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index b15afa77b87c..dd9ac1f7d2ea 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -461,7 +461,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
 	/*
 	 * Stop TX on this interface first.
 	 */
-	if (sdata->dev)
+	if (!local->ops->wake_tx_queue && sdata->dev)
 		netif_tx_stop_all_queues(sdata->dev);
 
 	ieee80211_roc_purge(local, sdata);
@@ -1412,8 +1412,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 			sdata->vif.type != NL80211_IFTYPE_STATION);
 	}
 
-	set_bit(SDATA_STATE_RUNNING, &sdata->state);
-
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_P2P_DEVICE:
 		rcu_assign_pointer(local->p2p_sdata, sdata);
@@ -1472,6 +1470,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 		spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
 	}
 
+	set_bit(SDATA_STATE_RUNNING, &sdata->state);
+
 	return 0;
  err_del_interface:
 	drv_remove_interface(local, sdata);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 27c964be102e..a364148149f9 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -2319,6 +2319,10 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 	u16 len_rthdr;
 	int hdrlen;
 
+	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	if (unlikely(!ieee80211_sdata_running(sdata)))
+		goto fail;
+
 	memset(info, 0, sizeof(*info));
 	info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
 		      IEEE80211_TX_CTL_INJECTED;
@@ -2378,8 +2382,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
 	 * This is necessary, for example, for old hostapd versions that
 	 * don't use nl80211-based management TX/RX.
 	 */
-	sdata = IEEE80211_DEV_TO_SUB_IF(dev);
-
 	list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) {
 		if (!ieee80211_sdata_running(tmp_sdata))
 			continue;
@@ -4169,7 +4171,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb,
 	struct sk_buff *next;
 	int len = skb->len;
 
-	if (unlikely(skb->len < ETH_HLEN)) {
+	if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) {
 		kfree_skb(skb);
 		return;
 	}
@@ -4566,7 +4568,7 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb,
 	struct ieee80211_key *key;
 	struct sta_info *sta;
 
-	if (unlikely(skb->len < ETH_HLEN)) {
+	if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) {
 		kfree_skb(skb);
 		return NETDEV_TX_OK;
 	}
-- 
cgit v1.2.3


From d9e249704084982ac7581a560ffa284e11621d43 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 7 Oct 2022 14:56:11 +0200
Subject: wifi: cfg80211: fix ieee80211_data_to_8023_exthdr handling of small
 packets

STP topology change notification packets only have a payload of 7 bytes,
so they get dropped due to the skb->len < hdrlen + 8 check.
Fix this by removing the extra 8 from the skb->len check and checking the
return code on the skb_copy_bits calls.

Fixes: 2d1c304cb2d5 ("cfg80211: add function for 802.3 conversion with separate output buffer")
Reported-by: Chad Monroe <chad.monroe@smartrg.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/util.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/net/wireless/util.c b/net/wireless/util.c
index 01493568a21d..1f285b515028 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -559,7 +559,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
 		return -1;
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset;
-	if (skb->len < hdrlen + 8)
+	if (skb->len < hdrlen)
 		return -1;
 
 	/* convert IEEE 802.11 header + possible LLC headers into Ethernet
@@ -574,8 +574,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
 	memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN);
 	memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN);
 
-	if (iftype == NL80211_IFTYPE_MESH_POINT)
-		skb_copy_bits(skb, hdrlen, &mesh_flags, 1);
+	if (iftype == NL80211_IFTYPE_MESH_POINT &&
+	    skb_copy_bits(skb, hdrlen, &mesh_flags, 1) < 0)
+		return -1;
 
 	mesh_flags &= MESH_FLAGS_AE;
 
@@ -595,11 +596,12 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
 		if (iftype == NL80211_IFTYPE_MESH_POINT) {
 			if (mesh_flags == MESH_FLAGS_AE_A4)
 				return -1;
-			if (mesh_flags == MESH_FLAGS_AE_A5_A6) {
-				skb_copy_bits(skb, hdrlen +
-					offsetof(struct ieee80211s_hdr, eaddr1),
-					tmp.h_dest, 2 * ETH_ALEN);
-			}
+			if (mesh_flags == MESH_FLAGS_AE_A5_A6 &&
+			    skb_copy_bits(skb, hdrlen +
+					  offsetof(struct ieee80211s_hdr, eaddr1),
+					  tmp.h_dest, 2 * ETH_ALEN) < 0)
+				return -1;
+
 			hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags);
 		}
 		break;
@@ -613,10 +615,11 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
 		if (iftype == NL80211_IFTYPE_MESH_POINT) {
 			if (mesh_flags == MESH_FLAGS_AE_A5_A6)
 				return -1;
-			if (mesh_flags == MESH_FLAGS_AE_A4)
-				skb_copy_bits(skb, hdrlen +
-					offsetof(struct ieee80211s_hdr, eaddr1),
-					tmp.h_source, ETH_ALEN);
+			if (mesh_flags == MESH_FLAGS_AE_A4 &&
+			    skb_copy_bits(skb, hdrlen +
+					  offsetof(struct ieee80211s_hdr, eaddr1),
+					  tmp.h_source, ETH_ALEN) < 0)
+				return -1;
 			hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags);
 		}
 		break;
@@ -628,16 +631,15 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
 		break;
 	}
 
-	skb_copy_bits(skb, hdrlen, &payload, sizeof(payload));
-	tmp.h_proto = payload.proto;
-
-	if (likely((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) &&
-		    tmp.h_proto != htons(ETH_P_AARP) &&
-		    tmp.h_proto != htons(ETH_P_IPX)) ||
-		   ether_addr_equal(payload.hdr, bridge_tunnel_header))) {
+	if (likely(skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 &&
+	           ((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) &&
+		     payload.proto != htons(ETH_P_AARP) &&
+		     payload.proto != htons(ETH_P_IPX)) ||
+		    ether_addr_equal(payload.hdr, bridge_tunnel_header)))) {
 		/* remove RFC1042 or Bridge-Tunnel encapsulation and
 		 * replace EtherType */
 		hdrlen += ETH_ALEN + 2;
+		tmp.h_proto = payload.proto;
 		skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2);
 	} else {
 		tmp.h_proto = htons(skb->len - hdrlen);
-- 
cgit v1.2.3


From e3e6e1d16a4cf7b63159ec71774e822194071954 Mon Sep 17 00:00:00 2001
From: Hawkins Jiawei <yin31149@gmail.com>
Date: Tue, 27 Sep 2022 07:34:59 +0800
Subject: wifi: wext: use flex array destination for memcpy()

Syzkaller reports buffer overflow false positive as follows:
------------[ cut here ]------------
memcpy: detected field-spanning write (size 8) of single field
	"&compat_event->pointer" at net/wireless/wext-core.c:623 (size 4)
WARNING: CPU: 0 PID: 3607 at net/wireless/wext-core.c:623
	wireless_send_event+0xab5/0xca0 net/wireless/wext-core.c:623
Modules linked in:
CPU: 1 PID: 3607 Comm: syz-executor659 Not tainted
	6.0.0-rc6-next-20220921-syzkaller #0
[...]
Call Trace:
 <TASK>
 ioctl_standard_call+0x155/0x1f0 net/wireless/wext-core.c:1022
 wireless_process_ioctl+0xc8/0x4c0 net/wireless/wext-core.c:955
 wext_ioctl_dispatch net/wireless/wext-core.c:988 [inline]
 wext_ioctl_dispatch net/wireless/wext-core.c:976 [inline]
 wext_handle_ioctl+0x26b/0x280 net/wireless/wext-core.c:1049
 sock_ioctl+0x285/0x640 net/socket.c:1220
 vfs_ioctl fs/ioctl.c:51 [inline]
 __do_sys_ioctl fs/ioctl.c:870 [inline]
 __se_sys_ioctl fs/ioctl.c:856 [inline]
 __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
 [...]
 </TASK>

Wireless events will be sent on the appropriate channels in
wireless_send_event(). Different wireless events may have different
payload structure and size, so kernel uses **len** and **cmd** field
in struct __compat_iw_event as wireless event common LCP part, uses
**pointer** as a label to mark the position of remaining different part.

Yet the problem is that, **pointer** is a compat_caddr_t type, which may
be smaller than the relative structure at the same position. So during
wireless_send_event() tries to parse the wireless events payload, it may
trigger the memcpy() run-time destination buffer bounds checking when the
relative structure's data is copied to the position marked by **pointer**.

This patch solves it by introducing flexible-array field **ptr_bytes**,
to mark the position of the wireless events remaining part next to
LCP part. What's more, this patch also adds **ptr_len** variable in
wireless_send_event() to improve its maintainability.

Reported-and-tested-by: syzbot+473754e5af963cf014cf@syzkaller.appspotmail.com
Link: https://lore.kernel.org/all/00000000000070db2005e95a5984@google.com/
Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Hawkins Jiawei <yin31149@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/wireless.h | 10 +++++++++-
 net/wireless/wext-core.c | 17 ++++++++++-------
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/linux/wireless.h b/include/linux/wireless.h
index 2d1b54556eff..e6e34d74dda0 100644
--- a/include/linux/wireless.h
+++ b/include/linux/wireless.h
@@ -26,7 +26,15 @@ struct compat_iw_point {
 struct __compat_iw_event {
 	__u16		len;			/* Real length of this stuff */
 	__u16		cmd;			/* Wireless IOCTL */
-	compat_caddr_t	pointer;
+
+	union {
+		compat_caddr_t	pointer;
+
+		/* we need ptr_bytes to make memcpy() run-time destination
+		 * buffer bounds checking happy, nothing special
+		 */
+		DECLARE_FLEX_ARRAY(__u8, ptr_bytes);
+	};
 };
 #define IW_EV_COMPAT_LCP_LEN offsetof(struct __compat_iw_event, pointer)
 #define IW_EV_COMPAT_POINT_OFF offsetof(struct compat_iw_point, length)
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 76a80a41615b..fe8765c4075d 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -468,6 +468,7 @@ void wireless_send_event(struct net_device *	dev,
 	struct __compat_iw_event *compat_event;
 	struct compat_iw_point compat_wrqu;
 	struct sk_buff *compskb;
+	int ptr_len;
 #endif
 
 	/*
@@ -582,6 +583,9 @@ void wireless_send_event(struct net_device *	dev,
 	nlmsg_end(skb, nlh);
 #ifdef CONFIG_COMPAT
 	hdr_len = compat_event_type_size[descr->header_type];
+
+	/* ptr_len is remaining size in event header apart from LCP */
+	ptr_len = hdr_len - IW_EV_COMPAT_LCP_LEN;
 	event_len = hdr_len + extra_len;
 
 	compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
@@ -612,16 +616,15 @@ void wireless_send_event(struct net_device *	dev,
 	if (descr->header_type == IW_HEADER_TYPE_POINT) {
 		compat_wrqu.length = wrqu->data.length;
 		compat_wrqu.flags = wrqu->data.flags;
-		memcpy(&compat_event->pointer,
-			((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF,
-			hdr_len - IW_EV_COMPAT_LCP_LEN);
+		memcpy(compat_event->ptr_bytes,
+		       ((char *)&compat_wrqu) + IW_EV_COMPAT_POINT_OFF,
+			ptr_len);
 		if (extra_len)
-			memcpy(((char *) compat_event) + hdr_len,
-				extra, extra_len);
+			memcpy(&compat_event->ptr_bytes[ptr_len],
+			       extra, extra_len);
 	} else {
 		/* extra_len must be zero, so no if (extra) needed */
-		memcpy(&compat_event->pointer, wrqu,
-			hdr_len - IW_EV_COMPAT_LCP_LEN);
+		memcpy(compat_event->ptr_bytes, wrqu, ptr_len);
 	}
 
 	nlmsg_end(compskb, nlh);
-- 
cgit v1.2.3


From 10d5ea5a436da8d60cdb5845f454d595accdbce0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 26 Sep 2022 19:29:23 -0700
Subject: wifi: nl80211: Split memcpy() of struct nl80211_wowlan_tcp_data_token
 flexible array

To work around a misbehavior of the compiler's ability to see into
composite flexible array structs (as detailed in the coming memcpy()
hardening series[1]), split the memcpy() of the header and the payload
so no false positive run-time overflow warning will be generated.

[1] https://lore.kernel.org/linux-hardening/20220901065914.1417829-2-keescook@chromium.org/

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8ff8b1c040f0..597c52236514 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -13265,7 +13265,9 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev,
 	       wake_mask_size);
 	if (tok) {
 		cfg->tokens_size = tokens_size;
-		memcpy(&cfg->payload_tok, tok, sizeof(*tok) + tokens_size);
+		cfg->payload_tok = *tok;
+		memcpy(cfg->payload_tok.token_stream, tok->token_stream,
+		       tokens_size);
 	}
 
 	trig->tcp = cfg;
-- 
cgit v1.2.3