From 47661ee1821fc3a6b5ae07aac37410d6ccada976 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 18 Jun 2021 13:18:46 +0200
Subject: memory: tegra: Add compile-test stub for tegra_mc_probe_device()

The tegra_mc_probe_device() symbol is only available when the TEGRA_MC
Kconfig option is enabled. Provide a stub if that's not the case so that
the driver can be compile-tested.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
Link: https://lore.kernel.org/r/20210618111846.1286166-1-thierry.reding@gmail.com
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
---
 include/soc/tegra/mc.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h
index e19c2504a14b..1066b1194a5a 100644
--- a/include/soc/tegra/mc.h
+++ b/include/soc/tegra/mc.h
@@ -237,14 +237,19 @@ unsigned int tegra_mc_get_emem_device_count(struct tegra_mc *mc);
 
 #ifdef CONFIG_TEGRA_MC
 struct tegra_mc *devm_tegra_memory_controller_get(struct device *dev);
+int tegra_mc_probe_device(struct tegra_mc *mc, struct device *dev);
 #else
 static inline struct tegra_mc *
 devm_tegra_memory_controller_get(struct device *dev)
 {
 	return ERR_PTR(-ENODEV);
 }
-#endif
 
-int tegra_mc_probe_device(struct tegra_mc *mc, struct device *dev);
+static inline int
+tegra_mc_probe_device(struct tegra_mc *mc, struct device *dev)
+{
+	return -ENODEV;
+}
+#endif
 
 #endif /* __SOC_TEGRA_MC_H__ */
-- 
cgit v1.2.3


From 6549c46af8551b346bcc0b9043f93848319acd5c Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Sun, 27 Jun 2021 16:04:18 +0800
Subject: regulator: rt5033: Fix n_voltages settings for BUCK and LDO

For linear regulators, the n_voltages should be (max - min) / step + 1.

Buck voltage from 1v to 3V, per step 100mV, and vout mask is 0x1f.
If value is from 20 to 31, the voltage will all be fixed to 3V.
And LDO also, just vout range is different from 1.2v to 3v, step is the
same. If value is from 18 to 31, the voltage will also be fixed to 3v.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Reviewed-by: ChiYuan Huang <cy_huang@richtek.com>
Link: https://lore.kernel.org/r/20210627080418.1718127-1-axel.lin@ingics.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/mfd/rt5033-private.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h
index 2d1895c3efbf..40a0c2dfb80f 100644
--- a/include/linux/mfd/rt5033-private.h
+++ b/include/linux/mfd/rt5033-private.h
@@ -200,13 +200,13 @@ enum rt5033_reg {
 #define RT5033_REGULATOR_BUCK_VOLTAGE_MIN		1000000U
 #define RT5033_REGULATOR_BUCK_VOLTAGE_MAX		3000000U
 #define RT5033_REGULATOR_BUCK_VOLTAGE_STEP		100000U
-#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM		32
+#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM		21
 
 /* RT5033 regulator LDO output voltage uV */
 #define RT5033_REGULATOR_LDO_VOLTAGE_MIN		1200000U
 #define RT5033_REGULATOR_LDO_VOLTAGE_MAX		3000000U
 #define RT5033_REGULATOR_LDO_VOLTAGE_STEP		100000U
-#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM		32
+#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM		19
 
 /* RT5033 regulator SAFE LDO output voltage uV */
 #define RT5033_REGULATOR_SAFE_LDO_VOLTAGE		4900000U
-- 
cgit v1.2.3


From 0dbffbb5335a1e3aa6855e4ee317e25e669dd302 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 29 Jun 2021 07:12:45 -0700
Subject: net: annotate data race around sk_ll_usec

sk_ll_usec is read locklessly from sk_can_busy_loop()
while another thread can change its value in sock_setsockopt()

This is correct but needs annotations.

BUG: KCSAN: data-race in __skb_try_recv_datagram / sock_setsockopt

write to 0xffff88814eb5f904 of 4 bytes by task 14011 on cpu 0:
 sock_setsockopt+0x1287/0x2090 net/core/sock.c:1175
 __sys_setsockopt+0x14f/0x200 net/socket.c:2100
 __do_sys_setsockopt net/socket.c:2115 [inline]
 __se_sys_setsockopt net/socket.c:2112 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2112
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae

read to 0xffff88814eb5f904 of 4 bytes by task 14001 on cpu 1:
 sk_can_busy_loop include/net/busy_poll.h:41 [inline]
 __skb_try_recv_datagram+0x14f/0x320 net/core/datagram.c:273
 unix_dgram_recvmsg+0x14c/0x870 net/unix/af_unix.c:2101
 unix_seqpacket_recvmsg+0x5a/0x70 net/unix/af_unix.c:2067
 ____sys_recvmsg+0x15d/0x310 include/linux/uio.h:244
 ___sys_recvmsg net/socket.c:2598 [inline]
 do_recvmmsg+0x35c/0x9f0 net/socket.c:2692
 __sys_recvmmsg net/socket.c:2771 [inline]
 __do_sys_recvmmsg net/socket.c:2794 [inline]
 __se_sys_recvmmsg net/socket.c:2787 [inline]
 __x64_sys_recvmmsg+0xcf/0x150 net/socket.c:2787
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae

value changed: 0x00000000 -> 0x00000101

Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 14001 Comm: syz-executor.3 Not tainted 5.13.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/busy_poll.h | 2 +-
 net/core/sock.c         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 73af4a64a599..40296ed976a9 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -38,7 +38,7 @@ static inline bool net_busy_loop_on(void)
 
 static inline bool sk_can_busy_loop(const struct sock *sk)
 {
-	return sk->sk_ll_usec && !signal_pending(current);
+	return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current);
 }
 
 bool sk_busy_loop_end(void *p, unsigned long start_time);
diff --git a/net/core/sock.c b/net/core/sock.c
index ba1c0f75cd45..dd9599656c40 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1201,7 +1201,7 @@ set_sndbuf:
 			if (val < 0)
 				ret = -EINVAL;
 			else
-				sk->sk_ll_usec = val;
+				WRITE_ONCE(sk->sk_ll_usec, val);
 		}
 		break;
 	case SO_PREFER_BUSY_POLL:
-- 
cgit v1.2.3


From 1d11fa231cabeae09a95cb3e4cf1d9dd34e00f08 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 29 Jun 2021 23:34:08 -0400
Subject: sctp: move 198 addresses from unusable to private scope
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The doc draft-stewart-tsvwg-sctp-ipv4-00 that restricts 198 addresses
was never published. These addresses as private addresses should be
allowed to use in SCTP.

As Michael Tuexen suggested, this patch is to move 198 addresses from
unusable to private scope.

Reported-by: Sérgio <surkamp@gmail.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/constants.h | 4 +---
 net/sctp/protocol.c          | 3 ++-
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 265fffa33dad..5859e0a16a58 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -360,8 +360,7 @@ enum {
 #define SCTP_SCOPE_POLICY_MAX	SCTP_SCOPE_POLICY_LINK
 
 /* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>,
- * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24,
- * 192.88.99.0/24.
+ * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24.
  * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP
  * addresses.
  */
@@ -369,7 +368,6 @@ enum {
 	((htonl(INADDR_BROADCAST) == a) ||  \
 	 ipv4_is_multicast(a) ||	    \
 	 ipv4_is_zeronet(a) ||		    \
-	 ipv4_is_test_198(a) ||		    \
 	 ipv4_is_anycast_6to4(a))
 
 /* Flags used for the bind address copy functions.  */
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 3c1fbf38f4f7..ec0f52567c16 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -398,7 +398,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr)
 		retval = SCTP_SCOPE_LINK;
 	} else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) ||
 		   ipv4_is_private_172(addr->v4.sin_addr.s_addr) ||
-		   ipv4_is_private_192(addr->v4.sin_addr.s_addr)) {
+		   ipv4_is_private_192(addr->v4.sin_addr.s_addr) ||
+		   ipv4_is_test_198(addr->v4.sin_addr.s_addr)) {
 		retval = SCTP_SCOPE_PRIVATE;
 	} else {
 		retval = SCTP_SCOPE_GLOBAL;
-- 
cgit v1.2.3


From 5d43f951b1ac797450bb4d230fdc960b739bea04 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:52 +0800
Subject: ptp: add ptp virtual clock driver framework

This patch is to add ptp virtual clock driver framework
utilizing timecounter/cyclecounter.

The patch just exports two essential APIs for PTP driver.

- ptp_vclock_register()
- ptp_vclock_unregister()

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/Makefile             |   2 +-
 drivers/ptp/ptp_private.h        |  15 ++++
 drivers/ptp/ptp_vclock.c         | 150 +++++++++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h |   4 +-
 4 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 drivers/ptp/ptp_vclock.c

(limited to 'include')

diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile
index 8673d1743faa..28a6fe342d3e 100644
--- a/drivers/ptp/Makefile
+++ b/drivers/ptp/Makefile
@@ -3,7 +3,7 @@
 # Makefile for PTP 1588 clock support.
 #
 
-ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o
+ptp-y					:= ptp_clock.o ptp_chardev.o ptp_sysfs.o ptp_vclock.o
 ptp_kvm-$(CONFIG_X86)			:= ptp_kvm_x86.o ptp_kvm_common.o
 ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC)	:= ptp_kvm_arm.o ptp_kvm_common.o
 obj-$(CONFIG_PTP_1588_CLOCK)		+= ptp.o
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index 6b97155148f1..853b79b6b30e 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -48,6 +48,19 @@ struct ptp_clock {
 	struct kthread_delayed_work aux_work;
 };
 
+#define info_to_vclock(d) container_of((d), struct ptp_vclock, info)
+#define cc_to_vclock(d) container_of((d), struct ptp_vclock, cc)
+#define dw_to_vclock(d) container_of((d), struct ptp_vclock, refresh_work)
+
+struct ptp_vclock {
+	struct ptp_clock *pclock;
+	struct ptp_clock_info info;
+	struct ptp_clock *clock;
+	struct cyclecounter cc;
+	struct timecounter tc;
+	spinlock_t lock;	/* protects tc/cc */
+};
+
 /*
  * The function queue_cnt() is safe for readers to call without
  * holding q->lock. Readers use this function to verify that the queue
@@ -89,4 +102,6 @@ extern const struct attribute_group *ptp_groups[];
 int ptp_populate_pin_groups(struct ptp_clock *ptp);
 void ptp_cleanup_pin_groups(struct ptp_clock *ptp);
 
+struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock);
+void ptp_vclock_unregister(struct ptp_vclock *vclock);
 #endif
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
new file mode 100644
index 000000000000..fc9205cc504d
--- /dev/null
+++ b/drivers/ptp/ptp_vclock.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PTP virtual clock driver
+ *
+ * Copyright 2021 NXP
+ */
+#include <linux/slab.h>
+#include "ptp_private.h"
+
+#define PTP_VCLOCK_CC_SHIFT		31
+#define PTP_VCLOCK_CC_MULT		(1 << PTP_VCLOCK_CC_SHIFT)
+#define PTP_VCLOCK_FADJ_SHIFT		9
+#define PTP_VCLOCK_FADJ_DENOMINATOR	15625ULL
+#define PTP_VCLOCK_REFRESH_INTERVAL	(HZ * 2)
+
+static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	unsigned long flags;
+	s64 adj;
+
+	adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT;
+	adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR);
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	timecounter_read(&vclock->tc);
+	vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj;
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	return 0;
+}
+
+static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	timecounter_adjtime(&vclock->tc, delta);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	return 0;
+}
+
+static int ptp_vclock_gettime(struct ptp_clock_info *ptp,
+			      struct timespec64 *ts)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	unsigned long flags;
+	u64 ns;
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	ns = timecounter_read(&vclock->tc);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+	*ts = ns_to_timespec64(ns);
+
+	return 0;
+}
+
+static int ptp_vclock_settime(struct ptp_clock_info *ptp,
+			      const struct timespec64 *ts)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	u64 ns = timespec64_to_ns(ts);
+	unsigned long flags;
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	timecounter_init(&vclock->tc, &vclock->cc, ns);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	return 0;
+}
+
+static long ptp_vclock_refresh(struct ptp_clock_info *ptp)
+{
+	struct ptp_vclock *vclock = info_to_vclock(ptp);
+	struct timespec64 ts;
+
+	ptp_vclock_gettime(&vclock->info, &ts);
+
+	return PTP_VCLOCK_REFRESH_INTERVAL;
+}
+
+static const struct ptp_clock_info ptp_vclock_info = {
+	.owner		= THIS_MODULE,
+	.name		= "ptp virtual clock",
+	/* The maximum ppb value that long scaled_ppm can support */
+	.max_adj	= 32767999,
+	.adjfine	= ptp_vclock_adjfine,
+	.adjtime	= ptp_vclock_adjtime,
+	.gettime64	= ptp_vclock_gettime,
+	.settime64	= ptp_vclock_settime,
+	.do_aux_work	= ptp_vclock_refresh,
+};
+
+static u64 ptp_vclock_read(const struct cyclecounter *cc)
+{
+	struct ptp_vclock *vclock = cc_to_vclock(cc);
+	struct ptp_clock *ptp = vclock->pclock;
+	struct timespec64 ts = {};
+
+	if (ptp->info->gettimex64)
+		ptp->info->gettimex64(ptp->info, &ts, NULL);
+	else
+		ptp->info->gettime64(ptp->info, &ts);
+
+	return timespec64_to_ns(&ts);
+}
+
+static const struct cyclecounter ptp_vclock_cc = {
+	.read	= ptp_vclock_read,
+	.mask	= CYCLECOUNTER_MASK(32),
+	.mult	= PTP_VCLOCK_CC_MULT,
+	.shift	= PTP_VCLOCK_CC_SHIFT,
+};
+
+struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock)
+{
+	struct ptp_vclock *vclock;
+
+	vclock = kzalloc(sizeof(*vclock), GFP_KERNEL);
+	if (!vclock)
+		return NULL;
+
+	vclock->pclock = pclock;
+	vclock->info = ptp_vclock_info;
+	vclock->cc = ptp_vclock_cc;
+
+	snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt",
+		 pclock->index);
+
+	spin_lock_init(&vclock->lock);
+
+	vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev);
+	if (IS_ERR_OR_NULL(vclock->clock)) {
+		kfree(vclock);
+		return NULL;
+	}
+
+	timecounter_init(&vclock->tc, &vclock->cc, 0);
+	ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL);
+
+	return vclock;
+}
+
+void ptp_vclock_unregister(struct ptp_vclock *vclock)
+{
+	ptp_clock_unregister(vclock->clock);
+	kfree(vclock);
+}
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index aba237c0b3a2..b6fb771ee524 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -11,7 +11,9 @@
 #include <linux/device.h>
 #include <linux/pps_kernel.h>
 #include <linux/ptp_clock.h>
+#include <linux/timecounter.h>
 
+#define PTP_CLOCK_NAME_LEN	32
 /**
  * struct ptp_clock_request - request PTP clock event
  *
@@ -134,7 +136,7 @@ struct ptp_system_timestamp {
 
 struct ptp_clock_info {
 	struct module *owner;
-	char name[16];
+	char name[PTP_CLOCK_NAME_LEN];
 	s32 max_adj;
 	int n_alarm;
 	int n_ext_ts;
-- 
cgit v1.2.3


From acb288e8047b7569fbc9af6fa6e9405315345103 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:55 +0800
Subject: ptp: add kernel API ptp_get_vclocks_index()

Add kernel API ptp_get_vclocks_index() to get all ptp
vclocks index on pclock.

This is preparation for supporting ptp vclocks info query
through ethtool.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_clock.c          |  3 ++-
 drivers/ptp/ptp_private.h        |  2 ++
 drivers/ptp/ptp_vclock.c         | 35 +++++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h | 14 ++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index 9205a9362a9d..f012fa581cf4 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -24,10 +24,11 @@
 #define PTP_PPS_EVENT PPS_CAPTUREASSERT
 #define PTP_PPS_MODE (PTP_PPS_DEFAULTS | PPS_CANWAIT | PPS_TSFMT_TSPEC)
 
+struct class *ptp_class;
+
 /* private globals */
 
 static dev_t ptp_devt;
-static struct class *ptp_class;
 
 static DEFINE_IDA(ptp_clocks_map);
 
diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h
index f75fadd9b244..dba6be477067 100644
--- a/drivers/ptp/ptp_private.h
+++ b/drivers/ptp/ptp_private.h
@@ -96,6 +96,8 @@ static inline bool ptp_vclock_in_use(struct ptp_clock *ptp)
 	return in_use;
 }
 
+extern struct class *ptp_class;
+
 /*
  * see ptp_chardev.c
  */
diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index fc9205cc504d..cefab29a0592 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -148,3 +148,38 @@ void ptp_vclock_unregister(struct ptp_vclock *vclock)
 	ptp_clock_unregister(vclock->clock);
 	kfree(vclock);
 }
+
+int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
+{
+	char name[PTP_CLOCK_NAME_LEN] = "";
+	struct ptp_clock *ptp;
+	struct device *dev;
+	int num = 0;
+
+	if (pclock_index < 0)
+		return num;
+
+	snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index);
+	dev = class_find_device_by_name(ptp_class, name);
+	if (!dev)
+		return num;
+
+	ptp = dev_get_drvdata(dev);
+
+	if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) {
+		put_device(dev);
+		return num;
+	}
+
+	*vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL);
+	if (!(*vclock_index))
+		goto out;
+
+	memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks);
+	num = ptp->n_vclocks;
+out:
+	mutex_unlock(&ptp->n_vclocks_mux);
+	put_device(dev);
+	return num;
+}
+EXPORT_SYMBOL(ptp_get_vclocks_index);
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index b6fb771ee524..300a984fec87 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -306,6 +306,18 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay);
  */
 void ptp_cancel_worker_sync(struct ptp_clock *ptp);
 
+/**
+ * ptp_get_vclocks_index() - get all vclocks index on pclock, and
+ *                           caller is responsible to free memory
+ *                           of vclock_index
+ *
+ * @pclock_index: phc index of ptp pclock.
+ * @vclock_index: pointer to pointer of vclock index.
+ *
+ * return number of vclocks.
+ */
+int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
+
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 						   struct device *parent)
@@ -325,6 +337,8 @@ static inline int ptp_schedule_worker(struct ptp_clock *ptp,
 { return -EOPNOTSUPP; }
 static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
 { }
+static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
+{ return 0; }
 
 #endif
 
-- 
cgit v1.2.3


From c156174a67070042d51d2c866146d3c934d5468c Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:56 +0800
Subject: ethtool: add a new command for getting PHC virtual clocks

Add an interface for getting PHC (PTP Hardware Clock)
virtual clocks, which are based on PHC physical clock
providing hardware timestamp to network packets.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst | 22 +++++++
 include/linux/ethtool.h                      | 10 +++
 include/uapi/linux/ethtool_netlink.h         | 15 +++++
 net/ethtool/Makefile                         |  2 +-
 net/ethtool/common.c                         | 13 ++++
 net/ethtool/netlink.c                        | 10 +++
 net/ethtool/netlink.h                        |  2 +
 net/ethtool/phc_vclocks.c                    | 94 ++++++++++++++++++++++++++++
 8 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 net/ethtool/phc_vclocks.c

(limited to 'include')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 6ea91e41593f..c86628e6a235 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -212,6 +212,7 @@ Userspace to kernel:
   ``ETHTOOL_MSG_FEC_SET``               set FEC settings
   ``ETHTOOL_MSG_MODULE_EEPROM_GET``     read SFP module EEPROM
   ``ETHTOOL_MSG_STATS_GET``             get standard statistics
+  ``ETHTOOL_MSG_PHC_VCLOCKS_GET``       get PHC virtual clocks info
   ===================================== ================================
 
 Kernel to userspace:
@@ -250,6 +251,7 @@ Kernel to userspace:
   ``ETHTOOL_MSG_FEC_NTF``                  FEC settings
   ``ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY``  read SFP module EEPROM
   ``ETHTOOL_MSG_STATS_GET_REPLY``          standard statistics
+  ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY``    PHC virtual clocks info
   ======================================== =================================
 
 ``GET`` requests are sent by userspace applications to retrieve device
@@ -1477,6 +1479,25 @@ Low and high bounds are inclusive, for example:
  etherStatsPkts512to1023Octets 512  1023
  ============================= ==== ====
 
+PHC_VCLOCKS_GET
+===============
+
+Query device PHC virtual clocks information.
+
+Request contents:
+
+  ====================================  ======  ==========================
+  ``ETHTOOL_A_PHC_VCLOCKS_HEADER``      nested  request header
+  ====================================  ======  ==========================
+
+Kernel response contents:
+
+  ====================================  ======  ==========================
+  ``ETHTOOL_A_PHC_VCLOCKS_HEADER``      nested  reply header
+  ``ETHTOOL_A_PHC_VCLOCKS_NUM``         u32     PHC virtual clocks number
+  ``ETHTOOL_A_PHC_VCLOCKS_INDEX``       s32     PHC index array
+  ====================================  ======  ==========================
+
 Request translation
 ===================
 
@@ -1575,4 +1596,5 @@ are netlink only.
   n/a                                 ``ETHTOOL_MSG_CABLE_TEST_ACT``
   n/a                                 ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT``
   n/a                                 ``ETHTOOL_MSG_TUNNEL_INFO_GET``
+  n/a                                 ``ETHTOOL_MSG_PHC_VCLOCKS_GET``
   =================================== =====================================
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 29dbb603bc91..232daaec56e4 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -757,6 +757,16 @@ void
 ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
 			      enum ethtool_link_mode_bit_indices link_mode);
 
+/**
+ * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller
+ *                           is responsible to free memory of vclock_index
+ * @dev: pointer to net_device structure
+ * @vclock_index: pointer to pointer of vclock index
+ *
+ * Return number of phc vclocks
+ */
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index);
+
 /**
  * ethtool_sprintf - Write formatted string to ethtool string data
  * @data: Pointer to start of string to update
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index c7135c9c37a5..b3b93710eff7 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -46,6 +46,7 @@ enum {
 	ETHTOOL_MSG_FEC_SET,
 	ETHTOOL_MSG_MODULE_EEPROM_GET,
 	ETHTOOL_MSG_STATS_GET,
+	ETHTOOL_MSG_PHC_VCLOCKS_GET,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_USER_CNT,
@@ -88,6 +89,7 @@ enum {
 	ETHTOOL_MSG_FEC_NTF,
 	ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
 	ETHTOOL_MSG_STATS_GET_REPLY,
+	ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
 
 	/* add new constants above here */
 	__ETHTOOL_MSG_KERNEL_CNT,
@@ -440,6 +442,19 @@ enum {
 	ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1)
 };
 
+/* PHC VCLOCKS */
+
+enum {
+	ETHTOOL_A_PHC_VCLOCKS_UNSPEC,
+	ETHTOOL_A_PHC_VCLOCKS_HEADER,			/* nest - _A_HEADER_* */
+	ETHTOOL_A_PHC_VCLOCKS_NUM,			/* u32 */
+	ETHTOOL_A_PHC_VCLOCKS_INDEX,			/* array, s32 */
+
+	/* add new constants above here */
+	__ETHTOOL_A_PHC_VCLOCKS_CNT,
+	ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1)
+};
+
 /* CABLE TEST */
 
 enum {
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 723c9a8a8cdf..0a19470efbfb 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK)	+= ethtool_nl.o
 ethtool_nl-y	:= netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
 		   linkstate.o debug.o wol.o features.o privflags.o rings.o \
 		   channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
-		   tunnels.o fec.o eeprom.o stats.o
+		   tunnels.o fec.o eeprom.o stats.o phc_vclocks.o
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index f9dcbad84788..798231b07676 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -4,6 +4,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/phy.h>
 #include <linux/rtnetlink.h>
+#include <linux/ptp_clock_kernel.h>
 
 #include "common.h"
 
@@ -554,6 +555,18 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
 	return 0;
 }
 
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index)
+{
+	struct ethtool_ts_info info = { };
+	int num = 0;
+
+	if (!__ethtool_get_ts_info(dev, &info))
+		num = ptp_get_vclocks_index(info.phc_index, vclock_index);
+
+	return num;
+}
+EXPORT_SYMBOL(ethtool_get_phc_vclocks);
+
 const struct ethtool_phy_ops *ethtool_phy_ops;
 
 void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index a7346346114f..73e0f5b626bf 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -248,6 +248,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
 	[ETHTOOL_MSG_TSINFO_GET]	= &ethnl_tsinfo_request_ops,
 	[ETHTOOL_MSG_MODULE_EEPROM_GET]	= &ethnl_module_eeprom_request_ops,
 	[ETHTOOL_MSG_STATS_GET]		= &ethnl_stats_request_ops,
+	[ETHTOOL_MSG_PHC_VCLOCKS_GET]	= &ethnl_phc_vclocks_request_ops,
 };
 
 static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -958,6 +959,15 @@ static const struct genl_ops ethtool_genl_ops[] = {
 		.policy = ethnl_stats_get_policy,
 		.maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
 	},
+	{
+		.cmd	= ETHTOOL_MSG_PHC_VCLOCKS_GET,
+		.doit	= ethnl_default_doit,
+		.start	= ethnl_default_start,
+		.dumpit	= ethnl_default_dumpit,
+		.done	= ethnl_default_done,
+		.policy = ethnl_phc_vclocks_get_policy,
+		.maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1,
+	},
 };
 
 static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 3e25a47fd482..3fc395c86702 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -347,6 +347,7 @@ extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
 extern const struct ethnl_request_ops ethnl_fec_request_ops;
 extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
 extern const struct ethnl_request_ops ethnl_stats_request_ops;
+extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -382,6 +383,7 @@ extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
 extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
 extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1];
 extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1];
+extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1];
 
 int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
 int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c
new file mode 100644
index 000000000000..637b2f5297d5
--- /dev/null
+++ b/net/ethtool/phc_vclocks.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 NXP
+ */
+#include "netlink.h"
+#include "common.h"
+
+struct phc_vclocks_req_info {
+	struct ethnl_req_info		base;
+};
+
+struct phc_vclocks_reply_data {
+	struct ethnl_reply_data		base;
+	int				num;
+	int				*index;
+};
+
+#define PHC_VCLOCKS_REPDATA(__reply_base) \
+	container_of(__reply_base, struct phc_vclocks_reply_data, base)
+
+const struct nla_policy ethnl_phc_vclocks_get_policy[] = {
+	[ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base,
+				    struct ethnl_reply_data *reply_base,
+				    struct genl_info *info)
+{
+	struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	int ret;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+	data->num = ethtool_get_phc_vclocks(dev, &data->index);
+	ethnl_ops_complete(dev);
+
+	return ret;
+}
+
+static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base,
+				  const struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+	int len = 0;
+
+	if (data->num > 0) {
+		len += nla_total_size(sizeof(u32));
+		len += nla_total_size(sizeof(s32) * data->num);
+	}
+
+	return len;
+}
+
+static int phc_vclocks_fill_reply(struct sk_buff *skb,
+				  const struct ethnl_req_info *req_base,
+				  const struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+
+	if (data->num <= 0)
+		return 0;
+
+	if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) ||
+	    nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX,
+		    sizeof(s32) * data->num, data->index))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+
+	kfree(data->index);
+}
+
+const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_PHC_VCLOCKS_GET,
+	.reply_cmd		= ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_PHC_VCLOCKS_HEADER,
+	.req_info_size		= sizeof(struct phc_vclocks_req_info),
+	.reply_data_size	= sizeof(struct phc_vclocks_reply_data),
+
+	.prepare_data		= phc_vclocks_prepare_data,
+	.reply_size		= phc_vclocks_reply_size,
+	.fill_reply		= phc_vclocks_fill_reply,
+	.cleanup_data		= phc_vclocks_cleanup_data,
+};
-- 
cgit v1.2.3


From 895487a3a10fb3a177e20dcde875515d46ccd4df Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:57 +0800
Subject: ptp: add kernel API ptp_convert_timestamp()

Add kernel API ptp_convert_timestamp() to convert raw hardware timestamp
to a specified ptp vclock time.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/ptp/ptp_vclock.c         | 34 ++++++++++++++++++++++++++++++++++
 include/linux/ptp_clock_kernel.h | 13 +++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c
index cefab29a0592..e0f87c57749a 100644
--- a/drivers/ptp/ptp_vclock.c
+++ b/drivers/ptp/ptp_vclock.c
@@ -183,3 +183,37 @@ out:
 	return num;
 }
 EXPORT_SYMBOL(ptp_get_vclocks_index);
+
+void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+			   int vclock_index)
+{
+	char name[PTP_CLOCK_NAME_LEN] = "";
+	struct ptp_vclock *vclock;
+	struct ptp_clock *ptp;
+	unsigned long flags;
+	struct device *dev;
+	u64 ns;
+
+	snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", vclock_index);
+	dev = class_find_device_by_name(ptp_class, name);
+	if (!dev)
+		return;
+
+	ptp = dev_get_drvdata(dev);
+	if (!ptp->is_virtual_clock) {
+		put_device(dev);
+		return;
+	}
+
+	vclock = info_to_vclock(ptp->info);
+
+	ns = ktime_to_ns(hwtstamps->hwtstamp);
+
+	spin_lock_irqsave(&vclock->lock, flags);
+	ns = timecounter_cyc2time(&vclock->tc, ns);
+	spin_unlock_irqrestore(&vclock->lock, flags);
+
+	put_device(dev);
+	hwtstamps->hwtstamp = ns_to_ktime(ns);
+}
+EXPORT_SYMBOL(ptp_convert_timestamp);
diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h
index 300a984fec87..71fac9237725 100644
--- a/include/linux/ptp_clock_kernel.h
+++ b/include/linux/ptp_clock_kernel.h
@@ -12,6 +12,7 @@
 #include <linux/pps_kernel.h>
 #include <linux/ptp_clock.h>
 #include <linux/timecounter.h>
+#include <linux/skbuff.h>
 
 #define PTP_CLOCK_NAME_LEN	32
 /**
@@ -318,6 +319,15 @@ void ptp_cancel_worker_sync(struct ptp_clock *ptp);
  */
 int ptp_get_vclocks_index(int pclock_index, int **vclock_index);
 
+/**
+ * ptp_convert_timestamp() - convert timestamp to a ptp vclock time
+ *
+ * @hwtstamps:    skb_shared_hwtstamps structure pointer
+ * @vclock_index: phc index of ptp vclock.
+ */
+void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+			   int vclock_index);
+
 #else
 static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info,
 						   struct device *parent)
@@ -339,6 +349,9 @@ static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp)
 { }
 static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index)
 { return 0; }
+static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps,
+					 int vclock_index)
+{ }
 
 #endif
 
-- 
cgit v1.2.3


From d463126e23f112629edb01594141ca437a92a108 Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Wed, 30 Jun 2021 16:11:59 +0800
Subject: net: sock: extend SO_TIMESTAMPING for PHC binding

Since PTP virtual clock support is added, there can be
several PTP virtual clocks based on one PTP physical
clock for timestamping.

This patch is to extend SO_TIMESTAMPING API to support
PHC (PTP Hardware Clock) binding by adding a new flag
SOF_TIMESTAMPING_BIND_PHC. When PTP virtual clocks are
in use, user space can configure to bind one for
timestamping, but PTP physical clock is not supported
and not needed to bind.

This patch is preparation for timestamp conversion from
raw timestamp to a specific PTP virtual clock time in
core net.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h              |  8 +++--
 include/uapi/linux/net_tstamp.h | 17 +++++++++--
 net/core/sock.c                 | 65 +++++++++++++++++++++++++++++++++++++++--
 net/ethtool/common.c            |  1 +
 net/mptcp/sockopt.c             | 23 +++++++++++----
 5 files changed, 101 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 8bdd80027ffb..f23cb259b0e2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -316,7 +316,9 @@ struct bpf_local_storage;
   *	@sk_timer: sock cleanup timer
   *	@sk_stamp: time stamp of last packet received
   *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
-  *	@sk_tsflags: SO_TIMESTAMPING socket options
+  *	@sk_tsflags: SO_TIMESTAMPING flags
+  *	@sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
+  *	              for timestamping
   *	@sk_tskey: counter to disambiguate concurrent tstamp requests
   *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
   *	@sk_socket: Identd and reporting IO signals
@@ -493,6 +495,7 @@ struct sock {
 	seqlock_t		sk_stamp_seq;
 #endif
 	u16			sk_tsflags;
+	int			sk_bind_phc;
 	u8			sk_shutdown;
 	u32			sk_tskey;
 	atomic_t		sk_zckey;
@@ -2755,7 +2758,8 @@ void sock_def_readable(struct sock *sk);
 
 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
 void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
-int sock_set_timestamping(struct sock *sk, int optname, int val);
+int sock_set_timestamping(struct sock *sk, int optname,
+			  struct so_timestamping timestamping);
 
 void sock_enable_timestamps(struct sock *sk);
 void sock_no_linger(struct sock *sk);
diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index 7ed0b3d1c00a..fcc61c73a666 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -13,7 +13,7 @@
 #include <linux/types.h>
 #include <linux/socket.h>   /* for SO_TIMESTAMPING */
 
-/* SO_TIMESTAMPING gets an integer bit field comprised of these values */
+/* SO_TIMESTAMPING flags */
 enum {
 	SOF_TIMESTAMPING_TX_HARDWARE = (1<<0),
 	SOF_TIMESTAMPING_TX_SOFTWARE = (1<<1),
@@ -30,8 +30,9 @@ enum {
 	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
 	SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13),
 	SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14),
+	SOF_TIMESTAMPING_BIND_PHC = (1 << 15),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TX_SWHW,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
@@ -46,6 +47,18 @@ enum {
 					 SOF_TIMESTAMPING_TX_SCHED | \
 					 SOF_TIMESTAMPING_TX_ACK)
 
+/**
+ * struct so_timestamping - SO_TIMESTAMPING parameter
+ *
+ * @flags:	SO_TIMESTAMPING flags
+ * @bind_phc:	Index of PTP virtual clock bound to sock. This is available
+ *		if flag SOF_TIMESTAMPING_BIND_PHC is set.
+ */
+struct so_timestamping {
+	int flags;
+	int bind_phc;
+};
+
 /**
  * struct hwtstamp_config - %SIOCGHWTSTAMP and %SIOCSHWTSTAMP parameter
  *
diff --git a/net/core/sock.c b/net/core/sock.c
index dd9599656c40..cad107112204 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
 #include <net/tcp.h>
 #include <net/busy_poll.h>
 
+#include <linux/ethtool.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -810,8 +812,47 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 	}
 }
 
-int sock_set_timestamping(struct sock *sk, int optname, int val)
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 {
+	struct net *net = sock_net(sk);
+	struct net_device *dev = NULL;
+	bool match = false;
+	int *vclock_index;
+	int i, num;
+
+	if (sk->sk_bound_dev_if)
+		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+
+	if (!dev) {
+		pr_err("%s: sock not bind to device\n", __func__);
+		return -EOPNOTSUPP;
+	}
+
+	num = ethtool_get_phc_vclocks(dev, &vclock_index);
+	for (i = 0; i < num; i++) {
+		if (*(vclock_index + i) == phc_index) {
+			match = true;
+			break;
+		}
+	}
+
+	if (num > 0)
+		kfree(vclock_index);
+
+	if (!match)
+		return -EINVAL;
+
+	sk->sk_bind_phc = phc_index;
+
+	return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+			  struct so_timestamping timestamping)
+{
+	int val = timestamping.flags;
+	int ret;
+
 	if (val & ~SOF_TIMESTAMPING_MASK)
 		return -EINVAL;
 
@@ -832,6 +873,12 @@ int sock_set_timestamping(struct sock *sk, int optname, int val)
 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 		return -EINVAL;
 
+	if (val & SOF_TIMESTAMPING_BIND_PHC) {
+		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+		if (ret)
+			return ret;
+	}
+
 	sk->sk_tsflags = val;
 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 
@@ -907,6 +954,7 @@ EXPORT_SYMBOL(sock_set_mark);
 int sock_setsockopt(struct socket *sock, int level, int optname,
 		    sockptr_t optval, unsigned int optlen)
 {
+	struct so_timestamping timestamping;
 	struct sock_txtime sk_txtime;
 	struct sock *sk = sock->sk;
 	int val;
@@ -1073,7 +1121,15 @@ set_sndbuf:
 
 	case SO_TIMESTAMPING_NEW:
 	case SO_TIMESTAMPING_OLD:
-		ret = sock_set_timestamping(sk, optname, val);
+		if (optlen == sizeof(timestamping)) {
+			if (copy_from_sockptr(&timestamping, optval,
+					      sizeof(timestamping)))
+				return -EFAULT;
+		} else {
+			memset(&timestamping, 0, sizeof(timestamping));
+			timestamping.flags = val;
+		}
+		ret = sock_set_timestamping(sk, optname, timestamping);
 		break;
 
 	case SO_RCVLOWAT:
@@ -1348,6 +1404,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		struct __kernel_old_timeval tm;
 		struct  __kernel_sock_timeval stm;
 		struct sock_txtime txtime;
+		struct so_timestamping timestamping;
 	} v;
 
 	int lv = sizeof(int);
@@ -1451,7 +1508,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		break;
 
 	case SO_TIMESTAMPING_OLD:
-		v.val = sk->sk_tsflags;
+		lv = sizeof(v.timestamping);
+		v.timestamping.flags = sk->sk_tsflags;
+		v.timestamping.bind_phc = sk->sk_bind_phc;
 		break;
 
 	case SO_RCVTIMEO_OLD:
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 798231b07676..c63e0739dc6a 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -398,6 +398,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
 	[const_ilog2(SOF_TIMESTAMPING_OPT_STATS)]    = "option-stats",
 	[const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)]  = "option-pktinfo",
 	[const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)]  = "option-tx-swhw",
+	[const_ilog2(SOF_TIMESTAMPING_BIND_PHC)]     = "bind-phc",
 };
 static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
 
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index ea38cbcd2ad4..8c03afac5ca0 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -207,14 +207,25 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
 {
 	struct mptcp_subflow_context *subflow;
 	struct sock *sk = (struct sock *)msk;
-	int val, ret;
+	struct so_timestamping timestamping;
+	int ret;
 
-	ret = mptcp_get_int_option(msk, optval, optlen, &val);
-	if (ret)
-		return ret;
+	if (optlen == sizeof(timestamping)) {
+		if (copy_from_sockptr(&timestamping, optval,
+				      sizeof(timestamping)))
+			return -EFAULT;
+	} else if (optlen == sizeof(int)) {
+		memset(&timestamping, 0, sizeof(timestamping));
+
+		if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
+			return -EFAULT;
+	} else {
+		return -EINVAL;
+	}
 
 	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
-			      KERNEL_SOCKPTR(&val), sizeof(val));
+			      KERNEL_SOCKPTR(&timestamping),
+			      sizeof(timestamping));
 	if (ret)
 		return ret;
 
@@ -224,7 +235,7 @@ static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 		bool slow = lock_sock_fast(ssk);
 
-		sock_set_timestamping(sk, optname, val);
+		sock_set_timestamping(sk, optname, timestamping);
 		unlock_sock_fast(ssk, slow);
 	}
 
-- 
cgit v1.2.3


From ca75bcf0a83b6cc7f53a593d98ec7121c4839b43 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Jul 2021 10:15:09 +0200
Subject: net: remove the caif_hsi driver

The caif_hsi driver relies on a cfhsi_get_ops symbol using symbol_get,
but this symbol is not provided anywhere in the kernel tree.  Remove
this driver given that it is dead code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/caif/Kconfig    |    9 -
 drivers/net/caif/Makefile   |    3 -
 drivers/net/caif/caif_hsi.c | 1454 -------------------------------------------
 include/net/caif/caif_hsi.h |  200 ------
 4 files changed, 1666 deletions(-)
 delete mode 100644 drivers/net/caif/caif_hsi.c
 delete mode 100644 include/net/caif/caif_hsi.h

(limited to 'include')

diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index a77124bc1f4b..709660cb38f8 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -20,15 +20,6 @@ config CAIF_TTY
 	  identified as N_CAIF. When this ldisc is opened from user space
 	  it will redirect the TTY's traffic into the CAIF stack.
 
-config CAIF_HSI
-	tristate "CAIF HSI transport driver"
-	depends on CAIF
-	default n
-	help
-	  The CAIF low level driver for CAIF over HSI.
-	  Be aware that if you enable this then you also need to
-	  enable a low-level HSI driver.
-
 config CAIF_VIRTIO
 	tristate "CAIF virtio transport driver"
 	depends on CAIF && HAS_DMA
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index b1918c8c126c..97f664f8016c 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -4,8 +4,5 @@ ccflags-$(CONFIG_CAIF_DEBUG) := -DDEBUG
 # Serial interface
 obj-$(CONFIG_CAIF_TTY) += caif_serial.o
 
-# HSI interface
-obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
-
 # Virtio interface
 obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
deleted file mode 100644
index 3d63b15bbaa1..000000000000
--- a/drivers/net/caif/caif_hsi.c
+++ /dev/null
@@ -1,1454 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author:  Daniel Martensson
- *	    Dmitry.Tarnyagin  / dmitry.tarnyagin@lockless.no
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME fmt
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/netdevice.h>
-#include <linux/string.h>
-#include <linux/list.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/if_arp.h>
-#include <linux/timer.h>
-#include <net/rtnetlink.h>
-#include <linux/pkt_sched.h>
-#include <net/caif/caif_layer.h>
-#include <net/caif/caif_hsi.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Daniel Martensson");
-MODULE_DESCRIPTION("CAIF HSI driver");
-
-/* Returns the number of padding bytes for alignment. */
-#define PAD_POW2(x, pow) ((((x)&((pow)-1)) == 0) ? 0 :\
-				(((pow)-((x)&((pow)-1)))))
-
-static const struct cfhsi_config  hsi_default_config = {
-
-	/* Inactivity timeout on HSI, ms */
-	.inactivity_timeout = HZ,
-
-	/* Aggregation timeout (ms) of zero means no aggregation is done*/
-	.aggregation_timeout = 1,
-
-	/*
-	 * HSI link layer flow-control thresholds.
-	 * Threshold values for the HSI packet queue. Flow-control will be
-	 * asserted when the number of packets exceeds q_high_mark. It will
-	 * not be de-asserted before the number of packets drops below
-	 * q_low_mark.
-	 * Warning: A high threshold value might increase throughput but it
-	 * will at the same time prevent channel prioritization and increase
-	 * the risk of flooding the modem. The high threshold should be above
-	 * the low.
-	 */
-	.q_high_mark = 100,
-	.q_low_mark = 50,
-
-	/*
-	 * HSI padding options.
-	 * Warning: must be a base of 2 (& operation used) and can not be zero !
-	 */
-	.head_align = 4,
-	.tail_align = 4,
-};
-
-#define ON 1
-#define OFF 0
-
-static LIST_HEAD(cfhsi_list);
-
-static void cfhsi_inactivity_tout(struct timer_list *t)
-{
-	struct cfhsi *cfhsi = from_timer(cfhsi, t, inactivity_timer);
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	/* Schedule power down work queue. */
-	if (!test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		queue_work(cfhsi->wq, &cfhsi->wake_down_work);
-}
-
-static void cfhsi_update_aggregation_stats(struct cfhsi *cfhsi,
-					   const struct sk_buff *skb,
-					   int direction)
-{
-	struct caif_payload_info *info;
-	int hpad, tpad, len;
-
-	info = (struct caif_payload_info *)&skb->cb;
-	hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-	tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-	len = skb->len + hpad + tpad;
-
-	if (direction > 0)
-		cfhsi->aggregation_len += len;
-	else if (direction < 0)
-		cfhsi->aggregation_len -= len;
-}
-
-static bool cfhsi_can_send_aggregate(struct cfhsi *cfhsi)
-{
-	int i;
-
-	if (cfhsi->cfg.aggregation_timeout == 0)
-		return true;
-
-	for (i = 0; i < CFHSI_PRIO_BEBK; ++i) {
-		if (cfhsi->qhead[i].qlen)
-			return true;
-	}
-
-	/* TODO: Use aggregation_len instead */
-	if (cfhsi->qhead[CFHSI_PRIO_BEBK].qlen >= CFHSI_MAX_PKTS)
-		return true;
-
-	return false;
-}
-
-static struct sk_buff *cfhsi_dequeue(struct cfhsi *cfhsi)
-{
-	struct sk_buff *skb;
-	int i;
-
-	for (i = 0; i < CFHSI_PRIO_LAST; ++i) {
-		skb = skb_dequeue(&cfhsi->qhead[i]);
-		if (skb)
-			break;
-	}
-
-	return skb;
-}
-
-static int cfhsi_tx_queue_len(struct cfhsi *cfhsi)
-{
-	int i, len = 0;
-	for (i = 0; i < CFHSI_PRIO_LAST; ++i)
-		len += skb_queue_len(&cfhsi->qhead[i]);
-	return len;
-}
-
-static void cfhsi_abort_tx(struct cfhsi *cfhsi)
-{
-	struct sk_buff *skb;
-
-	for (;;) {
-		spin_lock_bh(&cfhsi->lock);
-		skb = cfhsi_dequeue(cfhsi);
-		if (!skb)
-			break;
-
-		cfhsi->ndev->stats.tx_errors++;
-		cfhsi->ndev->stats.tx_dropped++;
-		cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-		spin_unlock_bh(&cfhsi->lock);
-		kfree_skb(skb);
-	}
-	cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-	if (!test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		mod_timer(&cfhsi->inactivity_timer,
-			jiffies + cfhsi->cfg.inactivity_timeout);
-	spin_unlock_bh(&cfhsi->lock);
-}
-
-static int cfhsi_flush_fifo(struct cfhsi *cfhsi)
-{
-	char buffer[32]; /* Any reasonable value */
-	size_t fifo_occupancy;
-	int ret;
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	do {
-		ret = cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-				&fifo_occupancy);
-		if (ret) {
-			netdev_warn(cfhsi->ndev,
-				"%s: can't get FIFO occupancy: %d.\n",
-				__func__, ret);
-			break;
-		} else if (!fifo_occupancy)
-			/* No more data, exitting normally */
-			break;
-
-		fifo_occupancy = min(sizeof(buffer), fifo_occupancy);
-		set_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits);
-		ret = cfhsi->ops->cfhsi_rx(buffer, fifo_occupancy,
-				cfhsi->ops);
-		if (ret) {
-			clear_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits);
-			netdev_warn(cfhsi->ndev,
-				"%s: can't read data: %d.\n",
-				__func__, ret);
-			break;
-		}
-
-		ret = 5 * HZ;
-		ret = wait_event_interruptible_timeout(cfhsi->flush_fifo_wait,
-			 !test_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits), ret);
-
-		if (ret < 0) {
-			netdev_warn(cfhsi->ndev,
-				"%s: can't wait for flush complete: %d.\n",
-				__func__, ret);
-			break;
-		} else if (!ret) {
-			ret = -ETIMEDOUT;
-			netdev_warn(cfhsi->ndev,
-				"%s: timeout waiting for flush complete.\n",
-				__func__);
-			break;
-		}
-	} while (1);
-
-	return ret;
-}
-
-static int cfhsi_tx_frm(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-	int nfrms = 0;
-	int pld_len = 0;
-	struct sk_buff *skb;
-	u8 *pfrm = desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ;
-
-	skb = cfhsi_dequeue(cfhsi);
-	if (!skb)
-		return 0;
-
-	/* Clear offset. */
-	desc->offset = 0;
-
-	/* Check if we can embed a CAIF frame. */
-	if (skb->len < CFHSI_MAX_EMB_FRM_SZ) {
-		struct caif_payload_info *info;
-		int hpad;
-		int tpad;
-
-		/* Calculate needed head alignment and tail alignment. */
-		info = (struct caif_payload_info *)&skb->cb;
-
-		hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-		tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-
-		/* Check if frame still fits with added alignment. */
-		if ((skb->len + hpad + tpad) <= CFHSI_MAX_EMB_FRM_SZ) {
-			u8 *pemb = desc->emb_frm;
-			desc->offset = CFHSI_DESC_SHORT_SZ;
-			*pemb = (u8)(hpad - 1);
-			pemb += hpad;
-
-			/* Update network statistics. */
-			spin_lock_bh(&cfhsi->lock);
-			cfhsi->ndev->stats.tx_packets++;
-			cfhsi->ndev->stats.tx_bytes += skb->len;
-			cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-			spin_unlock_bh(&cfhsi->lock);
-
-			/* Copy in embedded CAIF frame. */
-			skb_copy_bits(skb, 0, pemb, skb->len);
-
-			/* Consume the SKB */
-			consume_skb(skb);
-			skb = NULL;
-		}
-	}
-
-	/* Create payload CAIF frames. */
-	while (nfrms < CFHSI_MAX_PKTS) {
-		struct caif_payload_info *info;
-		int hpad;
-		int tpad;
-
-		if (!skb)
-			skb = cfhsi_dequeue(cfhsi);
-
-		if (!skb)
-			break;
-
-		/* Calculate needed head alignment and tail alignment. */
-		info = (struct caif_payload_info *)&skb->cb;
-
-		hpad = 1 + PAD_POW2((info->hdr_len + 1), cfhsi->cfg.head_align);
-		tpad = PAD_POW2((skb->len + hpad), cfhsi->cfg.tail_align);
-
-		/* Fill in CAIF frame length in descriptor. */
-		desc->cffrm_len[nfrms] = hpad + skb->len + tpad;
-
-		/* Fill head padding information. */
-		*pfrm = (u8)(hpad - 1);
-		pfrm += hpad;
-
-		/* Update network statistics. */
-		spin_lock_bh(&cfhsi->lock);
-		cfhsi->ndev->stats.tx_packets++;
-		cfhsi->ndev->stats.tx_bytes += skb->len;
-		cfhsi_update_aggregation_stats(cfhsi, skb, -1);
-		spin_unlock_bh(&cfhsi->lock);
-
-		/* Copy in CAIF frame. */
-		skb_copy_bits(skb, 0, pfrm, skb->len);
-
-		/* Update payload length. */
-		pld_len += desc->cffrm_len[nfrms];
-
-		/* Update frame pointer. */
-		pfrm += skb->len + tpad;
-
-		/* Consume the SKB */
-		consume_skb(skb);
-		skb = NULL;
-
-		/* Update number of frames. */
-		nfrms++;
-	}
-
-	/* Unused length fields should be zero-filled (according to SPEC). */
-	while (nfrms < CFHSI_MAX_PKTS) {
-		desc->cffrm_len[nfrms] = 0x0000;
-		nfrms++;
-	}
-
-	/* Check if we can piggy-back another descriptor. */
-	if (cfhsi_can_send_aggregate(cfhsi))
-		desc->header |= CFHSI_PIGGY_DESC;
-	else
-		desc->header &= ~CFHSI_PIGGY_DESC;
-
-	return CFHSI_DESC_SZ + pld_len;
-}
-
-static void cfhsi_start_tx(struct cfhsi *cfhsi)
-{
-	struct cfhsi_desc *desc = (struct cfhsi_desc *)cfhsi->tx_buf;
-	int len, res;
-
-	netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	do {
-		/* Create HSI frame. */
-		len = cfhsi_tx_frm(desc, cfhsi);
-		if (!len) {
-			spin_lock_bh(&cfhsi->lock);
-			if (unlikely(cfhsi_tx_queue_len(cfhsi))) {
-				spin_unlock_bh(&cfhsi->lock);
-				res = -EAGAIN;
-				continue;
-			}
-			cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-			/* Start inactivity timer. */
-			mod_timer(&cfhsi->inactivity_timer,
-				jiffies + cfhsi->cfg.inactivity_timeout);
-			spin_unlock_bh(&cfhsi->lock);
-			break;
-		}
-
-		/* Set up new transfer. */
-		res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-		if (WARN_ON(res < 0))
-			netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-				__func__, res);
-	} while (res < 0);
-}
-
-static void cfhsi_tx_done(struct cfhsi *cfhsi)
-{
-	netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/*
-	 * Send flow on if flow off has been previously signalled
-	 * and number of packets is below low water mark.
-	 */
-	spin_lock_bh(&cfhsi->lock);
-	if (cfhsi->flow_off_sent &&
-			cfhsi_tx_queue_len(cfhsi) <= cfhsi->cfg.q_low_mark &&
-			cfhsi->cfdev.flowctrl) {
-
-		cfhsi->flow_off_sent = 0;
-		cfhsi->cfdev.flowctrl(cfhsi->ndev, ON);
-	}
-
-	if (cfhsi_can_send_aggregate(cfhsi)) {
-		spin_unlock_bh(&cfhsi->lock);
-		cfhsi_start_tx(cfhsi);
-	} else {
-		mod_timer(&cfhsi->aggregation_timer,
-			jiffies + cfhsi->cfg.aggregation_timeout);
-		spin_unlock_bh(&cfhsi->lock);
-	}
-
-	return;
-}
-
-static void cfhsi_tx_done_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-	cfhsi_tx_done(cfhsi);
-}
-
-static int cfhsi_rx_desc(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-	int xfer_sz = 0;
-	int nfrms = 0;
-	u16 *plen = NULL;
-	u8 *pfrm = NULL;
-
-	if ((desc->header & ~CFHSI_PIGGY_DESC) ||
-			(desc->offset > CFHSI_MAX_EMB_FRM_SZ)) {
-		netdev_err(cfhsi->ndev, "%s: Invalid descriptor.\n",
-			__func__);
-		return -EPROTO;
-	}
-
-	/* Check for embedded CAIF frame. */
-	if (desc->offset) {
-		struct sk_buff *skb;
-		int len = 0;
-		pfrm = ((u8 *)desc) + desc->offset;
-
-		/* Remove offset padding. */
-		pfrm += *pfrm + 1;
-
-		/* Read length of CAIF frame (little endian). */
-		len = *pfrm;
-		len |= ((*(pfrm+1)) << 8) & 0xFF00;
-		len += 2;	/* Add FCS fields. */
-
-		/* Sanity check length of CAIF frame. */
-		if (unlikely(len > CFHSI_MAX_CAIF_FRAME_SZ)) {
-			netdev_err(cfhsi->ndev, "%s: Invalid length.\n",
-				__func__);
-			return -EPROTO;
-		}
-
-		/* Allocate SKB (OK even in IRQ context). */
-		skb = alloc_skb(len + 1, GFP_ATOMIC);
-		if (!skb) {
-			netdev_err(cfhsi->ndev, "%s: Out of memory !\n",
-				__func__);
-			return -ENOMEM;
-		}
-		caif_assert(skb != NULL);
-
-		skb_put_data(skb, pfrm, len);
-
-		skb->protocol = htons(ETH_P_CAIF);
-		skb_reset_mac_header(skb);
-		skb->dev = cfhsi->ndev;
-
-		netif_rx_any_context(skb);
-
-		/* Update network statistics. */
-		cfhsi->ndev->stats.rx_packets++;
-		cfhsi->ndev->stats.rx_bytes += len;
-	}
-
-	/* Calculate transfer length. */
-	plen = desc->cffrm_len;
-	while (nfrms < CFHSI_MAX_PKTS && *plen) {
-		xfer_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	/* Check for piggy-backed descriptor. */
-	if (desc->header & CFHSI_PIGGY_DESC)
-		xfer_sz += CFHSI_DESC_SZ;
-
-	if ((xfer_sz % 4) || (xfer_sz > (CFHSI_BUF_SZ_RX - CFHSI_DESC_SZ))) {
-		netdev_err(cfhsi->ndev,
-				"%s: Invalid payload len: %d, ignored.\n",
-			__func__, xfer_sz);
-		return -EPROTO;
-	}
-	return xfer_sz;
-}
-
-static int cfhsi_rx_desc_len(struct cfhsi_desc *desc)
-{
-	int xfer_sz = 0;
-	int nfrms = 0;
-	u16 *plen;
-
-	if ((desc->header & ~CFHSI_PIGGY_DESC) ||
-			(desc->offset > CFHSI_MAX_EMB_FRM_SZ)) {
-
-		pr_err("Invalid descriptor. %x %x\n", desc->header,
-				desc->offset);
-		return -EPROTO;
-	}
-
-	/* Calculate transfer length. */
-	plen = desc->cffrm_len;
-	while (nfrms < CFHSI_MAX_PKTS && *plen) {
-		xfer_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	if (xfer_sz % 4) {
-		pr_err("Invalid payload len: %d, ignored.\n", xfer_sz);
-		return -EPROTO;
-	}
-	return xfer_sz;
-}
-
-static int cfhsi_rx_pld(struct cfhsi_desc *desc, struct cfhsi *cfhsi)
-{
-	int rx_sz = 0;
-	int nfrms = 0;
-	u16 *plen = NULL;
-	u8 *pfrm = NULL;
-
-	/* Sanity check header and offset. */
-	if (WARN_ON((desc->header & ~CFHSI_PIGGY_DESC) ||
-			(desc->offset > CFHSI_MAX_EMB_FRM_SZ))) {
-		netdev_err(cfhsi->ndev, "%s: Invalid descriptor.\n",
-			__func__);
-		return -EPROTO;
-	}
-
-	/* Set frame pointer to start of payload. */
-	pfrm = desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ;
-	plen = desc->cffrm_len;
-
-	/* Skip already processed frames. */
-	while (nfrms < cfhsi->rx_state.nfrms) {
-		pfrm += *plen;
-		rx_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	/* Parse payload. */
-	while (nfrms < CFHSI_MAX_PKTS && *plen) {
-		struct sk_buff *skb;
-		u8 *pcffrm = NULL;
-		int len;
-
-		/* CAIF frame starts after head padding. */
-		pcffrm = pfrm + *pfrm + 1;
-
-		/* Read length of CAIF frame (little endian). */
-		len = *pcffrm;
-		len |= ((*(pcffrm + 1)) << 8) & 0xFF00;
-		len += 2;	/* Add FCS fields. */
-
-		/* Sanity check length of CAIF frames. */
-		if (unlikely(len > CFHSI_MAX_CAIF_FRAME_SZ)) {
-			netdev_err(cfhsi->ndev, "%s: Invalid length.\n",
-				__func__);
-			return -EPROTO;
-		}
-
-		/* Allocate SKB (OK even in IRQ context). */
-		skb = alloc_skb(len + 1, GFP_ATOMIC);
-		if (!skb) {
-			netdev_err(cfhsi->ndev, "%s: Out of memory !\n",
-				__func__);
-			cfhsi->rx_state.nfrms = nfrms;
-			return -ENOMEM;
-		}
-		caif_assert(skb != NULL);
-
-		skb_put_data(skb, pcffrm, len);
-
-		skb->protocol = htons(ETH_P_CAIF);
-		skb_reset_mac_header(skb);
-		skb->dev = cfhsi->ndev;
-
-		netif_rx_any_context(skb);
-
-		/* Update network statistics. */
-		cfhsi->ndev->stats.rx_packets++;
-		cfhsi->ndev->stats.rx_bytes += len;
-
-		pfrm += *plen;
-		rx_sz += *plen;
-		plen++;
-		nfrms++;
-	}
-
-	return rx_sz;
-}
-
-static void cfhsi_rx_done(struct cfhsi *cfhsi)
-{
-	int res;
-	int desc_pld_len = 0, rx_len, rx_state;
-	struct cfhsi_desc *desc = NULL;
-	u8 *rx_ptr, *rx_buf;
-	struct cfhsi_desc *piggy_desc = NULL;
-
-	desc = (struct cfhsi_desc *)cfhsi->rx_buf;
-
-	netdev_dbg(cfhsi->ndev, "%s\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/* Update inactivity timer if pending. */
-	spin_lock_bh(&cfhsi->lock);
-	mod_timer_pending(&cfhsi->inactivity_timer,
-			jiffies + cfhsi->cfg.inactivity_timeout);
-	spin_unlock_bh(&cfhsi->lock);
-
-	if (cfhsi->rx_state.state == CFHSI_RX_STATE_DESC) {
-		desc_pld_len = cfhsi_rx_desc_len(desc);
-
-		if (desc_pld_len < 0)
-			goto out_of_sync;
-
-		rx_buf = cfhsi->rx_buf;
-		rx_len = desc_pld_len;
-		if (desc_pld_len > 0 && (desc->header & CFHSI_PIGGY_DESC))
-			rx_len += CFHSI_DESC_SZ;
-		if (desc_pld_len == 0)
-			rx_buf = cfhsi->rx_flip_buf;
-	} else {
-		rx_buf = cfhsi->rx_flip_buf;
-
-		rx_len = CFHSI_DESC_SZ;
-		if (cfhsi->rx_state.pld_len > 0 &&
-				(desc->header & CFHSI_PIGGY_DESC)) {
-
-			piggy_desc = (struct cfhsi_desc *)
-				(desc->emb_frm + CFHSI_MAX_EMB_FRM_SZ +
-						cfhsi->rx_state.pld_len);
-
-			cfhsi->rx_state.piggy_desc = true;
-
-			/* Extract payload len from piggy-backed descriptor. */
-			desc_pld_len = cfhsi_rx_desc_len(piggy_desc);
-			if (desc_pld_len < 0)
-				goto out_of_sync;
-
-			if (desc_pld_len > 0) {
-				rx_len = desc_pld_len;
-				if (piggy_desc->header & CFHSI_PIGGY_DESC)
-					rx_len += CFHSI_DESC_SZ;
-			}
-
-			/*
-			 * Copy needed information from the piggy-backed
-			 * descriptor to the descriptor in the start.
-			 */
-			memcpy(rx_buf, (u8 *)piggy_desc,
-					CFHSI_DESC_SHORT_SZ);
-		}
-	}
-
-	if (desc_pld_len) {
-		rx_state = CFHSI_RX_STATE_PAYLOAD;
-		rx_ptr = rx_buf + CFHSI_DESC_SZ;
-	} else {
-		rx_state = CFHSI_RX_STATE_DESC;
-		rx_ptr = rx_buf;
-		rx_len = CFHSI_DESC_SZ;
-	}
-
-	/* Initiate next read */
-	if (test_bit(CFHSI_AWAKE, &cfhsi->bits)) {
-		/* Set up new transfer. */
-		netdev_dbg(cfhsi->ndev, "%s: Start RX.\n",
-				__func__);
-
-		res = cfhsi->ops->cfhsi_rx(rx_ptr, rx_len,
-				cfhsi->ops);
-		if (WARN_ON(res < 0)) {
-			netdev_err(cfhsi->ndev, "%s: RX error %d.\n",
-				__func__, res);
-			cfhsi->ndev->stats.rx_errors++;
-			cfhsi->ndev->stats.rx_dropped++;
-		}
-	}
-
-	if (cfhsi->rx_state.state == CFHSI_RX_STATE_DESC) {
-		/* Extract payload from descriptor */
-		if (cfhsi_rx_desc(desc, cfhsi) < 0)
-			goto out_of_sync;
-	} else {
-		/* Extract payload */
-		if (cfhsi_rx_pld(desc, cfhsi) < 0)
-			goto out_of_sync;
-		if (piggy_desc) {
-			/* Extract any payload in piggyback descriptor. */
-			if (cfhsi_rx_desc(piggy_desc, cfhsi) < 0)
-				goto out_of_sync;
-			/* Mark no embedded frame after extracting it */
-			piggy_desc->offset = 0;
-		}
-	}
-
-	/* Update state info */
-	memset(&cfhsi->rx_state, 0, sizeof(cfhsi->rx_state));
-	cfhsi->rx_state.state = rx_state;
-	cfhsi->rx_ptr = rx_ptr;
-	cfhsi->rx_len = rx_len;
-	cfhsi->rx_state.pld_len = desc_pld_len;
-	cfhsi->rx_state.piggy_desc = desc->header & CFHSI_PIGGY_DESC;
-
-	if (rx_buf != cfhsi->rx_buf)
-		swap(cfhsi->rx_buf, cfhsi->rx_flip_buf);
-	return;
-
-out_of_sync:
-	netdev_err(cfhsi->ndev, "%s: Out of sync.\n", __func__);
-	print_hex_dump_bytes("--> ", DUMP_PREFIX_NONE,
-			cfhsi->rx_buf, CFHSI_DESC_SZ);
-	schedule_work(&cfhsi->out_of_sync_work);
-}
-
-static void cfhsi_rx_slowpath(struct timer_list *t)
-{
-	struct cfhsi *cfhsi = from_timer(cfhsi, t, rx_slowpath_timer);
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	cfhsi_rx_done(cfhsi);
-}
-
-static void cfhsi_rx_done_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	if (test_and_clear_bit(CFHSI_FLUSH_FIFO, &cfhsi->bits))
-		wake_up_interruptible(&cfhsi->flush_fifo_wait);
-	else
-		cfhsi_rx_done(cfhsi);
-}
-
-static void cfhsi_wake_up(struct work_struct *work)
-{
-	struct cfhsi *cfhsi = NULL;
-	int res;
-	int len;
-	long ret;
-
-	cfhsi = container_of(work, struct cfhsi, wake_up_work);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	if (unlikely(test_bit(CFHSI_AWAKE, &cfhsi->bits))) {
-		/* It happenes when wakeup is requested by
-		 * both ends at the same time. */
-		clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-		clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-		return;
-	}
-
-	/* Activate wake line. */
-	cfhsi->ops->cfhsi_wake_up(cfhsi->ops);
-
-	netdev_dbg(cfhsi->ndev, "%s: Start waiting.\n",
-		__func__);
-
-	/* Wait for acknowledge. */
-	ret = CFHSI_WAKE_TOUT;
-	ret = wait_event_interruptible_timeout(cfhsi->wake_up_wait,
-					test_and_clear_bit(CFHSI_WAKE_UP_ACK,
-							&cfhsi->bits), ret);
-	if (unlikely(ret < 0)) {
-		/* Interrupted by signal. */
-		netdev_err(cfhsi->ndev, "%s: Signalled: %ld.\n",
-			__func__, ret);
-
-		clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-		cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-		return;
-	} else if (!ret) {
-		bool ca_wake = false;
-		size_t fifo_occupancy = 0;
-
-		/* Wakeup timeout */
-		netdev_dbg(cfhsi->ndev, "%s: Timeout.\n",
-			__func__);
-
-		/* Check FIFO to check if modem has sent something. */
-		WARN_ON(cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-					&fifo_occupancy));
-
-		netdev_dbg(cfhsi->ndev, "%s: Bytes in FIFO: %u.\n",
-				__func__, (unsigned) fifo_occupancy);
-
-		/* Check if we misssed the interrupt. */
-		WARN_ON(cfhsi->ops->cfhsi_get_peer_wake(cfhsi->ops,
-							&ca_wake));
-
-		if (ca_wake) {
-			netdev_err(cfhsi->ndev, "%s: CA Wake missed !.\n",
-				__func__);
-
-			/* Clear the CFHSI_WAKE_UP_ACK bit to prevent race. */
-			clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-
-			/* Continue execution. */
-			goto wake_ack;
-		}
-
-		clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-		cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-		return;
-	}
-wake_ack:
-	netdev_dbg(cfhsi->ndev, "%s: Woken.\n",
-		__func__);
-
-	/* Clear power up bit. */
-	set_bit(CFHSI_AWAKE, &cfhsi->bits);
-	clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-
-	/* Resume read operation. */
-	netdev_dbg(cfhsi->ndev, "%s: Start RX.\n", __func__);
-	res = cfhsi->ops->cfhsi_rx(cfhsi->rx_ptr, cfhsi->rx_len, cfhsi->ops);
-
-	if (WARN_ON(res < 0))
-		netdev_err(cfhsi->ndev, "%s: RX err %d.\n", __func__, res);
-
-	/* Clear power up acknowledment. */
-	clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-
-	spin_lock_bh(&cfhsi->lock);
-
-	/* Resume transmit if queues are not empty. */
-	if (!cfhsi_tx_queue_len(cfhsi)) {
-		netdev_dbg(cfhsi->ndev, "%s: Peer wake, start timer.\n",
-			__func__);
-		/* Start inactivity timer. */
-		mod_timer(&cfhsi->inactivity_timer,
-				jiffies + cfhsi->cfg.inactivity_timeout);
-		spin_unlock_bh(&cfhsi->lock);
-		return;
-	}
-
-	netdev_dbg(cfhsi->ndev, "%s: Host wake.\n",
-		__func__);
-
-	spin_unlock_bh(&cfhsi->lock);
-
-	/* Create HSI frame. */
-	len = cfhsi_tx_frm((struct cfhsi_desc *)cfhsi->tx_buf, cfhsi);
-
-	if (likely(len > 0)) {
-		/* Set up new transfer. */
-		res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-		if (WARN_ON(res < 0)) {
-			netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-				__func__, res);
-			cfhsi_abort_tx(cfhsi);
-		}
-	} else {
-		netdev_err(cfhsi->ndev,
-				"%s: Failed to create HSI frame: %d.\n",
-				__func__, len);
-	}
-}
-
-static void cfhsi_wake_down(struct work_struct *work)
-{
-	long ret;
-	struct cfhsi *cfhsi = NULL;
-	size_t fifo_occupancy = 0;
-	int retry = CFHSI_WAKE_TOUT;
-
-	cfhsi = container_of(work, struct cfhsi, wake_down_work);
-	netdev_dbg(cfhsi->ndev, "%s.\n", __func__);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/* Deactivate wake line. */
-	cfhsi->ops->cfhsi_wake_down(cfhsi->ops);
-
-	/* Wait for acknowledge. */
-	ret = CFHSI_WAKE_TOUT;
-	ret = wait_event_interruptible_timeout(cfhsi->wake_down_wait,
-					test_and_clear_bit(CFHSI_WAKE_DOWN_ACK,
-							&cfhsi->bits), ret);
-	if (ret < 0) {
-		/* Interrupted by signal. */
-		netdev_err(cfhsi->ndev, "%s: Signalled: %ld.\n",
-			__func__, ret);
-		return;
-	} else if (!ret) {
-		bool ca_wake = true;
-
-		/* Timeout */
-		netdev_err(cfhsi->ndev, "%s: Timeout.\n", __func__);
-
-		/* Check if we misssed the interrupt. */
-		WARN_ON(cfhsi->ops->cfhsi_get_peer_wake(cfhsi->ops,
-							&ca_wake));
-		if (!ca_wake)
-			netdev_err(cfhsi->ndev, "%s: CA Wake missed !.\n",
-				__func__);
-	}
-
-	/* Check FIFO occupancy. */
-	while (retry) {
-		WARN_ON(cfhsi->ops->cfhsi_fifo_occupancy(cfhsi->ops,
-							&fifo_occupancy));
-
-		if (!fifo_occupancy)
-			break;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(1);
-		retry--;
-	}
-
-	if (!retry)
-		netdev_err(cfhsi->ndev, "%s: FIFO Timeout.\n", __func__);
-
-	/* Clear AWAKE condition. */
-	clear_bit(CFHSI_AWAKE, &cfhsi->bits);
-
-	/* Cancel pending RX requests. */
-	cfhsi->ops->cfhsi_rx_cancel(cfhsi->ops);
-}
-
-static void cfhsi_out_of_sync(struct work_struct *work)
-{
-	struct cfhsi *cfhsi = NULL;
-
-	cfhsi = container_of(work, struct cfhsi, out_of_sync_work);
-
-	rtnl_lock();
-	dev_close(cfhsi->ndev);
-	rtnl_unlock();
-}
-
-static void cfhsi_wake_up_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi = NULL;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	set_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-	wake_up_interruptible(&cfhsi->wake_up_wait);
-
-	if (test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))
-		return;
-
-	/* Schedule wake up work queue if the peer initiates. */
-	if (!test_and_set_bit(CFHSI_WAKE_UP, &cfhsi->bits))
-		queue_work(cfhsi->wq, &cfhsi->wake_up_work);
-}
-
-static void cfhsi_wake_down_cb(struct cfhsi_cb_ops *cb_ops)
-{
-	struct cfhsi *cfhsi = NULL;
-
-	cfhsi = container_of(cb_ops, struct cfhsi, cb_ops);
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	/* Initiating low power is only permitted by the host (us). */
-	set_bit(CFHSI_WAKE_DOWN_ACK, &cfhsi->bits);
-	wake_up_interruptible(&cfhsi->wake_down_wait);
-}
-
-static void cfhsi_aggregation_tout(struct timer_list *t)
-{
-	struct cfhsi *cfhsi = from_timer(cfhsi, t, aggregation_timer);
-
-	netdev_dbg(cfhsi->ndev, "%s.\n",
-		__func__);
-
-	cfhsi_start_tx(cfhsi);
-}
-
-static netdev_tx_t cfhsi_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct cfhsi *cfhsi = NULL;
-	int start_xfer = 0;
-	int timer_active;
-	int prio;
-
-	if (!dev)
-		return -EINVAL;
-
-	cfhsi = netdev_priv(dev);
-
-	switch (skb->priority) {
-	case TC_PRIO_BESTEFFORT:
-	case TC_PRIO_FILLER:
-	case TC_PRIO_BULK:
-		prio = CFHSI_PRIO_BEBK;
-		break;
-	case TC_PRIO_INTERACTIVE_BULK:
-		prio = CFHSI_PRIO_VI;
-		break;
-	case TC_PRIO_INTERACTIVE:
-		prio = CFHSI_PRIO_VO;
-		break;
-	case TC_PRIO_CONTROL:
-	default:
-		prio = CFHSI_PRIO_CTL;
-		break;
-	}
-
-	spin_lock_bh(&cfhsi->lock);
-
-	/* Update aggregation statistics  */
-	cfhsi_update_aggregation_stats(cfhsi, skb, 1);
-
-	/* Queue the SKB */
-	skb_queue_tail(&cfhsi->qhead[prio], skb);
-
-	/* Sanity check; xmit should not be called after unregister_netdev */
-	if (WARN_ON(test_bit(CFHSI_SHUTDOWN, &cfhsi->bits))) {
-		spin_unlock_bh(&cfhsi->lock);
-		cfhsi_abort_tx(cfhsi);
-		return -EINVAL;
-	}
-
-	/* Send flow off if number of packets is above high water mark. */
-	if (!cfhsi->flow_off_sent &&
-		cfhsi_tx_queue_len(cfhsi) > cfhsi->cfg.q_high_mark &&
-		cfhsi->cfdev.flowctrl) {
-		cfhsi->flow_off_sent = 1;
-		cfhsi->cfdev.flowctrl(cfhsi->ndev, OFF);
-	}
-
-	if (cfhsi->tx_state == CFHSI_TX_STATE_IDLE) {
-		cfhsi->tx_state = CFHSI_TX_STATE_XFER;
-		start_xfer = 1;
-	}
-
-	if (!start_xfer) {
-		/* Send aggregate if it is possible */
-		bool aggregate_ready =
-			cfhsi_can_send_aggregate(cfhsi) &&
-			del_timer(&cfhsi->aggregation_timer) > 0;
-		spin_unlock_bh(&cfhsi->lock);
-		if (aggregate_ready)
-			cfhsi_start_tx(cfhsi);
-		return NETDEV_TX_OK;
-	}
-
-	/* Delete inactivity timer if started. */
-	timer_active = del_timer_sync(&cfhsi->inactivity_timer);
-
-	spin_unlock_bh(&cfhsi->lock);
-
-	if (timer_active) {
-		struct cfhsi_desc *desc = (struct cfhsi_desc *)cfhsi->tx_buf;
-		int len;
-		int res;
-
-		/* Create HSI frame. */
-		len = cfhsi_tx_frm(desc, cfhsi);
-		WARN_ON(!len);
-
-		/* Set up new transfer. */
-		res = cfhsi->ops->cfhsi_tx(cfhsi->tx_buf, len, cfhsi->ops);
-		if (WARN_ON(res < 0)) {
-			netdev_err(cfhsi->ndev, "%s: TX error %d.\n",
-				__func__, res);
-			cfhsi_abort_tx(cfhsi);
-		}
-	} else {
-		/* Schedule wake up work queue if the we initiate. */
-		if (!test_and_set_bit(CFHSI_WAKE_UP, &cfhsi->bits))
-			queue_work(cfhsi->wq, &cfhsi->wake_up_work);
-	}
-
-	return NETDEV_TX_OK;
-}
-
-static const struct net_device_ops cfhsi_netdevops;
-
-static void cfhsi_setup(struct net_device *dev)
-{
-	int i;
-	struct cfhsi *cfhsi = netdev_priv(dev);
-	dev->features = 0;
-	dev->type = ARPHRD_CAIF;
-	dev->flags = IFF_POINTOPOINT | IFF_NOARP;
-	dev->mtu = CFHSI_MAX_CAIF_FRAME_SZ;
-	dev->priv_flags |= IFF_NO_QUEUE;
-	dev->needs_free_netdev = true;
-	dev->netdev_ops = &cfhsi_netdevops;
-	for (i = 0; i < CFHSI_PRIO_LAST; ++i)
-		skb_queue_head_init(&cfhsi->qhead[i]);
-	cfhsi->cfdev.link_select = CAIF_LINK_HIGH_BANDW;
-	cfhsi->cfdev.use_frag = false;
-	cfhsi->cfdev.use_stx = false;
-	cfhsi->cfdev.use_fcs = false;
-	cfhsi->ndev = dev;
-	cfhsi->cfg = hsi_default_config;
-}
-
-static int cfhsi_open(struct net_device *ndev)
-{
-	struct cfhsi *cfhsi = netdev_priv(ndev);
-	int res;
-
-	clear_bit(CFHSI_SHUTDOWN, &cfhsi->bits);
-
-	/* Initialize state vaiables. */
-	cfhsi->tx_state = CFHSI_TX_STATE_IDLE;
-	cfhsi->rx_state.state = CFHSI_RX_STATE_DESC;
-
-	/* Set flow info */
-	cfhsi->flow_off_sent = 0;
-
-	/*
-	 * Allocate a TX buffer with the size of a HSI packet descriptors
-	 * and the necessary room for CAIF payload frames.
-	 */
-	cfhsi->tx_buf = kzalloc(CFHSI_BUF_SZ_TX, GFP_KERNEL);
-	if (!cfhsi->tx_buf) {
-		res = -ENODEV;
-		goto err_alloc_tx;
-	}
-
-	/*
-	 * Allocate a RX buffer with the size of two HSI packet descriptors and
-	 * the necessary room for CAIF payload frames.
-	 */
-	cfhsi->rx_buf = kzalloc(CFHSI_BUF_SZ_RX, GFP_KERNEL);
-	if (!cfhsi->rx_buf) {
-		res = -ENODEV;
-		goto err_alloc_rx;
-	}
-
-	cfhsi->rx_flip_buf = kzalloc(CFHSI_BUF_SZ_RX, GFP_KERNEL);
-	if (!cfhsi->rx_flip_buf) {
-		res = -ENODEV;
-		goto err_alloc_rx_flip;
-	}
-
-	/* Initialize aggregation timeout */
-	cfhsi->cfg.aggregation_timeout = hsi_default_config.aggregation_timeout;
-
-	/* Initialize recieve vaiables. */
-	cfhsi->rx_ptr = cfhsi->rx_buf;
-	cfhsi->rx_len = CFHSI_DESC_SZ;
-
-	/* Initialize spin locks. */
-	spin_lock_init(&cfhsi->lock);
-
-	/* Set up the driver. */
-	cfhsi->cb_ops.tx_done_cb = cfhsi_tx_done_cb;
-	cfhsi->cb_ops.rx_done_cb = cfhsi_rx_done_cb;
-	cfhsi->cb_ops.wake_up_cb = cfhsi_wake_up_cb;
-	cfhsi->cb_ops.wake_down_cb = cfhsi_wake_down_cb;
-
-	/* Initialize the work queues. */
-	INIT_WORK(&cfhsi->wake_up_work, cfhsi_wake_up);
-	INIT_WORK(&cfhsi->wake_down_work, cfhsi_wake_down);
-	INIT_WORK(&cfhsi->out_of_sync_work, cfhsi_out_of_sync);
-
-	/* Clear all bit fields. */
-	clear_bit(CFHSI_WAKE_UP_ACK, &cfhsi->bits);
-	clear_bit(CFHSI_WAKE_DOWN_ACK, &cfhsi->bits);
-	clear_bit(CFHSI_WAKE_UP, &cfhsi->bits);
-	clear_bit(CFHSI_AWAKE, &cfhsi->bits);
-
-	/* Create work thread. */
-	cfhsi->wq = alloc_ordered_workqueue(cfhsi->ndev->name, WQ_MEM_RECLAIM);
-	if (!cfhsi->wq) {
-		netdev_err(cfhsi->ndev, "%s: Failed to create work queue.\n",
-			__func__);
-		res = -ENODEV;
-		goto err_create_wq;
-	}
-
-	/* Initialize wait queues. */
-	init_waitqueue_head(&cfhsi->wake_up_wait);
-	init_waitqueue_head(&cfhsi->wake_down_wait);
-	init_waitqueue_head(&cfhsi->flush_fifo_wait);
-
-	/* Setup the inactivity timer. */
-	timer_setup(&cfhsi->inactivity_timer, cfhsi_inactivity_tout, 0);
-	/* Setup the slowpath RX timer. */
-	timer_setup(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath, 0);
-	/* Setup the aggregation timer. */
-	timer_setup(&cfhsi->aggregation_timer, cfhsi_aggregation_tout, 0);
-
-	/* Activate HSI interface. */
-	res = cfhsi->ops->cfhsi_up(cfhsi->ops);
-	if (res) {
-		netdev_err(cfhsi->ndev,
-			"%s: can't activate HSI interface: %d.\n",
-			__func__, res);
-		goto err_activate;
-	}
-
-	/* Flush FIFO */
-	res = cfhsi_flush_fifo(cfhsi);
-	if (res) {
-		netdev_err(cfhsi->ndev, "%s: Can't flush FIFO: %d.\n",
-			__func__, res);
-		goto err_net_reg;
-	}
-	return res;
-
- err_net_reg:
-	cfhsi->ops->cfhsi_down(cfhsi->ops);
- err_activate:
-	destroy_workqueue(cfhsi->wq);
- err_create_wq:
-	kfree(cfhsi->rx_flip_buf);
- err_alloc_rx_flip:
-	kfree(cfhsi->rx_buf);
- err_alloc_rx:
-	kfree(cfhsi->tx_buf);
- err_alloc_tx:
-	return res;
-}
-
-static int cfhsi_close(struct net_device *ndev)
-{
-	struct cfhsi *cfhsi = netdev_priv(ndev);
-	u8 *tx_buf, *rx_buf, *flip_buf;
-
-	/* going to shutdown driver */
-	set_bit(CFHSI_SHUTDOWN, &cfhsi->bits);
-
-	/* Delete timers if pending */
-	del_timer_sync(&cfhsi->inactivity_timer);
-	del_timer_sync(&cfhsi->rx_slowpath_timer);
-	del_timer_sync(&cfhsi->aggregation_timer);
-
-	/* Cancel pending RX request (if any) */
-	cfhsi->ops->cfhsi_rx_cancel(cfhsi->ops);
-
-	/* Destroy workqueue */
-	destroy_workqueue(cfhsi->wq);
-
-	/* Store bufferes: will be freed later. */
-	tx_buf = cfhsi->tx_buf;
-	rx_buf = cfhsi->rx_buf;
-	flip_buf = cfhsi->rx_flip_buf;
-	/* Flush transmit queues. */
-	cfhsi_abort_tx(cfhsi);
-
-	/* Deactivate interface */
-	cfhsi->ops->cfhsi_down(cfhsi->ops);
-
-	/* Free buffers. */
-	kfree(tx_buf);
-	kfree(rx_buf);
-	kfree(flip_buf);
-	return 0;
-}
-
-static void cfhsi_uninit(struct net_device *dev)
-{
-	struct cfhsi *cfhsi = netdev_priv(dev);
-	ASSERT_RTNL();
-	symbol_put(cfhsi_get_device);
-	list_del(&cfhsi->list);
-}
-
-static const struct net_device_ops cfhsi_netdevops = {
-	.ndo_uninit = cfhsi_uninit,
-	.ndo_open = cfhsi_open,
-	.ndo_stop = cfhsi_close,
-	.ndo_start_xmit = cfhsi_xmit
-};
-
-static void cfhsi_netlink_parms(struct nlattr *data[], struct cfhsi *cfhsi)
-{
-	int i;
-
-	if (!data) {
-		pr_debug("no params data found\n");
-		return;
-	}
-
-	i = __IFLA_CAIF_HSI_INACTIVITY_TOUT;
-	/*
-	 * Inactivity timeout in millisecs. Lowest possible value is 1,
-	 * and highest possible is NEXT_TIMER_MAX_DELTA.
-	 */
-	if (data[i]) {
-		u32 inactivity_timeout = nla_get_u32(data[i]);
-		/* Pre-calculate inactivity timeout. */
-		cfhsi->cfg.inactivity_timeout =	inactivity_timeout * HZ / 1000;
-		if (cfhsi->cfg.inactivity_timeout == 0)
-			cfhsi->cfg.inactivity_timeout = 1;
-		else if (cfhsi->cfg.inactivity_timeout > NEXT_TIMER_MAX_DELTA)
-			cfhsi->cfg.inactivity_timeout = NEXT_TIMER_MAX_DELTA;
-	}
-
-	i = __IFLA_CAIF_HSI_AGGREGATION_TOUT;
-	if (data[i])
-		cfhsi->cfg.aggregation_timeout = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_HEAD_ALIGN;
-	if (data[i])
-		cfhsi->cfg.head_align = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_TAIL_ALIGN;
-	if (data[i])
-		cfhsi->cfg.tail_align = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_QHIGH_WATERMARK;
-	if (data[i])
-		cfhsi->cfg.q_high_mark = nla_get_u32(data[i]);
-
-	i = __IFLA_CAIF_HSI_QLOW_WATERMARK;
-	if (data[i])
-		cfhsi->cfg.q_low_mark = nla_get_u32(data[i]);
-}
-
-static int caif_hsi_changelink(struct net_device *dev, struct nlattr *tb[],
-			       struct nlattr *data[],
-			       struct netlink_ext_ack *extack)
-{
-	cfhsi_netlink_parms(data, netdev_priv(dev));
-	netdev_state_change(dev);
-	return 0;
-}
-
-static const struct nla_policy caif_hsi_policy[__IFLA_CAIF_HSI_MAX + 1] = {
-	[__IFLA_CAIF_HSI_INACTIVITY_TOUT] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_AGGREGATION_TOUT] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_HEAD_ALIGN] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_TAIL_ALIGN] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_QHIGH_WATERMARK] = { .type = NLA_U32, .len = 4 },
-	[__IFLA_CAIF_HSI_QLOW_WATERMARK] = { .type = NLA_U32, .len = 4 },
-};
-
-static size_t caif_hsi_get_size(const struct net_device *dev)
-{
-	int i;
-	size_t s = 0;
-	for (i = __IFLA_CAIF_HSI_UNSPEC + 1; i < __IFLA_CAIF_HSI_MAX; i++)
-		s += nla_total_size(caif_hsi_policy[i].len);
-	return s;
-}
-
-static int caif_hsi_fill_info(struct sk_buff *skb, const struct net_device *dev)
-{
-	struct cfhsi *cfhsi = netdev_priv(dev);
-
-	if (nla_put_u32(skb, __IFLA_CAIF_HSI_INACTIVITY_TOUT,
-			cfhsi->cfg.inactivity_timeout) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_AGGREGATION_TOUT,
-			cfhsi->cfg.aggregation_timeout) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_HEAD_ALIGN,
-			cfhsi->cfg.head_align) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_TAIL_ALIGN,
-			cfhsi->cfg.tail_align) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_QHIGH_WATERMARK,
-			cfhsi->cfg.q_high_mark) ||
-	    nla_put_u32(skb, __IFLA_CAIF_HSI_QLOW_WATERMARK,
-			cfhsi->cfg.q_low_mark))
-		return -EMSGSIZE;
-
-	return 0;
-}
-
-static int caif_hsi_newlink(struct net *src_net, struct net_device *dev,
-			    struct nlattr *tb[], struct nlattr *data[],
-			    struct netlink_ext_ack *extack)
-{
-	struct cfhsi *cfhsi = NULL;
-	struct cfhsi_ops *(*get_ops)(void);
-
-	ASSERT_RTNL();
-
-	cfhsi = netdev_priv(dev);
-	cfhsi_netlink_parms(data, cfhsi);
-
-	get_ops = symbol_get(cfhsi_get_ops);
-	if (!get_ops) {
-		pr_err("%s: failed to get the cfhsi_ops\n", __func__);
-		return -ENODEV;
-	}
-
-	/* Assign the HSI device. */
-	cfhsi->ops = (*get_ops)();
-	if (!cfhsi->ops) {
-		pr_err("%s: failed to get the cfhsi_ops\n", __func__);
-		goto err;
-	}
-
-	/* Assign the driver to this HSI device. */
-	cfhsi->ops->cb_ops = &cfhsi->cb_ops;
-	if (register_netdevice(dev)) {
-		pr_warn("%s: caif_hsi device registration failed\n", __func__);
-		goto err;
-	}
-	/* Add CAIF HSI device to list. */
-	list_add_tail(&cfhsi->list, &cfhsi_list);
-
-	return 0;
-err:
-	symbol_put(cfhsi_get_ops);
-	return -ENODEV;
-}
-
-static struct rtnl_link_ops caif_hsi_link_ops __read_mostly = {
-	.kind		= "cfhsi",
-	.priv_size	= sizeof(struct cfhsi),
-	.setup		= cfhsi_setup,
-	.maxtype	= __IFLA_CAIF_HSI_MAX,
-	.policy	= caif_hsi_policy,
-	.newlink	= caif_hsi_newlink,
-	.changelink	= caif_hsi_changelink,
-	.get_size	= caif_hsi_get_size,
-	.fill_info	= caif_hsi_fill_info,
-};
-
-static void __exit cfhsi_exit_module(void)
-{
-	struct list_head *list_node;
-	struct list_head *n;
-	struct cfhsi *cfhsi;
-
-	rtnl_link_unregister(&caif_hsi_link_ops);
-
-	rtnl_lock();
-	list_for_each_safe(list_node, n, &cfhsi_list) {
-		cfhsi = list_entry(list_node, struct cfhsi, list);
-		unregister_netdevice(cfhsi->ndev);
-	}
-	rtnl_unlock();
-}
-
-static int __init cfhsi_init_module(void)
-{
-	return rtnl_link_register(&caif_hsi_link_ops);
-}
-
-module_init(cfhsi_init_module);
-module_exit(cfhsi_exit_module);
diff --git a/include/net/caif/caif_hsi.h b/include/net/caif/caif_hsi.h
deleted file mode 100644
index 552cf68d28d2..000000000000
--- a/include/net/caif/caif_hsi.h
+++ /dev/null
@@ -1,200 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author:  Daniel Martensson / daniel.martensson@stericsson.com
- *	    Dmitry.Tarnyagin  / dmitry.tarnyagin@stericsson.com
- */
-
-#ifndef CAIF_HSI_H_
-#define CAIF_HSI_H_
-
-#include <net/caif/caif_layer.h>
-#include <net/caif/caif_device.h>
-#include <linux/atomic.h>
-
-/*
- * Maximum number of CAIF frames that can reside in the same HSI frame.
- */
-#define CFHSI_MAX_PKTS 15
-
-/*
- * Maximum number of bytes used for the frame that can be embedded in the
- * HSI descriptor.
- */
-#define CFHSI_MAX_EMB_FRM_SZ 96
-
-/*
- * Decides if HSI buffers should be prefilled with 0xFF pattern for easier
- * debugging. Both TX and RX buffers will be filled before the transfer.
- */
-#define CFHSI_DBG_PREFILL		0
-
-/* Structure describing a HSI packet descriptor. */
-#pragma pack(1) /* Byte alignment. */
-struct cfhsi_desc {
-	u8 header;
-	u8 offset;
-	u16 cffrm_len[CFHSI_MAX_PKTS];
-	u8 emb_frm[CFHSI_MAX_EMB_FRM_SZ];
-};
-#pragma pack() /* Default alignment. */
-
-/* Size of the complete HSI packet descriptor. */
-#define CFHSI_DESC_SZ (sizeof(struct cfhsi_desc))
-
-/*
- * Size of the complete HSI packet descriptor excluding the optional embedded
- * CAIF frame.
- */
-#define CFHSI_DESC_SHORT_SZ (CFHSI_DESC_SZ - CFHSI_MAX_EMB_FRM_SZ)
-
-/*
- * Maximum bytes transferred in one transfer.
- */
-#define CFHSI_MAX_CAIF_FRAME_SZ 4096
-
-#define CFHSI_MAX_PAYLOAD_SZ (CFHSI_MAX_PKTS * CFHSI_MAX_CAIF_FRAME_SZ)
-
-/* Size of the complete HSI TX buffer. */
-#define CFHSI_BUF_SZ_TX (CFHSI_DESC_SZ + CFHSI_MAX_PAYLOAD_SZ)
-
-/* Size of the complete HSI RX buffer. */
-#define CFHSI_BUF_SZ_RX ((2 * CFHSI_DESC_SZ) + CFHSI_MAX_PAYLOAD_SZ)
-
-/* Bitmasks for the HSI descriptor. */
-#define CFHSI_PIGGY_DESC		(0x01 << 7)
-
-#define CFHSI_TX_STATE_IDLE			0
-#define CFHSI_TX_STATE_XFER			1
-
-#define CFHSI_RX_STATE_DESC			0
-#define CFHSI_RX_STATE_PAYLOAD			1
-
-/* Bitmasks for power management. */
-#define CFHSI_WAKE_UP				0
-#define CFHSI_WAKE_UP_ACK			1
-#define CFHSI_WAKE_DOWN_ACK			2
-#define CFHSI_AWAKE				3
-#define CFHSI_WAKELOCK_HELD			4
-#define CFHSI_SHUTDOWN				5
-#define CFHSI_FLUSH_FIFO			6
-
-#ifndef CFHSI_INACTIVITY_TOUT
-#define CFHSI_INACTIVITY_TOUT			(1 * HZ)
-#endif /* CFHSI_INACTIVITY_TOUT */
-
-#ifndef CFHSI_WAKE_TOUT
-#define CFHSI_WAKE_TOUT			(3 * HZ)
-#endif /* CFHSI_WAKE_TOUT */
-
-#ifndef CFHSI_MAX_RX_RETRIES
-#define CFHSI_MAX_RX_RETRIES		(10 * HZ)
-#endif
-
-/* Structure implemented by the CAIF HSI driver. */
-struct cfhsi_cb_ops {
-	void (*tx_done_cb) (struct cfhsi_cb_ops *drv);
-	void (*rx_done_cb) (struct cfhsi_cb_ops *drv);
-	void (*wake_up_cb) (struct cfhsi_cb_ops *drv);
-	void (*wake_down_cb) (struct cfhsi_cb_ops *drv);
-};
-
-/* Structure implemented by HSI device. */
-struct cfhsi_ops {
-	int (*cfhsi_up) (struct cfhsi_ops *dev);
-	int (*cfhsi_down) (struct cfhsi_ops *dev);
-	int (*cfhsi_tx) (u8 *ptr, int len, struct cfhsi_ops *dev);
-	int (*cfhsi_rx) (u8 *ptr, int len, struct cfhsi_ops *dev);
-	int (*cfhsi_wake_up) (struct cfhsi_ops *dev);
-	int (*cfhsi_wake_down) (struct cfhsi_ops *dev);
-	int (*cfhsi_get_peer_wake) (struct cfhsi_ops *dev, bool *status);
-	int (*cfhsi_fifo_occupancy) (struct cfhsi_ops *dev, size_t *occupancy);
-	int (*cfhsi_rx_cancel)(struct cfhsi_ops *dev);
-	struct cfhsi_cb_ops *cb_ops;
-};
-
-/* Structure holds status of received CAIF frames processing */
-struct cfhsi_rx_state {
-	int state;
-	int nfrms;
-	int pld_len;
-	int retries;
-	bool piggy_desc;
-};
-
-/* Priority mapping */
-enum {
-	CFHSI_PRIO_CTL = 0,
-	CFHSI_PRIO_VI,
-	CFHSI_PRIO_VO,
-	CFHSI_PRIO_BEBK,
-	CFHSI_PRIO_LAST,
-};
-
-struct cfhsi_config {
-	u32 inactivity_timeout;
-	u32 aggregation_timeout;
-	u32 head_align;
-	u32 tail_align;
-	u32 q_high_mark;
-	u32 q_low_mark;
-};
-
-/* Structure implemented by CAIF HSI drivers. */
-struct cfhsi {
-	struct caif_dev_common cfdev;
-	struct net_device *ndev;
-	struct platform_device *pdev;
-	struct sk_buff_head qhead[CFHSI_PRIO_LAST];
-	struct cfhsi_cb_ops cb_ops;
-	struct cfhsi_ops *ops;
-	int tx_state;
-	struct cfhsi_rx_state rx_state;
-	struct cfhsi_config cfg;
-	int rx_len;
-	u8 *rx_ptr;
-	u8 *tx_buf;
-	u8 *rx_buf;
-	u8 *rx_flip_buf;
-	spinlock_t lock;
-	int flow_off_sent;
-	struct list_head list;
-	struct work_struct wake_up_work;
-	struct work_struct wake_down_work;
-	struct work_struct out_of_sync_work;
-	struct workqueue_struct *wq;
-	wait_queue_head_t wake_up_wait;
-	wait_queue_head_t wake_down_wait;
-	wait_queue_head_t flush_fifo_wait;
-	struct timer_list inactivity_timer;
-	struct timer_list rx_slowpath_timer;
-
-	/* TX aggregation */
-	int aggregation_len;
-	struct timer_list aggregation_timer;
-
-	unsigned long bits;
-};
-extern struct platform_driver cfhsi_driver;
-
-/**
- * enum ifla_caif_hsi - CAIF HSI NetlinkRT parameters.
- * @IFLA_CAIF_HSI_INACTIVITY_TOUT: Inactivity timeout before
- *			taking the HSI wakeline down, in milliseconds.
- * When using RT Netlink to create, destroy or configure a CAIF HSI interface,
- * enum ifla_caif_hsi is used to specify the configuration attributes.
- */
-enum ifla_caif_hsi {
-	__IFLA_CAIF_HSI_UNSPEC,
-	__IFLA_CAIF_HSI_INACTIVITY_TOUT,
-	__IFLA_CAIF_HSI_AGGREGATION_TOUT,
-	__IFLA_CAIF_HSI_HEAD_ALIGN,
-	__IFLA_CAIF_HSI_TAIL_ALIGN,
-	__IFLA_CAIF_HSI_QHIGH_WATERMARK,
-	__IFLA_CAIF_HSI_QLOW_WATERMARK,
-	__IFLA_CAIF_HSI_MAX
-};
-
-struct cfhsi_ops *cfhsi_get_ops(void);
-
-#endif		/* CAIF_HSI_H_ */
-- 
cgit v1.2.3


From 71158bb1f2d2da61385c58fc1114e1a1c19984ba Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Wed, 30 Jun 2021 13:42:13 +0200
Subject: tcp: consistently disable header prediction for mptcp

The MPTCP receive path is hooked only into the TCP slow-path.
The DSS presence allows plain MPTCP traffic to hit that
consistently.

Since commit e1ff9e82e2ea ("net: mptcp: improve fallback to TCP"),
when an MPTCP socket falls back to TCP, it can hit the TCP receive
fast-path, and delay or stop triggering the event notification.

Address the issue explicitly disabling the header prediction
for MPTCP sockets.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/200
Fixes: e1ff9e82e2ea ("net: mptcp: improve fallback to TCP")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index e668f1bf780d..17df9b047ee4 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -686,6 +686,10 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
 
 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
 {
+	/* mptcp hooks are only on the slow path */
+	if (sk_is_mptcp((struct sock *)tp))
+		return;
+
 	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
 			       ntohl(TCP_FLAG_ACK) |
 			       snd_wnd);
-- 
cgit v1.2.3


From a23f89a9990684a0ca0cac4a2857c15d338ebe2d Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Thu, 1 Jul 2021 08:02:24 +0300
Subject: netfilter: conntrack: nf_ct_gre_keymap_flush() removal

nf_ct_gre_keymap_flush() is useless.
It is called from nf_conntrack_cleanup_net_list() only and tries to remove
nf_ct_gre_keymap entries from pernet gre keymap list. Though:
a) at this point the list should already be empty, all its entries were
deleted during the conntracks cleanup, because
nf_conntrack_cleanup_net_list() executes nf_ct_iterate_cleanup(kill_all)
before nf_conntrack_proto_pernet_fini():
 nf_conntrack_cleanup_net_list
  +- nf_ct_iterate_cleanup
  |   nf_ct_put
  |    nf_conntrack_put
  |     nf_conntrack_destroy
  |      destroy_conntrack
  |       destroy_gre_conntrack
  |        nf_ct_gre_keymap_destroy
  `- nf_conntrack_proto_pernet_fini
      nf_ct_gre_keymap_flush

b) Let's say we find that the keymap list is not empty. This means netns
still has a conntrack associated with gre, in which case we should not free
its memory, because this will lead to a double free and related crashes.
However I doubt it could have gone unnoticed for years, obviously
this does not happen in real life. So I think we can remove
both nf_ct_gre_keymap_flush() and nf_conntrack_proto_pernet_fini().

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_conntrack_core.h |  1 -
 net/netfilter/nf_conntrack_core.c         |  1 -
 net/netfilter/nf_conntrack_proto.c        |  7 -------
 net/netfilter/nf_conntrack_proto_gre.c    | 13 -------------
 4 files changed, 22 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 09f2efea0b97..13807ea94cd2 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -30,7 +30,6 @@ void nf_conntrack_cleanup_net(struct net *net);
 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list);
 
 void nf_conntrack_proto_pernet_init(struct net *net);
-void nf_conntrack_proto_pernet_fini(struct net *net);
 
 int nf_conntrack_proto_init(void);
 void nf_conntrack_proto_fini(void);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 96ba19fc8155..085a11f1eb43 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2457,7 +2457,6 @@ i_see_dead_people:
 	}
 
 	list_for_each_entry(net, net_exit_list, exit_list) {
-		nf_conntrack_proto_pernet_fini(net);
 		nf_conntrack_ecache_pernet_fini(net);
 		nf_conntrack_expect_pernet_fini(net);
 		free_percpu(net->ct.stat);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 55647409a9be..8f7a9837349c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -697,13 +697,6 @@ void nf_conntrack_proto_pernet_init(struct net *net)
 #endif
 }
 
-void nf_conntrack_proto_pernet_fini(struct net *net)
-{
-#ifdef CONFIG_NF_CT_PROTO_GRE
-	nf_ct_gre_keymap_flush(net);
-#endif
-}
-
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index db11e403d818..728eeb0aea87 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -55,19 +55,6 @@ static inline struct nf_gre_net *gre_pernet(struct net *net)
 	return &net->ct.nf_ct_proto.gre;
 }
 
-void nf_ct_gre_keymap_flush(struct net *net)
-{
-	struct nf_gre_net *net_gre = gre_pernet(net);
-	struct nf_ct_gre_keymap *km, *tmp;
-
-	spin_lock_bh(&keymap_lock);
-	list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) {
-		list_del_rcu(&km->list);
-		kfree_rcu(km, rcu);
-	}
-	spin_unlock_bh(&keymap_lock);
-}
-
 static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km,
 				const struct nf_conntrack_tuple *t)
 {
-- 
cgit v1.2.3


From 2580d3f40022642452dd8422bfb8c22e54cf84bb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 28 Jun 2021 15:34:28 +0200
Subject: xfrm: Fix RCU vs hash_resize_mutex lock inversion

xfrm_bydst_resize() calls synchronize_rcu() while holding
hash_resize_mutex. But then on PREEMPT_RT configurations,
xfrm_policy_lookup_bytype() may acquire that mutex while running in an
RCU read side critical section. This results in a deadlock.

In fact the scope of hash_resize_mutex is way beyond the purpose of
xfrm_policy_lookup_bytype() to just fetch a coherent and stable policy
for a given destination/direction, along with other details.

The lower level net->xfrm.xfrm_policy_lock, which among other things
protects per destination/direction references to policy entries, is
enough to serialize and benefit from priority inheritance against the
write side. As a bonus, it makes it officially a per network namespace
synchronization business where a policy table resize on namespace A
shouldn't block a policy lookup on namespace B.

Fixes: 77cc278f7b20 (xfrm: policy: Use sequence counters with associated lock)
Cc: stable@vger.kernel.org
Cc: Ahmed S. Darwish <a.darwish@linutronix.de>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Varad Gautam <varad.gautam@suse.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/netns/xfrm.h |  1 +
 net/xfrm/xfrm_policy.c   | 17 ++++++++---------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index e816b6a3ef2b..9b376b87bd54 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -74,6 +74,7 @@ struct netns_xfrm {
 #endif
 	spinlock_t		xfrm_state_lock;
 	seqcount_spinlock_t	xfrm_state_hash_generation;
+	seqcount_spinlock_t	xfrm_policy_hash_generation;
 
 	spinlock_t xfrm_policy_lock;
 	struct mutex xfrm_cfg_mutex;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index ce500f847b99..46a6d15b66d6 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -155,7 +155,6 @@ static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
 
 static struct kmem_cache *xfrm_dst_cache __ro_after_init;
-static __read_mostly seqcount_mutex_t xfrm_policy_hash_generation;
 
 static struct rhashtable xfrm_policy_inexact_table;
 static const struct rhashtable_params xfrm_pol_inexact_params;
@@ -585,7 +584,7 @@ static void xfrm_bydst_resize(struct net *net, int dir)
 		return;
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-	write_seqcount_begin(&xfrm_policy_hash_generation);
+	write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
 
 	odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table,
 				lockdep_is_held(&net->xfrm.xfrm_policy_lock));
@@ -596,7 +595,7 @@ static void xfrm_bydst_resize(struct net *net, int dir)
 	rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst);
 	net->xfrm.policy_bydst[dir].hmask = nhashmask;
 
-	write_seqcount_end(&xfrm_policy_hash_generation);
+	write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	synchronize_rcu();
@@ -1245,7 +1244,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 	} while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq));
 
 	spin_lock_bh(&net->xfrm.xfrm_policy_lock);
-	write_seqcount_begin(&xfrm_policy_hash_generation);
+	write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
 
 	/* make sure that we can insert the indirect policies again before
 	 * we start with destructive action.
@@ -1354,7 +1353,7 @@ static void xfrm_hash_rebuild(struct work_struct *work)
 
 out_unlock:
 	__xfrm_policy_inexact_flush(net);
-	write_seqcount_end(&xfrm_policy_hash_generation);
+	write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 
 	mutex_unlock(&hash_resize_mutex);
@@ -2095,9 +2094,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 	rcu_read_lock();
  retry:
 	do {
-		sequence = read_seqcount_begin(&xfrm_policy_hash_generation);
+		sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation);
 		chain = policy_hash_direct(net, daddr, saddr, family, dir);
-	} while (read_seqcount_retry(&xfrm_policy_hash_generation, sequence));
+	} while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence));
 
 	ret = NULL;
 	hlist_for_each_entry_rcu(pol, chain, bydst) {
@@ -2128,7 +2127,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
 	}
 
 skip_inexact:
-	if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence))
+	if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence))
 		goto retry;
 
 	if (ret && !xfrm_pol_hold_rcu(ret))
@@ -4084,6 +4083,7 @@ static int __net_init xfrm_net_init(struct net *net)
 	/* Initialize the per-net locks here */
 	spin_lock_init(&net->xfrm.xfrm_state_lock);
 	spin_lock_init(&net->xfrm.xfrm_policy_lock);
+	seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock);
 	mutex_init(&net->xfrm.xfrm_cfg_mutex);
 
 	rv = xfrm_statistics_init(net);
@@ -4128,7 +4128,6 @@ void __init xfrm_init(void)
 {
 	register_pernet_subsys(&xfrm_net_ops);
 	xfrm_dev_init();
-	seqcount_mutex_init(&xfrm_policy_hash_generation, &hash_resize_mutex);
 	xfrm_input_init();
 
 #ifdef CONFIG_XFRM_ESPINTCP
-- 
cgit v1.2.3


From 40fc3054b45820c28ea3c65e2c86d041dc244a8a Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vfedorenko@novek.ru>
Date: Fri, 2 Jul 2021 02:47:00 +0300
Subject: net: ipv6: fix return value of ip6_skb_dst_mtu

Commit 628a5c561890 ("[INET]: Add IP(V6)_PMTUDISC_RPOBE") introduced
ip6_skb_dst_mtu with return value of signed int which is inconsistent
with actually returned values. Also 2 users of this function actually
assign its value to unsigned int variable and only __xfrm6_output
assigns result of this function to signed variable but actually uses
as unsigned in further comparisons and calls. Change this function
to return unsigned int value.

Fixes: 628a5c561890 ("[INET]: Add IP(V6)_PMTUDISC_RPOBE")
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Vadim Fedorenko <vfedorenko@novek.ru>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 net/ipv6/xfrm6_output.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f14149df5a65..625a38ccb5d9 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -263,7 +263,7 @@ static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		 int (*output)(struct net *, struct sock *, struct sk_buff *));
 
-static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+static inline unsigned int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
 	int mtu;
 
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 57fa27c1cdf9..d0d280077721 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -49,7 +49,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_state *x = dst->xfrm;
-	int mtu;
+	unsigned int mtu;
 	bool toobig;
 
 #ifdef CONFIG_NETFILTER
-- 
cgit v1.2.3


From b2aae654a4794ef898ad33a179f341eb610f6b85 Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Mon, 5 Jul 2021 18:26:54 +0800
Subject: net: stmmac: add mutex lock to protect est parameters

Add a mutex lock to protect est structure parameters so that the
EST parameters can be updated by other threads.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 12 +++++++++++-
 include/linux/stmmac.h                          |  1 +
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 596626c71189..2e3cdf540168 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -796,14 +796,18 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 					 GFP_KERNEL);
 		if (!plat->est)
 			return -ENOMEM;
+
+		mutex_init(&priv->plat->est->lock);
 	} else {
 		memset(plat->est, 0, sizeof(*plat->est));
 	}
 
 	size = qopt->num_entries;
 
+	mutex_lock(&priv->plat->est->lock);
 	priv->plat->est->gcl_size = size;
 	priv->plat->est->enable = qopt->enable;
+	mutex_unlock(&priv->plat->est->lock);
 
 	for (i = 0; i < size; i++) {
 		s64 delta_ns = qopt->entries[i].interval;
@@ -834,6 +838,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 		priv->plat->est->gcl[i] = delta_ns | (gates << wid);
 	}
 
+	mutex_lock(&priv->plat->est->lock);
 	/* Adjust for real system time */
 	priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
 	current_time_ns = timespec64_to_ktime(current_time);
@@ -847,8 +852,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
 	priv->plat->est->ctr[1] = (u32)ctr;
 
-	if (fpe && !priv->dma_cap.fpesel)
+	if (fpe && !priv->dma_cap.fpesel) {
+		mutex_unlock(&priv->plat->est->lock);
 		return -EOPNOTSUPP;
+	}
 
 	/* Actual FPE register configuration will be done after FPE handshake
 	 * is success.
@@ -857,6 +864,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 
 	ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
 				   priv->plat->clk_ptp_rate);
+	mutex_unlock(&priv->plat->est->lock);
 	if (ret) {
 		netdev_err(priv->dev, "failed to configure EST\n");
 		goto disable;
@@ -872,9 +880,11 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	return 0;
 
 disable:
+	mutex_lock(&priv->plat->est->lock);
 	priv->plat->est->enable = false;
 	stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
 			     priv->plat->clk_ptp_rate);
+	mutex_unlock(&priv->plat->est->lock);
 
 	priv->plat->fpe_cfg->enable = false;
 	stmmac_fpe_configure(priv, priv->ioaddr,
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index d5ae621d66ba..09157b8a5810 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -115,6 +115,7 @@ struct stmmac_axi {
 
 #define EST_GCL		1024
 struct stmmac_est {
+	struct mutex lock;
 	int enable;
 	u32 btr_offset[2];
 	u32 btr[2];
-- 
cgit v1.2.3


From e9e3720002f61cd637a49ecafae77cac230eefae Mon Sep 17 00:00:00 2001
From: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Date: Mon, 5 Jul 2021 18:26:55 +0800
Subject: net: stmmac: ptp: update tas basetime after ptp adjust

After adjusting the ptp time, the Qbv base time may be the past time
of the new current time. dwmac5 hardware limited the base time cannot
be set as past time. This patch add a btr_reserve to store the base
time get from qopt, then calculate the base time and reset the Qbv
configuration after ptp time adjust.

Signed-off-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 41 +++++++++++++++++++++++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c  |  6 +++-
 include/linux/stmmac.h                           |  1 +
 3 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 4e86cdf2bc9f..580cc035536b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -62,7 +62,8 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	u32 sec, nsec;
 	u32 quotient, reminder;
 	int neg_adj = 0;
-	bool xmac;
+	bool xmac, est_rst = false;
+	int ret;
 
 	xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac;
 
@@ -75,10 +76,48 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	sec = quotient;
 	nsec = reminder;
 
+	/* If EST is enabled, disabled it before adjust ptp time. */
+	if (priv->plat->est && priv->plat->est->enable) {
+		est_rst = true;
+		mutex_lock(&priv->plat->est->lock);
+		priv->plat->est->enable = false;
+		stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
+				     priv->plat->clk_ptp_rate);
+		mutex_unlock(&priv->plat->est->lock);
+	}
+
 	spin_lock_irqsave(&priv->ptp_lock, flags);
 	stmmac_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj, xmac);
 	spin_unlock_irqrestore(&priv->ptp_lock, flags);
 
+	/* Caculate new basetime and re-configured EST after PTP time adjust. */
+	if (est_rst) {
+		struct timespec64 current_time, time;
+		ktime_t current_time_ns, basetime;
+		u64 cycle_time;
+
+		mutex_lock(&priv->plat->est->lock);
+		priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
+		current_time_ns = timespec64_to_ktime(current_time);
+		time.tv_nsec = priv->plat->est->btr_reserve[0];
+		time.tv_sec = priv->plat->est->btr_reserve[1];
+		basetime = timespec64_to_ktime(time);
+		cycle_time = priv->plat->est->ctr[1] * NSEC_PER_SEC +
+			     priv->plat->est->ctr[0];
+		time = stmmac_calc_tas_basetime(basetime,
+						current_time_ns,
+						cycle_time);
+
+		priv->plat->est->btr[0] = (u32)time.tv_nsec;
+		priv->plat->est->btr[1] = (u32)time.tv_sec;
+		priv->plat->est->enable = true;
+		ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est,
+					   priv->plat->clk_ptp_rate);
+		mutex_unlock(&priv->plat->est->lock);
+		if (ret)
+			netdev_err(priv->dev, "failed to configure EST\n");
+	}
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 2e3cdf540168..4f3b6437b114 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -739,7 +739,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 {
 	u32 size, wid = priv->dma_cap.estwid, dep = priv->dma_cap.estdep;
 	struct plat_stmmacenet_data *plat = priv->plat;
-	struct timespec64 time, current_time;
+	struct timespec64 time, current_time, qopt_time;
 	ktime_t current_time_ns;
 	bool fpe = false;
 	int i, ret = 0;
@@ -848,6 +848,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv,
 	priv->plat->est->btr[0] = (u32)time.tv_nsec;
 	priv->plat->est->btr[1] = (u32)time.tv_sec;
 
+	qopt_time = ktime_to_timespec64(qopt->base_time);
+	priv->plat->est->btr_reserve[0] = (u32)qopt_time.tv_nsec;
+	priv->plat->est->btr_reserve[1] = (u32)qopt_time.tv_sec;
+
 	ctr = qopt->cycle_time;
 	priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
 	priv->plat->est->ctr[1] = (u32)ctr;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 09157b8a5810..a6f03b36fc4f 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -117,6 +117,7 @@ struct stmmac_axi {
 struct stmmac_est {
 	struct mutex lock;
 	int enable;
+	u32 btr_reserve[2];
 	u32 btr_offset[2];
 	u32 btr[2];
 	u32 ctr[2];
-- 
cgit v1.2.3


From 1da4cd82dd180224503e745ccf3220e3490d8897 Mon Sep 17 00:00:00 2001
From: Ali Abdallah <ali.abdallah@suse.com>
Date: Thu, 27 May 2021 09:19:06 +0200
Subject: netfilter: conntrack: add new sysctl to disable RST check

This patch adds a new sysctl tcp_ignore_invalid_rst to disable marking
out of segments RSTs as INVALID.

Signed-off-by: Ali Abdallah <aabdallah@suse.de>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.rst |  6 ++++++
 include/net/netns/conntrack.h                    |  1 +
 net/netfilter/nf_conntrack_proto_tcp.c           |  6 +++++-
 net/netfilter/nf_conntrack_standalone.c          | 10 ++++++++++
 4 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 0467b30e4abe..d31ed6c1cb0d 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -110,6 +110,12 @@ nf_conntrack_tcp_be_liberal - BOOLEAN
 	Be conservative in what you do, be liberal in what you accept from others.
 	If it's non-zero, we mark only out of window RST segments as INVALID.
 
+nf_conntrack_tcp_ignore_invalid_rst - BOOLEAN
+	- 0 - disabled (default)
+	- 1 - enabled
+
+	If it's 1, we don't mark out of window RST segments as INVALID.
+
 nf_conntrack_tcp_loose - BOOLEAN
 	- 0 - disabled
 	- not 0 - enabled (default)
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index c3094b83a525..37e5300c7e5a 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -27,6 +27,7 @@ struct nf_tcp_net {
 	u8 tcp_loose;
 	u8 tcp_be_liberal;
 	u8 tcp_max_retrans;
+	u8 tcp_ignore_invalid_rst;
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	unsigned int offload_timeout;
 	unsigned int offload_pickup;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b8ff67671e93..3259416f2ea4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1068,7 +1068,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
 			if (seq == 0 && !nf_conntrack_tcp_established(ct))
 				break;
 
-			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack)) {
+			if (before(seq, ct->proto.tcp.seen[!dir].td_maxack) &&
+			    !tn->tcp_ignore_invalid_rst) {
 				/* Invalid RST  */
 				spin_unlock_bh(&ct->lock);
 				nf_ct_l4proto_log_invalid(skb, ct, state, "invalid rst");
@@ -1466,6 +1467,9 @@ void nf_conntrack_tcp_init_net(struct net *net)
 	 */
 	tn->tcp_be_liberal = 0;
 
+	/* If it's non-zero, we turn off RST sequence number check */
+	tn->tcp_ignore_invalid_rst = 0;
+
 	/* Max number of the retransmitted packets without receiving an (acceptable)
 	 * ACK from the destination. If this number is reached, a shorter timer
 	 * will be started.
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index f57a951c9b5e..214d9f9e499b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -579,6 +579,7 @@ enum nf_ct_sysctl_index {
 #endif
 	NF_SYSCTL_CT_PROTO_TCP_LOOSE,
 	NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
+	NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST,
 	NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
@@ -798,6 +799,14 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.extra1 	= SYSCTL_ZERO,
 		.extra2 	= SYSCTL_ONE,
 	},
+	[NF_SYSCTL_CT_PROTO_TCP_IGNORE_INVALID_RST] = {
+		.procname	= "nf_conntrack_tcp_ignore_invalid_rst",
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	[NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS] = {
 		.procname	= "nf_conntrack_tcp_max_retrans",
 		.maxlen		= sizeof(u8),
@@ -1004,6 +1013,7 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
 	XASSIGN(LOOSE, &tn->tcp_loose);
 	XASSIGN(LIBERAL, &tn->tcp_be_liberal);
 	XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
+	XASSIGN(IGNORE_INVALID_RST, &tn->tcp_ignore_invalid_rst);
 #undef XASSIGN
 
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
-- 
cgit v1.2.3


From 9a5605505d9c7dbfdb89cc29a8f5fc5cf9fd2334 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Mon, 5 Jul 2021 15:38:12 +0000
Subject: bonding: Add struct bond_ipesc to manage SA

bonding has been supporting ipsec offload.
When SA is added, bonding just passes SA to its own active real interface.
But it doesn't manage SA.
So, when events(add/del real interface, active real interface change, etc)
occur, bonding can't handle that well because It doesn't manage SA.
So some problems(panic, UAF, refcnt leak)occur.

In order to make it stable, it should manage SA.
That's the reason why struct bond_ipsec is added.
When a new SA is added to bonding interface, it is stored in the
bond_ipsec list. And the SA is passed to a current active real interface.
If events occur, it uses bond_ipsec data to handle these events.
bond->ipsec_list is protected by bond->ipsec_lock.

If a current active real interface is changed, the following logic works.
1. delete all SAs from old active real interface
2. Add all SAs to the new active real interface.
3. If a new active real interface doesn't support ipsec offload or SA's
option, it sets real_dev to NULL.

Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c | 139 ++++++++++++++++++++++++++++++++++------
 include/net/bonding.h           |   9 ++-
 2 files changed, 127 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f7b89743fab9..165fa55cfb38 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -401,6 +401,7 @@ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
 static int bond_ipsec_add_sa(struct xfrm_state *xs)
 {
 	struct net_device *bond_dev = xs->xso.dev;
+	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 	int err;
@@ -416,9 +417,6 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 		return -ENODEV;
 	}
 
-	xs->xso.real_dev = slave->dev;
-	bond->xs = xs;
-
 	if (!slave->dev->xfrmdev_ops ||
 	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
 	    netif_is_bond_master(slave->dev)) {
@@ -427,11 +425,63 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 		return -EINVAL;
 	}
 
+	ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC);
+	if (!ipsec) {
+		rcu_read_unlock();
+		return -ENOMEM;
+	}
+	xs->xso.real_dev = slave->dev;
+
 	err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs);
+	if (!err) {
+		ipsec->xs = xs;
+		INIT_LIST_HEAD(&ipsec->list);
+		spin_lock_bh(&bond->ipsec_lock);
+		list_add(&ipsec->list, &bond->ipsec_list);
+		spin_unlock_bh(&bond->ipsec_lock);
+	} else {
+		kfree(ipsec);
+	}
 	rcu_read_unlock();
 	return err;
 }
 
+static void bond_ipsec_add_sa_all(struct bonding *bond)
+{
+	struct net_device *bond_dev = bond->dev;
+	struct bond_ipsec *ipsec;
+	struct slave *slave;
+
+	rcu_read_lock();
+	slave = rcu_dereference(bond->curr_active_slave);
+	if (!slave)
+		goto out;
+
+	if (!slave->dev->xfrmdev_ops ||
+	    !slave->dev->xfrmdev_ops->xdo_dev_state_add ||
+	    netif_is_bond_master(slave->dev)) {
+		spin_lock_bh(&bond->ipsec_lock);
+		if (!list_empty(&bond->ipsec_list))
+			slave_warn(bond_dev, slave->dev,
+				   "%s: no slave xdo_dev_state_add\n",
+				   __func__);
+		spin_unlock_bh(&bond->ipsec_lock);
+		goto out;
+	}
+
+	spin_lock_bh(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		ipsec->xs->xso.real_dev = slave->dev;
+		if (slave->dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs)) {
+			slave_warn(bond_dev, slave->dev, "%s: failed to add SA\n", __func__);
+			ipsec->xs->xso.real_dev = NULL;
+		}
+	}
+	spin_unlock_bh(&bond->ipsec_lock);
+out:
+	rcu_read_unlock();
+}
+
 /**
  * bond_ipsec_del_sa - clear out this specific SA
  * @xs: pointer to transformer state struct
@@ -439,6 +489,7 @@ static int bond_ipsec_add_sa(struct xfrm_state *xs)
 static void bond_ipsec_del_sa(struct xfrm_state *xs)
 {
 	struct net_device *bond_dev = xs->xso.dev;
+	struct bond_ipsec *ipsec;
 	struct bonding *bond;
 	struct slave *slave;
 
@@ -452,7 +503,10 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 	if (!slave)
 		goto out;
 
-	xs->xso.real_dev = slave->dev;
+	if (!xs->xso.real_dev)
+		goto out;
+
+	WARN_ON(xs->xso.real_dev != slave->dev);
 
 	if (!slave->dev->xfrmdev_ops ||
 	    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
@@ -463,6 +517,48 @@ static void bond_ipsec_del_sa(struct xfrm_state *xs)
 
 	slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs);
 out:
+	spin_lock_bh(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (ipsec->xs == xs) {
+			list_del(&ipsec->list);
+			kfree(ipsec);
+			break;
+		}
+	}
+	spin_unlock_bh(&bond->ipsec_lock);
+	rcu_read_unlock();
+}
+
+static void bond_ipsec_del_sa_all(struct bonding *bond)
+{
+	struct net_device *bond_dev = bond->dev;
+	struct bond_ipsec *ipsec;
+	struct slave *slave;
+
+	rcu_read_lock();
+	slave = rcu_dereference(bond->curr_active_slave);
+	if (!slave) {
+		rcu_read_unlock();
+		return;
+	}
+
+	spin_lock_bh(&bond->ipsec_lock);
+	list_for_each_entry(ipsec, &bond->ipsec_list, list) {
+		if (!ipsec->xs->xso.real_dev)
+			continue;
+
+		if (!slave->dev->xfrmdev_ops ||
+		    !slave->dev->xfrmdev_ops->xdo_dev_state_delete ||
+		    netif_is_bond_master(slave->dev)) {
+			slave_warn(bond_dev, slave->dev,
+				   "%s: no slave xdo_dev_state_delete\n",
+				   __func__);
+		} else {
+			slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs);
+		}
+		ipsec->xs->xso.real_dev = NULL;
+	}
+	spin_unlock_bh(&bond->ipsec_lock);
 	rcu_read_unlock();
 }
 
@@ -474,22 +570,27 @@ out:
 static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
 {
 	struct net_device *bond_dev = xs->xso.dev;
-	struct bonding *bond = netdev_priv(bond_dev);
-	struct slave *curr_active = rcu_dereference(bond->curr_active_slave);
-	struct net_device *slave_dev = curr_active->dev;
+	struct net_device *real_dev;
+	struct slave *curr_active;
+	struct bonding *bond;
+
+	bond = netdev_priv(bond_dev);
+	curr_active = rcu_dereference(bond->curr_active_slave);
+	real_dev = curr_active->dev;
 
 	if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
 		return true;
 
-	if (!slave_dev->xfrmdev_ops ||
-	    !slave_dev->xfrmdev_ops->xdo_dev_offload_ok ||
-	    netif_is_bond_master(slave_dev)) {
-		slave_warn(bond_dev, slave_dev, "%s: no slave xdo_dev_offload_ok\n", __func__);
+	if (!xs->xso.real_dev)
+		return false;
+
+	if (!real_dev->xfrmdev_ops ||
+	    !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
+	    netif_is_bond_master(real_dev)) {
 		return false;
 	}
 
-	xs->xso.real_dev = slave_dev;
-	return slave_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+	return real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
 }
 
 static const struct xfrmdev_ops bond_xfrmdev_ops = {
@@ -1006,8 +1107,7 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
 		return;
 
 #ifdef CONFIG_XFRM_OFFLOAD
-	if (old_active && bond->xs)
-		bond_ipsec_del_sa(bond->xs);
+	bond_ipsec_del_sa_all(bond);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	if (new_active) {
@@ -1082,10 +1182,7 @@ void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
 	}
 
 #ifdef CONFIG_XFRM_OFFLOAD
-	if (new_active && bond->xs) {
-		xfrm_dev_state_flush(dev_net(bond->dev), bond->dev, true);
-		bond_ipsec_add_sa(bond->xs);
-	}
+	bond_ipsec_add_sa_all(bond);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	/* resend IGMP joins since active slave has changed or
@@ -3343,6 +3440,7 @@ static int bond_master_netdev_event(unsigned long event,
 		return bond_event_changename(event_bond);
 	case NETDEV_UNREGISTER:
 		bond_remove_proc_entry(event_bond);
+		xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true);
 		break;
 	case NETDEV_REGISTER:
 		bond_create_proc_entry(event_bond);
@@ -4910,7 +5008,8 @@ void bond_setup(struct net_device *bond_dev)
 #ifdef CONFIG_XFRM_OFFLOAD
 	/* set up xfrm device ops (only supported in active-backup right now) */
 	bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
-	bond->xs = NULL;
+	INIT_LIST_HEAD(&bond->ipsec_list);
+	spin_lock_init(&bond->ipsec_lock);
 #endif /* CONFIG_XFRM_OFFLOAD */
 
 	/* don't acquire bond device's netif_tx_lock when transmitting */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 15335732e166..625d9c72dee3 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -201,6 +201,11 @@ struct bond_up_slave {
  */
 #define BOND_LINK_NOCHANGE -1
 
+struct bond_ipsec {
+	struct list_head list;
+	struct xfrm_state *xs;
+};
+
 /*
  * Here are the locking policies for the two bonding locks:
  * Get rcu_read_lock when reading or RTNL when writing slave list.
@@ -249,7 +254,9 @@ struct bonding {
 #endif /* CONFIG_DEBUG_FS */
 	struct rtnl_link_stats64 bond_stats;
 #ifdef CONFIG_XFRM_OFFLOAD
-	struct xfrm_state *xs;
+	struct list_head ipsec_list;
+	/* protecting ipsec_list */
+	spinlock_t ipsec_lock;
 #endif /* CONFIG_XFRM_OFFLOAD */
 };
 
-- 
cgit v1.2.3


From d322957ebfb9c21c2c72b66680f7c3ccd724e081 Mon Sep 17 00:00:00 2001
From: Duncan Roe <duncan_roe@optusnet.com.au>
Date: Wed, 7 Jul 2021 10:57:51 +1000
Subject: netfilter: uapi: refer to nfnetlink_conntrack.h, not
 nf_conntrack_netlink.h

nf_conntrack_netlink.h does not exist, refer to nfnetlink_conntrack.h instead.

Signed-off-by: Duncan Roe <duncan_roe@optusnet.com.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_log.h   | 2 +-
 include/uapi/linux/netfilter/nfnetlink_queue.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_log.h b/include/uapi/linux/netfilter/nfnetlink_log.h
index 45c8d3b027e0..0af9c113d665 100644
--- a/include/uapi/linux/netfilter/nfnetlink_log.h
+++ b/include/uapi/linux/netfilter/nfnetlink_log.h
@@ -61,7 +61,7 @@ enum nfulnl_attr_type {
 	NFULA_HWTYPE,			/* hardware type */
 	NFULA_HWHEADER,			/* hardware header */
 	NFULA_HWLEN,			/* hardware header length */
-	NFULA_CT,                       /* nf_conntrack_netlink.h */
+	NFULA_CT,                       /* nfnetlink_conntrack.h */
 	NFULA_CT_INFO,                  /* enum ip_conntrack_info */
 	NFULA_VLAN,			/* nested attribute: packet vlan info */
 	NFULA_L2HDR,			/* full L2 header */
diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h
index bcb2cb5d40b9..aed90c4df0c8 100644
--- a/include/uapi/linux/netfilter/nfnetlink_queue.h
+++ b/include/uapi/linux/netfilter/nfnetlink_queue.h
@@ -51,11 +51,11 @@ enum nfqnl_attr_type {
 	NFQA_IFINDEX_PHYSOUTDEV,	/* __u32 ifindex */
 	NFQA_HWADDR,			/* nfqnl_msg_packet_hw */
 	NFQA_PAYLOAD,			/* opaque data payload */
-	NFQA_CT,			/* nf_conntrack_netlink.h */
+	NFQA_CT,			/* nfnetlink_conntrack.h */
 	NFQA_CT_INFO,			/* enum ip_conntrack_info */
 	NFQA_CAP_LEN,			/* __u32 length of captured packet */
 	NFQA_SKB_INFO,			/* __u32 skb meta information */
-	NFQA_EXP,			/* nf_conntrack_netlink.h */
+	NFQA_EXP,			/* nfnetlink_conntrack.h */
 	NFQA_UID,			/* __u32 sk uid */
 	NFQA_GID,			/* __u32 sk gid */
 	NFQA_SECCTX,			/* security context string */
-- 
cgit v1.2.3


From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 7 Jul 2021 15:38:47 -0700
Subject: bpf: Track subprog poke descriptors correctly and fix use-after-free

Subprograms are calling map_poke_track(), but on program release there is no
hook to call map_poke_untrack(). However, on program release, the aux memory
(and poke descriptor table) is freed even though we still have a reference to
it in the element list of the map aux data. When we run map_poke_run(), we then
end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run():

  [...]
  [  402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e
  [  402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337
  [  402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G          I       5.12.0+ #399
  [  402.824715] Call Trace:
  [  402.824719]  dump_stack+0x93/0xc2
  [  402.824727]  print_address_description.constprop.0+0x1a/0x140
  [  402.824736]  ? prog_array_map_poke_run+0xc2/0x34e
  [  402.824740]  ? prog_array_map_poke_run+0xc2/0x34e
  [  402.824744]  kasan_report.cold+0x7c/0xd8
  [  402.824752]  ? prog_array_map_poke_run+0xc2/0x34e
  [  402.824757]  prog_array_map_poke_run+0xc2/0x34e
  [  402.824765]  bpf_fd_array_map_update_elem+0x124/0x1a0
  [...]

The elements concerned are walked as follows:

    for (i = 0; i < elem->aux->size_poke_tab; i++) {
           poke = &elem->aux->poke_tab[i];
    [...]

The access to size_poke_tab is a 4 byte read, verified by checking offsets
in the KASAN dump:

  [  402.825004] The buggy address belongs to the object at ffff8881905a7800
                 which belongs to the cache kmalloc-1k of size 1024
  [  402.825008] The buggy address is located 320 bytes inside of
                 1024-byte region [ffff8881905a7800, ffff8881905a7c00)

The pahole output of bpf_prog_aux:

  struct bpf_prog_aux {
    [...]
    /* --- cacheline 5 boundary (320 bytes) --- */
    u32                        size_poke_tab;        /*   320     4 */
    [...]

In general, subprograms do not necessarily manage their own data structures.
For example, BTF func_info and linfo are just pointers to the main program
structure. This allows reference counting and cleanup to be done on the latter
which simplifies their management a bit. The aux->poke_tab struct, however,
did not follow this logic. The initial proposed fix for this use-after-free
bug further embedded poke data tracking into the subprogram with proper
reference counting. However, Daniel and Alexei questioned why we were treating
these objects special; I agree, its unnecessary. The fix here removes the per
subprogram poke table allocation and map tracking and instead simply points
the aux->poke_tab pointer at the main programs poke table. This way, map
tracking is simplified to the main program and we do not need to manage them
per subprogram.

This also means, bpf_prog_free_deferred(), which unwinds the program reference
counting and kfrees objects, needs to ensure that we don't try to double free
the poke_tab when free'ing the subprog structures. This is easily solved by
NULL'ing the poke_tab pointer. The second detail is to ensure that per
subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do
this, we add a pointer in the poke structure to point at the subprogram value
so JITs can easily check while walking the poke_tab structure if the current
entry belongs to the current program. The aux pointer is stable and therefore
suitable for such comparison. On the jit_subprogs() error path, we omit
cleaning up the poke->aux field because these are only ever referenced from
the JIT side, but on error we will never make it to the JIT, so its fine to
leave them dangling. Removing these pointers would complicate the error path
for no reason. However, we do need to untrack all poke descriptors from the
main program as otherwise they could race with the freeing of JIT memory from
the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to
subprograms") had an off-by-one on the subprogram instruction index range
check as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'.
However, subprog_end is the next subprogram's start instruction.

Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Co-developed-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com
---
 arch/x86/net/bpf_jit_comp.c |  3 +++
 include/linux/bpf.h         |  1 +
 kernel/bpf/core.c           |  8 +++++-
 kernel/bpf/verifier.c       | 60 ++++++++++++++++-----------------------------
 4 files changed, 32 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index e835164189f1..4b951458c9fc 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog)
 
 	for (i = 0; i < prog->aux->size_poke_tab; i++) {
 		poke = &prog->aux->poke_tab[i];
+		if (poke->aux && poke->aux != prog->aux)
+			continue;
+
 		WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable));
 
 		if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f309fc1509f2..e8e2b0393ca9 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor {
 	void *tailcall_target;
 	void *tailcall_bypass;
 	void *bypass_addr;
+	void *aux;
 	union {
 		struct {
 			struct bpf_map *map;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 034ad93a1ad7..9b1577498373 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 #endif
 	if (aux->dst_trampoline)
 		bpf_trampoline_put(aux->dst_trampoline);
-	for (i = 0; i < aux->func_cnt; i++)
+	for (i = 0; i < aux->func_cnt; i++) {
+		/* We can just unlink the subprog poke descriptor table as
+		 * it was originally linked to the main program and is also
+		 * released along with it.
+		 */
+		aux->func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(aux->func[i]);
+	}
 	if (aux->func_cnt) {
 		kfree(aux->func);
 		bpf_prog_unlock_free(aux->prog);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index be38bb930bf1..42a4063de7cd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			goto out_free;
 		func[i]->is_func = 1;
 		func[i]->aux->func_idx = i;
-		/* the btf and func_info will be freed only at prog->aux */
+		/* Below members will be freed only at prog->aux */
 		func[i]->aux->btf = prog->aux->btf;
 		func[i]->aux->func_info = prog->aux->func_info;
+		func[i]->aux->poke_tab = prog->aux->poke_tab;
+		func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
 
 		for (j = 0; j < prog->aux->size_poke_tab; j++) {
-			u32 insn_idx = prog->aux->poke_tab[j].insn_idx;
-			int ret;
+			struct bpf_jit_poke_descriptor *poke;
 
-			if (!(insn_idx >= subprog_start &&
-			      insn_idx <= subprog_end))
-				continue;
-
-			ret = bpf_jit_add_poke_descriptor(func[i],
-							  &prog->aux->poke_tab[j]);
-			if (ret < 0) {
-				verbose(env, "adding tail call poke descriptor failed\n");
-				goto out_free;
-			}
-
-			func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1;
-
-			map_ptr = func[i]->aux->poke_tab[ret].tail_call.map;
-			ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux);
-			if (ret < 0) {
-				verbose(env, "tracking tail call prog failed\n");
-				goto out_free;
-			}
+			poke = &prog->aux->poke_tab[j];
+			if (poke->insn_idx < subprog_end &&
+			    poke->insn_idx >= subprog_start)
+				poke->aux = func[i]->aux;
 		}
 
 		/* Use bpf_prog_F_tag to indicate functions in stack traces.
@@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 		cond_resched();
 	}
 
-	/* Untrack main program's aux structs so that during map_poke_run()
-	 * we will not stumble upon the unfilled poke descriptors; each
-	 * of the main program's poke descs got distributed across subprogs
-	 * and got tracked onto map, so we are sure that none of them will
-	 * be missed after the operation below
-	 */
-	for (i = 0; i < prog->aux->size_poke_tab; i++) {
-		map_ptr = prog->aux->poke_tab[i].tail_call.map;
-
-		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
-	}
-
 	/* at this point all bpf functions were successfully JITed
 	 * now populate all bpf_calls with correct addresses and
 	 * run last pass of JIT
@@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	bpf_prog_jit_attempt_done(prog);
 	return 0;
 out_free:
+	/* We failed JIT'ing, so at this point we need to unregister poke
+	 * descriptors from subprogs, so that kernel is not attempting to
+	 * patch it anymore as we're freeing the subprog JIT memory.
+	 */
+	for (i = 0; i < prog->aux->size_poke_tab; i++) {
+		map_ptr = prog->aux->poke_tab[i].tail_call.map;
+		map_ptr->ops->map_poke_untrack(map_ptr, prog->aux);
+	}
+	/* At this point we're guaranteed that poke descriptors are not
+	 * live anymore. We can just unlink its descriptor table as it's
+	 * released with the main prog.
+	 */
 	for (i = 0; i < env->subprog_cnt; i++) {
 		if (!func[i])
 			continue;
-
-		for (j = 0; j < func[i]->aux->size_poke_tab; j++) {
-			map_ptr = func[i]->aux->poke_tab[j].tail_call.map;
-			map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux);
-		}
+		func[i]->aux->poke_tab = NULL;
 		bpf_jit_free(func[i]);
 	}
 	kfree(func);
-- 
cgit v1.2.3


From 67a9c94317402b826fc3db32afc8f39336803d97 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 9 Jul 2021 17:35:18 +0000
Subject: net: validate lwtstate->data before returning from skb_tunnel_info()

skb_tunnel_info() returns pointer of lwtstate->data as ip_tunnel_info
type without validation. lwtstate->data can have various types such as
mpls_iptunnel_encap, etc and these are not compatible.
So skb_tunnel_info() should validate before returning that pointer.

Splat looks like:
BUG: KASAN: slab-out-of-bounds in vxlan_get_route+0x418/0x4b0 [vxlan]
Read of size 2 at addr ffff888106ec2698 by task ping/811

CPU: 1 PID: 811 Comm: ping Not tainted 5.13.0+ #1195
Call Trace:
 dump_stack_lvl+0x56/0x7b
 print_address_description.constprop.8.cold.13+0x13/0x2ee
 ? vxlan_get_route+0x418/0x4b0 [vxlan]
 ? vxlan_get_route+0x418/0x4b0 [vxlan]
 kasan_report.cold.14+0x83/0xdf
 ? vxlan_get_route+0x418/0x4b0 [vxlan]
 vxlan_get_route+0x418/0x4b0 [vxlan]
 [ ... ]
 vxlan_xmit_one+0x148b/0x32b0 [vxlan]
 [ ... ]
 vxlan_xmit+0x25c5/0x4780 [vxlan]
 [ ... ]
 dev_hard_start_xmit+0x1ae/0x6e0
 __dev_queue_xmit+0x1f39/0x31a0
 [ ... ]
 neigh_xmit+0x2f9/0x940
 mpls_xmit+0x911/0x1600 [mpls_iptunnel]
 lwtunnel_xmit+0x18f/0x450
 ip_finish_output2+0x867/0x2040
 [ ... ]

Fixes: 61adedf3e3f1 ("route: move lwtunnel state to dst_entry")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_metadata.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 56cb3c38569a..14efa0ded75d 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -45,7 +45,9 @@ skb_tunnel_info(const struct sk_buff *skb)
 		return &md_dst->u.tun_info;
 
 	dst = skb_dst(skb);
-	if (dst && dst->lwtstate)
+	if (dst && dst->lwtstate &&
+	    (dst->lwtstate->type == LWTUNNEL_ENCAP_IP ||
+	     dst->lwtstate->type == LWTUNNEL_ENCAP_IP6))
 		return lwt_tun_info(dst->lwtstate);
 
 	return NULL;
-- 
cgit v1.2.3


From 6787b7e350d3552651a3422d3d8980fbc8d65368 Mon Sep 17 00:00:00 2001
From: Jianguo Wu <wujianguo@chinatelecom.cn>
Date: Fri, 9 Jul 2021 17:20:49 -0700
Subject: mptcp: avoid processing packet if a subflow reset

If check_fully_established() causes a subflow reset, it should not
continue to process the packet in tcp_data_queue().
Add a return value to mptcp_incoming_options(), and return false if a
subflow has been reset, else return true. Then drop the packet in
tcp_data_queue()/tcp_rcv_state_process() if mptcp_incoming_options()
return false.

Fixes: d582484726c4 ("mptcp: fix fallback for MP_JOIN subflows")
Signed-off-by: Jianguo Wu <wujianguo@chinatelecom.cn>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/mptcp.h  |  5 +++--
 net/ipv4/tcp_input.c | 19 +++++++++++++++----
 net/mptcp/options.c  | 19 +++++++++++++------
 3 files changed, 31 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index cb580b06152f..8b5af683a818 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -105,7 +105,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
 			       unsigned int *size, unsigned int remaining,
 			       struct mptcp_out_options *opts);
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb);
 
 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
 			 struct mptcp_out_options *opts);
@@ -227,9 +227,10 @@ static inline bool mptcp_established_options(struct sock *sk,
 	return false;
 }
 
-static inline void mptcp_incoming_options(struct sock *sk,
+static inline bool mptcp_incoming_options(struct sock *sk,
 					  struct sk_buff *skb)
 {
+	return true;
 }
 
 static inline void mptcp_skb_ext_move(struct sk_buff *to,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a5a8d0a378b2..149ceb5c94ff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4247,6 +4247,9 @@ void tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
 	trace_tcp_receive_reset(sk);
 
+	/* mptcp can't tell us to ignore reset pkts,
+	 * so just ignore the return value of mptcp_incoming_options().
+	 */
 	if (sk_is_mptcp(sk))
 		mptcp_incoming_options(sk, skb);
 
@@ -4941,8 +4944,13 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 	bool fragstolen;
 	int eaten;
 
-	if (sk_is_mptcp(sk))
-		mptcp_incoming_options(sk, skb);
+	/* If a subflow has been reset, the packet should not continue
+	 * to be processed, drop the packet.
+	 */
+	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+		__kfree_skb(skb);
+		return;
+	}
 
 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
 		__kfree_skb(skb);
@@ -6523,8 +6531,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	case TCP_CLOSING:
 	case TCP_LAST_ACK:
 		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
-			if (sk_is_mptcp(sk))
-				mptcp_incoming_options(sk, skb);
+			/* If a subflow has been reset, the packet should not
+			 * continue to be processed, drop the packet.
+			 */
+			if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+				goto discard;
 			break;
 		}
 		fallthrough;
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index b5850afea343..4452455aef7f 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1035,7 +1035,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk,
 	return hmac == mp_opt->ahmac;
 }
 
-void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
+/* Return false if a subflow has been reset, else return true */
+bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 {
 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
@@ -1053,12 +1054,16 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 			__mptcp_check_push(subflow->conn, sk);
 		__mptcp_data_acked(subflow->conn);
 		mptcp_data_unlock(subflow->conn);
-		return;
+		return true;
 	}
 
 	mptcp_get_options(sk, skb, &mp_opt);
+
+	/* The subflow can be in close state only if check_fully_established()
+	 * just sent a reset. If so, tell the caller to ignore the current packet.
+	 */
 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
-		return;
+		return sk->sk_state != TCP_CLOSE;
 
 	if (mp_opt.fastclose &&
 	    msk->local_key == mp_opt.rcvr_key) {
@@ -1100,7 +1105,7 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (!mp_opt.dss)
-		return;
+		return true;
 
 	/* we can't wait for recvmsg() to update the ack_seq, otherwise
 	 * monodirectional flows will stuck
@@ -1119,12 +1124,12 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		    schedule_work(&msk->work))
 			sock_hold(subflow->conn);
 
-		return;
+		return true;
 	}
 
 	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
 	if (!mpext)
-		return;
+		return true;
 
 	memset(mpext, 0, sizeof(*mpext));
 
@@ -1153,6 +1158,8 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		if (mpext->csum_reqd)
 			mpext->csum = mp_opt.csum;
 	}
+
+	return true;
 }
 
 static void mptcp_set_rwin(const struct tcp_sock *tp)
-- 
cgit v1.2.3


From a5de4be0aaaa66a2fa98e8a33bdbed3bd0682804 Mon Sep 17 00:00:00 2001
From: Marek Behún <kabel@kernel.org>
Date: Sun, 11 Jul 2021 18:38:15 +0200
Subject: net: phy: marvell10g: fix differentiation of 88X3310 from 88X3340
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It seems that we cannot differentiate 88X3310 from 88X3340 by simply
looking at bit 3 of revision ID. This only works on revisions A0 and A1.
On revision B0, this bit is always 1.

Instead use the 3.d00d register for differentiation, since this register
contains information about number of ports on the device.

Fixes: 9885d016ffa9 ("net: phy: marvell10g: add separate structure for 88X3340")
Signed-off-by: Marek Behún <kabel@kernel.org>
Reported-by: Matteo Croce <mcroce@linux.microsoft.com>
Tested-by: Matteo Croce <mcroce@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell10g.c | 40 +++++++++++++++++++++++++++++++++++-----
 include/linux/marvell_phy.h  |  6 +-----
 2 files changed, 36 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index bbbc6ac8fa82..53a433442803 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -78,6 +78,11 @@ enum {
 	/* Temperature read register (88E2110 only) */
 	MV_PCS_TEMP		= 0x8042,
 
+	/* Number of ports on the device */
+	MV_PCS_PORT_INFO	= 0xd00d,
+	MV_PCS_PORT_INFO_NPORTS_MASK	= 0x0380,
+	MV_PCS_PORT_INFO_NPORTS_SHIFT	= 7,
+
 	/* These registers appear at 0x800X and 0xa00X - the 0xa00X control
 	 * registers appear to set themselves to the 0x800X when AN is
 	 * restarted, but status registers appear readable from either.
@@ -966,6 +971,30 @@ static const struct mv3310_chip mv2111_type = {
 #endif
 };
 
+static int mv3310_get_number_of_ports(struct phy_device *phydev)
+{
+	int ret;
+
+	ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_PCS_PORT_INFO);
+	if (ret < 0)
+		return ret;
+
+	ret &= MV_PCS_PORT_INFO_NPORTS_MASK;
+	ret >>= MV_PCS_PORT_INFO_NPORTS_SHIFT;
+
+	return ret + 1;
+}
+
+static int mv3310_match_phy_device(struct phy_device *phydev)
+{
+	return mv3310_get_number_of_ports(phydev) == 1;
+}
+
+static int mv3340_match_phy_device(struct phy_device *phydev)
+{
+	return mv3310_get_number_of_ports(phydev) == 4;
+}
+
 static int mv211x_match_phy_device(struct phy_device *phydev, bool has_5g)
 {
 	int val;
@@ -994,7 +1023,8 @@ static int mv2111_match_phy_device(struct phy_device *phydev)
 static struct phy_driver mv3310_drivers[] = {
 	{
 		.phy_id		= MARVELL_PHY_ID_88X3310,
-		.phy_id_mask	= MARVELL_PHY_ID_88X33X0_MASK,
+		.phy_id_mask	= MARVELL_PHY_ID_MASK,
+		.match_phy_device = mv3310_match_phy_device,
 		.name		= "mv88x3310",
 		.driver_data	= &mv3310_type,
 		.get_features	= mv3310_get_features,
@@ -1011,8 +1041,9 @@ static struct phy_driver mv3310_drivers[] = {
 		.set_loopback	= genphy_c45_loopback,
 	},
 	{
-		.phy_id		= MARVELL_PHY_ID_88X3340,
-		.phy_id_mask	= MARVELL_PHY_ID_88X33X0_MASK,
+		.phy_id		= MARVELL_PHY_ID_88X3310,
+		.phy_id_mask	= MARVELL_PHY_ID_MASK,
+		.match_phy_device = mv3340_match_phy_device,
 		.name		= "mv88x3340",
 		.driver_data	= &mv3340_type,
 		.get_features	= mv3310_get_features,
@@ -1069,8 +1100,7 @@ static struct phy_driver mv3310_drivers[] = {
 module_phy_driver(mv3310_drivers);
 
 static struct mdio_device_id __maybe_unused mv3310_tbl[] = {
-	{ MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_88X33X0_MASK },
-	{ MARVELL_PHY_ID_88X3340, MARVELL_PHY_ID_88X33X0_MASK },
+	{ MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E2110, MARVELL_PHY_ID_MASK },
 	{ },
 };
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index acee44b9db26..0f06c2287b52 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -22,14 +22,10 @@
 #define MARVELL_PHY_ID_88E1545		0x01410ea0
 #define MARVELL_PHY_ID_88E1548P		0x01410ec0
 #define MARVELL_PHY_ID_88E3016		0x01410e60
+#define MARVELL_PHY_ID_88X3310		0x002b09a0
 #define MARVELL_PHY_ID_88E2110		0x002b09b0
 #define MARVELL_PHY_ID_88X2222		0x01410f10
 
-/* PHY IDs and mask for Alaska 10G PHYs */
-#define MARVELL_PHY_ID_88X33X0_MASK	0xfffffff8
-#define MARVELL_PHY_ID_88X3310		0x002b09a0
-#define MARVELL_PHY_ID_88X3340		0x002b09a8
-
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
 
-- 
cgit v1.2.3


From 0238bcf80e972f2ce25d767e54f89a9e49773f6e Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Mon, 5 Jul 2021 22:42:47 +0300
Subject: ASoC: ti: davinci-mcasp: Add support for the OMAP4 version of McASP

There is a single McASP on OMAP4 (and OMAP5) which is configured to only
support DIT playback mode on a single serializer.

Add 0x200 offset to DAT port address as the TRM suggests it.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
Link: https://lore.kernel.org/r/20210705194249.2385-4-peter.ujfalusi@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/davinci_asp.h |  1 +
 sound/soc/ti/Kconfig                      |  1 +
 sound/soc/ti/davinci-mcasp.c              | 26 +++++++++++++++++++++++---
 3 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/davinci_asp.h b/include/linux/platform_data/davinci_asp.h
index 5d1fb0d78a22..76b13ef67562 100644
--- a/include/linux/platform_data/davinci_asp.h
+++ b/include/linux/platform_data/davinci_asp.h
@@ -96,6 +96,7 @@ enum {
 	MCASP_VERSION_2,	/* DA8xx/OMAPL1x */
 	MCASP_VERSION_3,        /* TI81xx/AM33xx */
 	MCASP_VERSION_4,	/* DRA7xxx */
+	MCASP_VERSION_OMAP,	/* OMAP4/5 */
 };
 
 enum mcbsp_clk_input_pin {
diff --git a/sound/soc/ti/Kconfig b/sound/soc/ti/Kconfig
index 698d7bc84dcf..1d9fe3fca193 100644
--- a/sound/soc/ti/Kconfig
+++ b/sound/soc/ti/Kconfig
@@ -35,6 +35,7 @@ config SND_SOC_DAVINCI_MCASP
 	  various Texas Instruments SoCs like:
 	  - daVinci devices
 	  - Sitara line of SoCs (AM335x, AM438x, etc)
+	  - OMAP4
 	  - DRA7x devices
 	  - Keystone devices
 	  - K3 devices (am654, j721e)
diff --git a/sound/soc/ti/davinci-mcasp.c b/sound/soc/ti/davinci-mcasp.c
index 64ec6d485834..56a19eeec5c7 100644
--- a/sound/soc/ti/davinci-mcasp.c
+++ b/sound/soc/ti/davinci-mcasp.c
@@ -1794,6 +1794,12 @@ static struct davinci_mcasp_pdata dra7_mcasp_pdata = {
 	.version = MCASP_VERSION_4,
 };
 
+static struct davinci_mcasp_pdata omap_mcasp_pdata = {
+	.tx_dma_offset = 0x200,
+	.rx_dma_offset = 0,
+	.version = MCASP_VERSION_OMAP,
+};
+
 static const struct of_device_id mcasp_dt_ids[] = {
 	{
 		.compatible = "ti,dm646x-mcasp-audio",
@@ -1811,6 +1817,10 @@ static const struct of_device_id mcasp_dt_ids[] = {
 		.compatible = "ti,dra7-mcasp-audio",
 		.data = &dra7_mcasp_pdata,
 	},
+	{
+		.compatible = "ti,omap4-mcasp-audio",
+		.data = &omap_mcasp_pdata,
+	},
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, mcasp_dt_ids);
@@ -2350,10 +2360,17 @@ static int davinci_mcasp_probe(struct platform_device *pdev)
 
 	dma_data = &mcasp->dma_data[SNDRV_PCM_STREAM_PLAYBACK];
 	dma_data->filter_data = "tx";
-	if (dat)
+	if (dat) {
 		dma_data->addr = dat->start;
-	else
+		/*
+		 * According to the TRM there should be 0x200 offset added to
+		 * the DAT port address
+		 */
+		if (mcasp->version == MCASP_VERSION_OMAP)
+			dma_data->addr += davinci_mcasp_txdma_offset(mcasp->pdata);
+	} else {
 		dma_data->addr = mem->start + davinci_mcasp_txdma_offset(mcasp->pdata);
+	}
 
 
 	/* RX is not valid in DIT mode */
@@ -2418,7 +2435,10 @@ static int davinci_mcasp_probe(struct platform_device *pdev)
 		ret = edma_pcm_platform_register(&pdev->dev);
 		break;
 	case PCM_SDMA:
-		ret = sdma_pcm_platform_register(&pdev->dev, "tx", "rx");
+		if (mcasp->op_mode == DAVINCI_MCASP_IIS_MODE)
+			ret = sdma_pcm_platform_register(&pdev->dev, "tx", "rx");
+		else
+			ret = sdma_pcm_platform_register(&pdev->dev, "tx", NULL);
 		break;
 	case PCM_UDMA:
 		ret = udma_pcm_platform_register(&pdev->dev);
-- 
cgit v1.2.3


From bc619cfc6278c87b4e310f9db9f45abc263220e8 Mon Sep 17 00:00:00 2001
From: Brent Lu <brent.lu@intel.com>
Date: Fri, 25 Jun 2021 15:50:41 -0500
Subject: ASoC: SOF: add a helper to get topology configured bclk

Add helper function sof_dai_ssp_bclk() to get the BCLK frequency
configured by topology.

Reviewed-by: Bard Liao <bard.liao@intel.com>
Signed-off-by: Brent Lu <brent.lu@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://lore.kernel.org/r/20210625205042.65181-4-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof.h       |  1 +
 sound/soc/sof/sof-audio.c | 42 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/sound/sof.h b/include/sound/sof.h
index 502ed9b8d6a1..6a1cd8e783d8 100644
--- a/include/sound/sof.h
+++ b/include/sound/sof.h
@@ -101,5 +101,6 @@ struct sof_dev_desc {
 };
 
 int sof_dai_get_mclk(struct snd_soc_pcm_runtime *rtd);
+int sof_dai_get_bclk(struct snd_soc_pcm_runtime *rtd);
 
 #endif
diff --git a/sound/soc/sof/sof-audio.c b/sound/soc/sof/sof-audio.c
index 510883cd9107..989912f2b739 100644
--- a/sound/soc/sof/sof-audio.c
+++ b/sound/soc/sof/sof-audio.c
@@ -433,11 +433,10 @@ struct snd_sof_dai *snd_sof_find_dai(struct snd_soc_component *scomp,
 	return NULL;
 }
 
-/*
- * Helper to get SSP MCLK from a pcm_runtime.
- * Return 0 if not exist.
- */
-int sof_dai_get_mclk(struct snd_soc_pcm_runtime *rtd)
+#define SOF_DAI_CLK_INTEL_SSP_MCLK	0
+#define SOF_DAI_CLK_INTEL_SSP_BCLK	1
+
+static int sof_dai_get_clk(struct snd_soc_pcm_runtime *rtd, int clk_type)
 {
 	struct snd_soc_component *component =
 		snd_soc_rtdcom_lookup(rtd, SOF_AUDIO_PCM_DRV_NAME);
@@ -450,16 +449,45 @@ int sof_dai_get_mclk(struct snd_soc_pcm_runtime *rtd)
 
 	switch (dai->dai_config->type) {
 	case SOF_DAI_INTEL_SSP:
-		return dai->dai_config->ssp.mclk_rate;
+		switch (clk_type) {
+		case SOF_DAI_CLK_INTEL_SSP_MCLK:
+			return dai->dai_config->ssp.mclk_rate;
+		case SOF_DAI_CLK_INTEL_SSP_BCLK:
+			return dai->dai_config->ssp.bclk_rate;
+		default:
+			dev_err(rtd->dev, "fail to get SSP clk %d rate\n",
+				clk_type);
+			return -EINVAL;
+		}
+		break;
 	default:
 		/* not yet implemented for platforms other than the above */
-		dev_err(rtd->dev, "mclk for dai_config->type %d not supported yet!\n",
+		dev_err(rtd->dev, "DAI type %d not supported yet!\n",
 			dai->dai_config->type);
 		return -EINVAL;
 	}
 }
+
+/*
+ * Helper to get SSP MCLK from a pcm_runtime.
+ * Return 0 if not exist.
+ */
+int sof_dai_get_mclk(struct snd_soc_pcm_runtime *rtd)
+{
+	return sof_dai_get_clk(rtd, SOF_DAI_CLK_INTEL_SSP_MCLK);
+}
 EXPORT_SYMBOL(sof_dai_get_mclk);
 
+/*
+ * Helper to get SSP BCLK from a pcm_runtime.
+ * Return 0 if not exist.
+ */
+int sof_dai_get_bclk(struct snd_soc_pcm_runtime *rtd)
+{
+	return sof_dai_get_clk(rtd, SOF_DAI_CLK_INTEL_SSP_BCLK);
+}
+EXPORT_SYMBOL(sof_dai_get_bclk);
+
 /*
  * SOF Driver enumeration.
  */
-- 
cgit v1.2.3


From c3e67ad6f5a2c698a055fb297c6f9962f5145edd Mon Sep 17 00:00:00 2001
From: Biju Das <biju.das.jz@bp.renesas.com>
Date: Sat, 26 Jun 2021 09:13:39 +0100
Subject: dt-bindings: clock: r9a07g044-cpg: Update clock/reset definitions

Update clock and reset definitions as per RZ/G2L_clock_list_r02_02.xlsx
and RZ/G2L HW(Rev.0.50) manual.

Update {GIC,IA55,SCIF} clock and reset entries in the CPG driver, and
separate reset from module clocks in order to handle them efficiently.

Update the SCIF0 clock and reset index in the SoC DTSI.

Signed-off-by: Biju Das <biju.das.jz@bp.renesas.com>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://lore.kernel.org/r/20210626081344.5783-6-biju.das.jz@bp.renesas.com
Link: https://lore.kernel.org/r/20210626081344.5783-7-biju.das.jz@bp.renesas.com
Link: https://lore.kernel.org/r/20210626081344.5783-8-biju.das.jz@bp.renesas.com
[geert: Squashed 3 commits]
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 arch/arm64/boot/dts/renesas/r9a07g044.dtsi |   4 +-
 drivers/clk/renesas/r9a07g044-cpg.c        |  62 ++++----
 drivers/clk/renesas/renesas-rzg2l-cpg.c    |  59 ++++----
 drivers/clk/renesas/renesas-rzg2l-cpg.h    |  36 +++--
 include/dt-bindings/clock/r9a07g044-cpg.h  | 236 ++++++++++++++++++++++-------
 5 files changed, 278 insertions(+), 119 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/boot/dts/renesas/r9a07g044.dtsi b/arch/arm64/boot/dts/renesas/r9a07g044.dtsi
index 734c8adeceba..01482d227506 100644
--- a/arch/arm64/boot/dts/renesas/r9a07g044.dtsi
+++ b/arch/arm64/boot/dts/renesas/r9a07g044.dtsi
@@ -82,10 +82,10 @@
 				     <GIC_SPI 384 IRQ_TYPE_LEVEL_HIGH>;
 			interrupt-names = "eri", "rxi", "txi",
 					  "bri", "dri", "tei";
-			clocks = <&cpg CPG_MOD R9A07G044_CLK_SCIF0>;
+			clocks = <&cpg CPG_MOD R9A07G044_SCIF0_CLK_PCK>;
 			clock-names = "fck";
 			power-domains = <&cpg>;
-			resets = <&cpg R9A07G044_CLK_SCIF0>;
+			resets = <&cpg R9A07G044_SCIF0_RST_SYSTEM_N>;
 			status = "disabled";
 		};
 
diff --git a/drivers/clk/renesas/r9a07g044-cpg.c b/drivers/clk/renesas/r9a07g044-cpg.c
index 70df4feda417..ae24e0397d3c 100644
--- a/drivers/clk/renesas/r9a07g044-cpg.c
+++ b/drivers/clk/renesas/r9a07g044-cpg.c
@@ -84,34 +84,40 @@ static const struct cpg_core_clk r9a07g044_core_clks[] __initconst = {
 };
 
 static struct rzg2l_mod_clk r9a07g044_mod_clks[] = {
-	DEF_MOD("gic",		R9A07G044_CLK_GIC600,
-				R9A07G044_CLK_P1,
-				0x514, BIT(0), (BIT(0) | BIT(1))),
-	DEF_MOD("ia55",		R9A07G044_CLK_IA55,
-				R9A07G044_CLK_P1,
-				0x518, (BIT(0) | BIT(1)), BIT(0)),
-	DEF_MOD("scif0",	R9A07G044_CLK_SCIF0,
-				R9A07G044_CLK_P0,
-				0x584, BIT(0), BIT(0)),
-	DEF_MOD("scif1",	R9A07G044_CLK_SCIF1,
-				R9A07G044_CLK_P0,
-				0x584, BIT(1), BIT(1)),
-	DEF_MOD("scif2",	R9A07G044_CLK_SCIF2,
-				R9A07G044_CLK_P0,
-				0x584, BIT(2), BIT(2)),
-	DEF_MOD("scif3",	R9A07G044_CLK_SCIF3,
-				R9A07G044_CLK_P0,
-				0x584, BIT(3), BIT(3)),
-	DEF_MOD("scif4",	R9A07G044_CLK_SCIF4,
-				R9A07G044_CLK_P0,
-				0x584, BIT(4), BIT(4)),
-	DEF_MOD("sci0",		R9A07G044_CLK_SCI0,
-				R9A07G044_CLK_P0,
-				0x588, BIT(0), BIT(0)),
+	DEF_MOD("gic",		R9A07G044_GIC600_GICCLK, R9A07G044_CLK_P1,
+				0x514, 0),
+	DEF_MOD("ia55_pclk",	R9A07G044_IA55_PCLK, R9A07G044_CLK_P2,
+				0x518, 0),
+	DEF_MOD("ia55_clk",	R9A07G044_IA55_CLK, R9A07G044_CLK_P1,
+				0x518, 1),
+	DEF_MOD("scif0",	R9A07G044_SCIF0_CLK_PCK, R9A07G044_CLK_P0,
+				0x584, 0),
+	DEF_MOD("scif1",	R9A07G044_SCIF1_CLK_PCK, R9A07G044_CLK_P0,
+				0x584, 1),
+	DEF_MOD("scif2",	R9A07G044_SCIF2_CLK_PCK, R9A07G044_CLK_P0,
+				0x584, 2),
+	DEF_MOD("scif3",	R9A07G044_SCIF3_CLK_PCK, R9A07G044_CLK_P0,
+				0x584, 3),
+	DEF_MOD("scif4",	R9A07G044_SCIF4_CLK_PCK, R9A07G044_CLK_P0,
+				0x584, 4),
+	DEF_MOD("sci0",		R9A07G044_SCI0_CLKP, R9A07G044_CLK_P0,
+				0x588, 0),
+};
+
+static struct rzg2l_reset r9a07g044_resets[] = {
+	DEF_RST(R9A07G044_GIC600_GICRESET_N, 0x814, 0),
+	DEF_RST(R9A07G044_GIC600_DBG_GICRESET_N, 0x814, 1),
+	DEF_RST(R9A07G044_IA55_RESETN, 0x818, 0),
+	DEF_RST(R9A07G044_SCIF0_RST_SYSTEM_N, 0x884, 0),
+	DEF_RST(R9A07G044_SCIF1_RST_SYSTEM_N, 0x884, 1),
+	DEF_RST(R9A07G044_SCIF2_RST_SYSTEM_N, 0x884, 2),
+	DEF_RST(R9A07G044_SCIF3_RST_SYSTEM_N, 0x884, 3),
+	DEF_RST(R9A07G044_SCIF4_RST_SYSTEM_N, 0x884, 4),
+	DEF_RST(R9A07G044_SCI0_RST, 0x888, 0),
 };
 
 static const unsigned int r9a07g044_crit_mod_clks[] __initconst = {
-	MOD_CLK_BASE + R9A07G044_CLK_GIC600,
+	MOD_CLK_BASE + R9A07G044_GIC600_GICCLK,
 };
 
 const struct rzg2l_cpg_info r9a07g044_cpg_info = {
@@ -128,5 +134,9 @@ const struct rzg2l_cpg_info r9a07g044_cpg_info = {
 	/* Module Clocks */
 	.mod_clks = r9a07g044_mod_clks,
 	.num_mod_clks = ARRAY_SIZE(r9a07g044_mod_clks),
-	.num_hw_mod_clks = R9A07G044_CLK_MIPI_DSI_PIN + 1,
+	.num_hw_mod_clks = R9A07G044_TSU_PCLK + 1,
+
+	/* Resets */
+	.resets = r9a07g044_resets,
+	.num_resets = ARRAY_SIZE(r9a07g044_resets),
 };
diff --git a/drivers/clk/renesas/renesas-rzg2l-cpg.c b/drivers/clk/renesas/renesas-rzg2l-cpg.c
index 1941f13bd922..e7c59af2a1d8 100644
--- a/drivers/clk/renesas/renesas-rzg2l-cpg.c
+++ b/drivers/clk/renesas/renesas-rzg2l-cpg.c
@@ -47,9 +47,9 @@
 #define SDIV(val)		DIV_RSMASK(val, 0, 0x7)
 
 #define CLK_ON_R(reg)		(reg)
-#define CLK_MON_R(reg)		(0x680 - 0x500 + (reg))
-#define CLK_RST_R(reg)		(0x800 - 0x500 + (reg))
-#define CLK_MRST_R(reg)		(0x980 - 0x500 + (reg))
+#define CLK_MON_R(reg)		(0x180 + (reg))
+#define CLK_RST_R(reg)		(reg)
+#define CLK_MRST_R(reg)		(0x180 + (reg))
 
 #define GET_REG_OFFSET(val)		((val >> 20) & 0xfff)
 #define GET_REG_SAMPLL_CLK1(val)	((val >> 22) & 0xfff)
@@ -78,6 +78,7 @@ struct rzg2l_cpg_priv {
 	struct clk **clks;
 	unsigned int num_core_clks;
 	unsigned int num_mod_clks;
+	unsigned int num_resets;
 	unsigned int last_dt_core_clk;
 
 	struct raw_notifier_head notifiers;
@@ -315,15 +316,13 @@ fail:
  *
  * @hw: handle between common and hardware-specific interfaces
  * @off: register offset
- * @onoff: ON/MON bits
- * @reset: reset bits
+ * @bit: ON/MON bit
  * @priv: CPG/MSTP private data
  */
 struct mstp_clock {
 	struct clk_hw hw;
 	u16 off;
-	u8 onoff;
-	u8 reset;
+	u8 bit;
 	struct rzg2l_cpg_priv *priv;
 };
 
@@ -337,6 +336,7 @@ static int rzg2l_mod_clock_endisable(struct clk_hw *hw, bool enable)
 	struct device *dev = priv->dev;
 	unsigned long flags;
 	unsigned int i;
+	u32 bitmask = BIT(clock->bit);
 	u32 value;
 
 	if (!clock->off) {
@@ -349,9 +349,9 @@ static int rzg2l_mod_clock_endisable(struct clk_hw *hw, bool enable)
 	spin_lock_irqsave(&priv->rmw_lock, flags);
 
 	if (enable)
-		value = (clock->onoff << 16) | clock->onoff;
+		value = (bitmask << 16) | bitmask;
 	else
-		value = clock->onoff << 16;
+		value = bitmask << 16;
 	writel(value, priv->base + CLK_ON_R(reg));
 
 	spin_unlock_irqrestore(&priv->rmw_lock, flags);
@@ -360,7 +360,7 @@ static int rzg2l_mod_clock_endisable(struct clk_hw *hw, bool enable)
 		return 0;
 
 	for (i = 1000; i > 0; --i) {
-		if (((readl(priv->base + CLK_MON_R(reg))) & clock->onoff))
+		if (((readl(priv->base + CLK_MON_R(reg))) & bitmask))
 			break;
 		cpu_relax();
 	}
@@ -388,6 +388,7 @@ static int rzg2l_mod_clock_is_enabled(struct clk_hw *hw)
 {
 	struct mstp_clock *clock = to_mod_clock(hw);
 	struct rzg2l_cpg_priv *priv = clock->priv;
+	u32 bitmask = BIT(clock->bit);
 	u32 value;
 
 	if (!clock->off) {
@@ -397,7 +398,7 @@ static int rzg2l_mod_clock_is_enabled(struct clk_hw *hw)
 
 	value = readl(priv->base + CLK_MON_R(clock->off));
 
-	return !(value & clock->onoff);
+	return !(value & bitmask);
 }
 
 static const struct clk_ops rzg2l_mod_clock_ops = {
@@ -457,8 +458,7 @@ rzg2l_cpg_register_mod_clk(const struct rzg2l_mod_clk *mod,
 	init.num_parents = 1;
 
 	clock->off = mod->off;
-	clock->onoff = mod->onoff;
-	clock->reset = mod->reset;
+	clock->bit = mod->bit;
 	clock->priv = priv;
 	clock->hw.init = &init;
 
@@ -483,12 +483,11 @@ static int rzg2l_cpg_reset(struct reset_controller_dev *rcdev,
 {
 	struct rzg2l_cpg_priv *priv = rcdev_to_priv(rcdev);
 	const struct rzg2l_cpg_info *info = priv->info;
-	unsigned int reg = info->mod_clks[id].off;
-	u32 dis = info->mod_clks[id].reset;
+	unsigned int reg = info->resets[id].off;
+	u32 dis = BIT(info->resets[id].bit);
 	u32 we = dis << 16;
 
-	dev_dbg(rcdev->dev, "reset name:%s id:%ld offset:0x%x\n",
-		info->mod_clks[id].name, id, CLK_RST_R(reg));
+	dev_dbg(rcdev->dev, "reset id:%ld offset:0x%x\n", id, CLK_RST_R(reg));
 
 	/* Reset module */
 	writel(we, priv->base + CLK_RST_R(reg));
@@ -507,11 +506,10 @@ static int rzg2l_cpg_assert(struct reset_controller_dev *rcdev,
 {
 	struct rzg2l_cpg_priv *priv = rcdev_to_priv(rcdev);
 	const struct rzg2l_cpg_info *info = priv->info;
-	unsigned int reg = info->mod_clks[id].off;
-	u32 value = info->mod_clks[id].reset << 16;
+	unsigned int reg = info->resets[id].off;
+	u32 value = BIT(info->resets[id].bit) << 16;
 
-	dev_dbg(rcdev->dev, "assert name:%s id:%ld offset:0x%x\n",
-		info->mod_clks[id].name, id, CLK_RST_R(reg));
+	dev_dbg(rcdev->dev, "assert id:%ld offset:0x%x\n", id, CLK_RST_R(reg));
 
 	writel(value, priv->base + CLK_RST_R(reg));
 	return 0;
@@ -522,12 +520,12 @@ static int rzg2l_cpg_deassert(struct reset_controller_dev *rcdev,
 {
 	struct rzg2l_cpg_priv *priv = rcdev_to_priv(rcdev);
 	const struct rzg2l_cpg_info *info = priv->info;
-	unsigned int reg = info->mod_clks[id].off;
-	u32 dis = info->mod_clks[id].reset;
+	unsigned int reg = info->resets[id].off;
+	u32 dis = BIT(info->resets[id].bit);
 	u32 value = (dis << 16) | dis;
 
-	dev_dbg(rcdev->dev, "deassert name:%s id:%ld offset:0x%x\n",
-		info->mod_clks[id].name, id, CLK_RST_R(reg));
+	dev_dbg(rcdev->dev, "deassert id:%ld offset:0x%x\n", id,
+		CLK_RST_R(reg));
 
 	writel(value, priv->base + CLK_RST_R(reg));
 	return 0;
@@ -538,8 +536,8 @@ static int rzg2l_cpg_status(struct reset_controller_dev *rcdev,
 {
 	struct rzg2l_cpg_priv *priv = rcdev_to_priv(rcdev);
 	const struct rzg2l_cpg_info *info = priv->info;
-	unsigned int reg = info->mod_clks[id].off;
-	u32 bitmask = info->mod_clks[id].reset;
+	unsigned int reg = info->resets[id].off;
+	u32 bitmask = BIT(info->resets[id].bit);
 
 	return !(readl(priv->base + CLK_MRST_R(reg)) & bitmask);
 }
@@ -554,9 +552,11 @@ static const struct reset_control_ops rzg2l_cpg_reset_ops = {
 static int rzg2l_cpg_reset_xlate(struct reset_controller_dev *rcdev,
 				 const struct of_phandle_args *reset_spec)
 {
+	struct rzg2l_cpg_priv *priv = rcdev_to_priv(rcdev);
+	const struct rzg2l_cpg_info *info = priv->info;
 	unsigned int id = reset_spec->args[0];
 
-	if (id >= rcdev->nr_resets) {
+	if (id >= rcdev->nr_resets || !info->resets[id].off) {
 		dev_err(rcdev->dev, "Invalid reset index %u\n", id);
 		return -EINVAL;
 	}
@@ -571,7 +571,7 @@ static int rzg2l_cpg_reset_controller_register(struct rzg2l_cpg_priv *priv)
 	priv->rcdev.dev = priv->dev;
 	priv->rcdev.of_reset_n_cells = 1;
 	priv->rcdev.of_xlate = rzg2l_cpg_reset_xlate;
-	priv->rcdev.nr_resets = priv->num_mod_clks;
+	priv->rcdev.nr_resets = priv->num_resets;
 
 	return devm_reset_controller_register(priv->dev, &priv->rcdev);
 }
@@ -699,6 +699,7 @@ static int __init rzg2l_cpg_probe(struct platform_device *pdev)
 	priv->clks = clks;
 	priv->num_core_clks = info->num_total_core_clks;
 	priv->num_mod_clks = info->num_hw_mod_clks;
+	priv->num_resets = info->num_resets;
 	priv->last_dt_core_clk = info->last_dt_core_clk;
 
 	for (i = 0; i < nclks; i++)
diff --git a/drivers/clk/renesas/renesas-rzg2l-cpg.h b/drivers/clk/renesas/renesas-rzg2l-cpg.h
index a6a3bade1985..63695280ce8b 100644
--- a/drivers/clk/renesas/renesas-rzg2l-cpg.h
+++ b/drivers/clk/renesas/renesas-rzg2l-cpg.h
@@ -77,26 +77,40 @@ enum clk_types {
  * @id: clock index in array containing all Core and Module Clocks
  * @parent: id of parent clock
  * @off: register offset
- * @onoff: ON/MON bits
- * @reset: reset bits
+ * @bit: ON/MON bit
  */
 struct rzg2l_mod_clk {
 	const char *name;
 	unsigned int id;
 	unsigned int parent;
 	u16 off;
-	u8 onoff;
-	u8 reset;
+	u8 bit;
 };
 
-#define DEF_MOD(_name, _id, _parent, _off, _onoff, _reset)	\
-	[_id] = { \
+#define DEF_MOD(_name, _id, _parent, _off, _bit)	\
+	{ \
 		.name = _name, \
-		.id = MOD_CLK_BASE + _id, \
+		.id = MOD_CLK_BASE + (_id), \
 		.parent = (_parent), \
 		.off = (_off), \
-		.onoff = (_onoff), \
-		.reset = (_reset) \
+		.bit = (_bit), \
+	}
+
+/**
+ * struct rzg2l_reset - Reset definitions
+ *
+ * @off: register offset
+ * @bit: reset bit
+ */
+struct rzg2l_reset {
+	u16 off;
+	u8 bit;
+};
+
+#define DEF_RST(_id, _off, _bit)	\
+	[_id] = { \
+		.off = (_off), \
+		.bit = (_bit) \
 	}
 
 /**
@@ -127,6 +141,10 @@ struct rzg2l_cpg_info {
 	unsigned int num_mod_clks;
 	unsigned int num_hw_mod_clks;
 
+	/* Resets */
+	const struct rzg2l_reset *resets;
+	unsigned int num_resets;
+
 	/* Critical Module Clocks that should not be disabled */
 	const unsigned int *crit_mod_clks;
 	unsigned int num_crit_mod_clks;
diff --git a/include/dt-bindings/clock/r9a07g044-cpg.h b/include/dt-bindings/clock/r9a07g044-cpg.h
index 1d8986563fc5..0728ad07ff7a 100644
--- a/include/dt-bindings/clock/r9a07g044-cpg.h
+++ b/include/dt-bindings/clock/r9a07g044-cpg.h
@@ -32,58 +32,188 @@
 #define R9A07G044_OSCCLK		21
 
 /* R9A07G044 Module Clocks */
-#define R9A07G044_CLK_GIC600		0
-#define R9A07G044_CLK_IA55		1
-#define R9A07G044_CLK_SYC		2
-#define R9A07G044_CLK_DMAC		3
-#define R9A07G044_CLK_SYSC		4
-#define R9A07G044_CLK_MTU		5
-#define R9A07G044_CLK_GPT		6
-#define R9A07G044_CLK_ETH0		7
-#define R9A07G044_CLK_ETH1		8
-#define R9A07G044_CLK_I2C0		9
-#define R9A07G044_CLK_I2C1		10
-#define R9A07G044_CLK_I2C2		11
-#define R9A07G044_CLK_I2C3		12
-#define R9A07G044_CLK_SCIF0		13
-#define R9A07G044_CLK_SCIF1		14
-#define R9A07G044_CLK_SCIF2		15
-#define R9A07G044_CLK_SCIF3		16
-#define R9A07G044_CLK_SCIF4		17
-#define R9A07G044_CLK_SCI0		18
-#define R9A07G044_CLK_SCI1		19
-#define R9A07G044_CLK_GPIO		20
-#define R9A07G044_CLK_SDHI0		21
-#define R9A07G044_CLK_SDHI1		22
-#define R9A07G044_CLK_USB0		23
-#define R9A07G044_CLK_USB1		24
-#define R9A07G044_CLK_CANFD		25
-#define R9A07G044_CLK_SSI0		26
-#define R9A07G044_CLK_SSI1		27
-#define R9A07G044_CLK_SSI2		28
-#define R9A07G044_CLK_SSI3		29
-#define R9A07G044_CLK_MHU		30
-#define R9A07G044_CLK_OSTM0		31
-#define R9A07G044_CLK_OSTM1		32
-#define R9A07G044_CLK_OSTM2		33
-#define R9A07G044_CLK_WDT0		34
-#define R9A07G044_CLK_WDT1		35
-#define R9A07G044_CLK_WDT2		36
-#define R9A07G044_CLK_WDT_PON		37
-#define R9A07G044_CLK_GPU		38
-#define R9A07G044_CLK_ISU		39
-#define R9A07G044_CLK_H264		40
-#define R9A07G044_CLK_CRU		41
-#define R9A07G044_CLK_MIPI_DSI		42
-#define R9A07G044_CLK_LCDC		43
-#define R9A07G044_CLK_SRC		44
-#define R9A07G044_CLK_RSPI0		45
-#define R9A07G044_CLK_RSPI1		46
-#define R9A07G044_CLK_RSPI2		47
-#define R9A07G044_CLK_ADC		48
-#define R9A07G044_CLK_TSU_PCLK		49
-#define R9A07G044_CLK_SPI		50
-#define R9A07G044_CLK_MIPI_DSI_V	51
-#define R9A07G044_CLK_MIPI_DSI_PIN	52
+#define R9A07G044_CA55_SCLK		0
+#define R9A07G044_CA55_PCLK		1
+#define R9A07G044_CA55_ATCLK		2
+#define R9A07G044_CA55_GICCLK		3
+#define R9A07G044_CA55_PERICLK		4
+#define R9A07G044_CA55_ACLK		5
+#define R9A07G044_CA55_TSCLK		6
+#define R9A07G044_GIC600_GICCLK		7
+#define R9A07G044_IA55_CLK		8
+#define R9A07G044_IA55_PCLK		9
+#define R9A07G044_MHU_PCLK		10
+#define R9A07G044_SYC_CNT_CLK		11
+#define R9A07G044_DMAC_ACLK		12
+#define R9A07G044_DMAC_PCLK		13
+#define R9A07G044_OSTM0_PCLK		14
+#define R9A07G044_OSTM1_PCLK		15
+#define R9A07G044_OSTM2_PCLK		16
+#define R9A07G044_MTU_X_MCK_MTU3	17
+#define R9A07G044_POE3_CLKM_POE		18
+#define R9A07G044_GPT_PCLK		19
+#define R9A07G044_POEG_A_CLKP		20
+#define R9A07G044_POEG_B_CLKP		21
+#define R9A07G044_POEG_C_CLKP		22
+#define R9A07G044_POEG_D_CLKP		23
+#define R9A07G044_WDT0_PCLK		24
+#define R9A07G044_WDT0_CLK		25
+#define R9A07G044_WDT1_PCLK		26
+#define R9A07G044_WDT1_CLK		27
+#define R9A07G044_WDT2_PCLK		28
+#define R9A07G044_WDT2_CLK		29
+#define R9A07G044_SPI_CLK2		30
+#define R9A07G044_SPI_CLK		31
+#define R9A07G044_SDHI0_IMCLK		32
+#define R9A07G044_SDHI0_IMCLK2		33
+#define R9A07G044_SDHI0_CLK_HS		34
+#define R9A07G044_SDHI0_ACLK		35
+#define R9A07G044_SDHI1_IMCLK		36
+#define R9A07G044_SDHI1_IMCLK2		37
+#define R9A07G044_SDHI1_CLK_HS		38
+#define R9A07G044_SDHI1_ACLK		39
+#define R9A07G044_GPU_CLK		40
+#define R9A07G044_GPU_AXI_CLK		41
+#define R9A07G044_GPU_ACE_CLK		42
+#define R9A07G044_ISU_ACLK		43
+#define R9A07G044_ISU_PCLK		44
+#define R9A07G044_H264_CLK_A		45
+#define R9A07G044_H264_CLK_P		46
+#define R9A07G044_CRU_SYSCLK		47
+#define R9A07G044_CRU_VCLK		48
+#define R9A07G044_CRU_PCLK		49
+#define R9A07G044_CRU_ACLK		50
+#define R9A07G044_MIPI_DSI_PLLCLK	51
+#define R9A07G044_MIPI_DSI_SYSCLK	52
+#define R9A07G044_MIPI_DSI_ACLK		53
+#define R9A07G044_MIPI_DSI_PCLK		54
+#define R9A07G044_MIPI_DSI_VCLK		55
+#define R9A07G044_MIPI_DSI_LPCLK	56
+#define R9A07G044_LCDC_CLK_A		57
+#define R9A07G044_LCDC_CLK_P		58
+#define R9A07G044_LCDC_CLK_D		59
+#define R9A07G044_SSI0_PCLK2		60
+#define R9A07G044_SSI0_PCLK_SFR		61
+#define R9A07G044_SSI1_PCLK2		62
+#define R9A07G044_SSI1_PCLK_SFR		63
+#define R9A07G044_SSI2_PCLK2		64
+#define R9A07G044_SSI2_PCLK_SFR		65
+#define R9A07G044_SSI3_PCLK2		66
+#define R9A07G044_SSI3_PCLK_SFR		67
+#define R9A07G044_SRC_CLKP		68
+#define R9A07G044_USB_U2H0_HCLK		69
+#define R9A07G044_USB_U2H1_HCLK		70
+#define R9A07G044_USB_U2P_EXR_CPUCLK	71
+#define R9A07G044_USB_PCLK		72
+#define R9A07G044_ETH0_CLK_AXI		73
+#define R9A07G044_ETH0_CLK_CHI		74
+#define R9A07G044_ETH1_CLK_AXI		75
+#define R9A07G044_ETH1_CLK_CHI		76
+#define R9A07G044_I2C0_PCLK		77
+#define R9A07G044_I2C1_PCLK		78
+#define R9A07G044_I2C2_PCLK		79
+#define R9A07G044_I2C3_PCLK		80
+#define R9A07G044_SCIF0_CLK_PCK		81
+#define R9A07G044_SCIF1_CLK_PCK		82
+#define R9A07G044_SCIF2_CLK_PCK		83
+#define R9A07G044_SCIF3_CLK_PCK		84
+#define R9A07G044_SCIF4_CLK_PCK		85
+#define R9A07G044_SCI0_CLKP		86
+#define R9A07G044_SCI1_CLKP		87
+#define R9A07G044_IRDA_CLKP		88
+#define R9A07G044_RSPI0_CLKB		89
+#define R9A07G044_RSPI1_CLKB		90
+#define R9A07G044_RSPI2_CLKB		91
+#define R9A07G044_CANFD_PCLK		92
+#define R9A07G044_GPIO_HCLK		93
+#define R9A07G044_ADC_ADCLK		94
+#define R9A07G044_ADC_PCLK		95
+#define R9A07G044_TSU_PCLK		96
+
+/* R9A07G044 Resets */
+#define R9A07G044_CA55_RST_1_0		0
+#define R9A07G044_CA55_RST_1_1		1
+#define R9A07G044_CA55_RST_3_0		2
+#define R9A07G044_CA55_RST_3_1		3
+#define R9A07G044_CA55_RST_4		4
+#define R9A07G044_CA55_RST_5		5
+#define R9A07G044_CA55_RST_6		6
+#define R9A07G044_CA55_RST_7		7
+#define R9A07G044_CA55_RST_8		8
+#define R9A07G044_CA55_RST_9		9
+#define R9A07G044_CA55_RST_10		10
+#define R9A07G044_CA55_RST_11		11
+#define R9A07G044_CA55_RST_12		12
+#define R9A07G044_GIC600_GICRESET_N	13
+#define R9A07G044_GIC600_DBG_GICRESET_N	14
+#define R9A07G044_IA55_RESETN		15
+#define R9A07G044_MHU_RESETN		16
+#define R9A07G044_DMAC_ARESETN		17
+#define R9A07G044_DMAC_RST_ASYNC	18
+#define R9A07G044_SYC_RESETN		19
+#define R9A07G044_OSTM0_PRESETZ		20
+#define R9A07G044_OSTM1_PRESETZ		21
+#define R9A07G044_OSTM2_PRESETZ		22
+#define R9A07G044_MTU_X_PRESET_MTU3	23
+#define R9A07G044_POE3_RST_M_REG	24
+#define R9A07G044_GPT_RST_C		25
+#define R9A07G044_POEG_A_RST		26
+#define R9A07G044_POEG_B_RST		27
+#define R9A07G044_POEG_C_RST		28
+#define R9A07G044_POEG_D_RST		29
+#define R9A07G044_WDT0_PRESETN		30
+#define R9A07G044_WDT1_PRESETN		31
+#define R9A07G044_WDT2_PRESETN		32
+#define R9A07G044_SPI_RST		33
+#define R9A07G044_SDHI0_IXRST		34
+#define R9A07G044_SDHI1_IXRST		35
+#define R9A07G044_GPU_RESETN		36
+#define R9A07G044_GPU_AXI_RESETN	37
+#define R9A07G044_GPU_ACE_RESETN	38
+#define R9A07G044_ISU_ARESETN		39
+#define R9A07G044_ISU_PRESETN		40
+#define R9A07G044_H264_X_RESET_VCP	41
+#define R9A07G044_H264_CP_PRESET_P	42
+#define R9A07G044_CRU_CMN_RSTB		43
+#define R9A07G044_CRU_PRESETN		44
+#define R9A07G044_CRU_ARESETN		45
+#define R9A07G044_MIPI_DSI_CMN_RSTB	46
+#define R9A07G044_MIPI_DSI_ARESET_N	47
+#define R9A07G044_MIPI_DSI_PRESET_N	48
+#define R9A07G044_LCDC_RESET_N		49
+#define R9A07G044_SSI0_RST_M2_REG	50
+#define R9A07G044_SSI1_RST_M2_REG	51
+#define R9A07G044_SSI2_RST_M2_REG	52
+#define R9A07G044_SSI3_RST_M2_REG	53
+#define R9A07G044_SRC_RST		54
+#define R9A07G044_USB_U2H0_HRESETN	55
+#define R9A07G044_USB_U2H1_HRESETN	56
+#define R9A07G044_USB_U2P_EXL_SYSRST	57
+#define R9A07G044_USB_PRESETN		58
+#define R9A07G044_ETH0_RST_HW_N		59
+#define R9A07G044_ETH1_RST_HW_N		60
+#define R9A07G044_I2C0_MRST		61
+#define R9A07G044_I2C1_MRST		62
+#define R9A07G044_I2C2_MRST		63
+#define R9A07G044_I2C3_MRST		64
+#define R9A07G044_SCIF0_RST_SYSTEM_N	65
+#define R9A07G044_SCIF1_RST_SYSTEM_N	66
+#define R9A07G044_SCIF2_RST_SYSTEM_N	67
+#define R9A07G044_SCIF3_RST_SYSTEM_N	68
+#define R9A07G044_SCIF4_RST_SYSTEM_N	69
+#define R9A07G044_SCI0_RST		70
+#define R9A07G044_SCI1_RST		71
+#define R9A07G044_IRDA_RST		72
+#define R9A07G044_RSPI0_RST		73
+#define R9A07G044_RSPI1_RST		74
+#define R9A07G044_RSPI2_RST		75
+#define R9A07G044_CANFD_RSTP_N		76
+#define R9A07G044_CANFD_RSTC_N		77
+#define R9A07G044_GPIO_RSTN		78
+#define R9A07G044_GPIO_PORT_RESETN	79
+#define R9A07G044_GPIO_SPARE_RESETN	80
+#define R9A07G044_ADC_PRESETN		81
+#define R9A07G044_ADC_ADRST_N		82
+#define R9A07G044_TSU_PRESETN		83
 
 #endif /* __DT_BINDINGS_CLOCK_R9A07G044_CPG_H__ */
-- 
cgit v1.2.3


From 514305ee0a1dade95c6ff1eb5735de5a329d1f89 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Date: Thu, 1 Jul 2021 12:41:27 +0200
Subject: RDMA/irdma: Make spdxcheck.py happy

Commit 48d6b3336a9f ("RDMA/irdma: Add ABI definitions") adds
./include/uapi/rdma/irdma-abi.h with an additional unneeded closing
bracket at the end of the SPDX-License-Identifier line.

Hence, ./scripts/spdxcheck.py complains:

  include/uapi/rdma/irdma-abi.h: 1:77 Syntax error: )

Remove that closing bracket to make spdxcheck.py happy.

Fixes: 48d6b3336a9f ("RDMA/irdma: Add ABI definitions")
Link: https://lore.kernel.org/r/20210701104127.1877-1-lukas.bulwahn@gmail.com
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
Acked-by: Tatyana Nikolova <tatyana.e.nikolova@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/uapi/rdma/irdma-abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h
index 26b638a7ad97..a7085e092d34 100644
--- a/include/uapi/rdma/irdma-abi.h
+++ b/include/uapi/rdma/irdma-abi.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB) */
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR Linux-OpenIB */
 /*
  * Copyright (c) 2006 - 2021 Intel Corporation.  All rights reserved.
  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
-- 
cgit v1.2.3


From 79789db03fdd77510cfb35cb4b3bd52b6c50c901 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Mon, 12 Jul 2021 16:32:07 +0100
Subject: mm: Make copy_huge_page() always available

Rewrite copy_huge_page() and move it into mm/util.c so it's always
available.  Fixes an exposure of uninitialised memory on configurations
with HUGETLB and UFFD enabled and MIGRATION disabled.

Fixes: 8cc5fcbb5be8 ("mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/migrate.h |  5 -----
 include/linux/mm.h      |  1 +
 mm/migrate.c            | 48 ------------------------------------------------
 mm/util.c               | 10 ++++++++++
 4 files changed, 11 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9b7b7cd3bae9..23dadf7aeba8 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -51,7 +51,6 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
-extern void copy_huge_page(struct page *dst, struct page *src);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,10 +76,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 {
 	return -ENOSYS;
 }
-
-static inline void copy_huge_page(struct page *dst, struct page *src)
-{
-}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57453dba41b9..7ca22e6e694a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -906,6 +906,7 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
+void copy_huge_page(struct page *dst, struct page *src);
 
 /*
  * Compound pages have a destructor function.  Provide a
diff --git a/mm/migrate.c b/mm/migrate.c
index 23cbd9de030b..34a9ad3e0a4f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -536,54 +536,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
 	return MIGRATEPAGE_SUCCESS;
 }
 
-/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page.  We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
-				int nr_pages)
-{
-	int i;
-	struct page *dst_base = dst;
-	struct page *src_base = src;
-
-	for (i = 0; i < nr_pages; ) {
-		cond_resched();
-		copy_highpage(dst, src);
-
-		i++;
-		dst = mem_map_next(dst, dst_base, i);
-		src = mem_map_next(src, src_base, i);
-	}
-}
-
-void copy_huge_page(struct page *dst, struct page *src)
-{
-	int i;
-	int nr_pages;
-
-	if (PageHuge(src)) {
-		/* hugetlbfs page */
-		struct hstate *h = page_hstate(src);
-		nr_pages = pages_per_huge_page(h);
-
-		if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
-			__copy_gigantic_page(dst, src, nr_pages);
-			return;
-		}
-	} else {
-		/* thp page */
-		BUG_ON(!PageTransHuge(src));
-		nr_pages = thp_nr_pages(src);
-	}
-
-	for (i = 0; i < nr_pages; i++) {
-		cond_resched();
-		copy_highpage(dst + i, src + i);
-	}
-}
-
 /*
  * Copy the page to its new location
  */
diff --git a/mm/util.c b/mm/util.c
index 99c6cc77de9e..9043d03750a7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -731,6 +731,16 @@ int __page_mapcount(struct page *page)
 }
 EXPORT_SYMBOL_GPL(__page_mapcount);
 
+void copy_huge_page(struct page *dst, struct page *src)
+{
+	unsigned i, nr = compound_nr(src);
+
+	for (i = 0; i < nr; i++) {
+		cond_resched();
+		copy_highpage(nth_page(dst, i), nth_page(src, i));
+	}
+}
+
 int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
 int sysctl_overcommit_ratio __read_mostly = 50;
 unsigned long sysctl_overcommit_kbytes __read_mostly;
-- 
cgit v1.2.3


From 52f83955aaf91b22f46765b007b4404ce85b2133 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 12 Jul 2021 14:08:01 +0100
Subject: firmware: arm_scmi: Fix kernel doc warnings

Kernel doc validation script is unhappy and complains with the below set
of warnings.

 | Function parameter or member 'fast_switch_possible' not described in 'scmi_perf_proto_ops'
 | Function parameter or member 'power_scale_mw_get' not described in 'scmi_perf_proto_ops'
 | cannot understand function prototype: 'struct scmi_sensor_reading '
 | cannot understand function prototype: 'struct scmi_range_attrs '
 | cannot understand function prototype: 'struct scmi_sensor_axis_info '
 | cannot understand function prototype: 'struct scmi_sensor_intervals_info '

Fix them adding appropriate documents or missing keywords.

Link: https://lore.kernel.org/r/20210712130801.2436492-2-sudeep.holla@arm.com
Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scmi_protocol.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 79d0a1237e6c..80e781c51ddc 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -101,6 +101,10 @@ struct scmi_clk_proto_ops {
  *	to sustained performance level mapping
  * @est_power_get: gets the estimated power cost for a given performance domain
  *	at a given frequency
+ * @fast_switch_possible: indicates if fast DVFS switching is possible or not
+ *	for a given device
+ * @power_scale_mw_get: indicates if the power values provided are in milliWatts
+ *	or in some other (abstract) scale
  */
 struct scmi_perf_proto_ops {
 	int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain,
@@ -153,7 +157,7 @@ struct scmi_power_proto_ops {
 };
 
 /**
- * scmi_sensor_reading  - represent a timestamped read
+ * struct scmi_sensor_reading  - represent a timestamped read
  *
  * Used by @reading_get_timestamped method.
  *
@@ -167,7 +171,7 @@ struct scmi_sensor_reading {
 };
 
 /**
- * scmi_range_attrs  - specifies a sensor or axis values' range
+ * struct scmi_range_attrs  - specifies a sensor or axis values' range
  * @min_range: The minimum value which can be represented by the sensor/axis.
  * @max_range: The maximum value which can be represented by the sensor/axis.
  */
@@ -177,7 +181,7 @@ struct scmi_range_attrs {
 };
 
 /**
- * scmi_sensor_axis_info  - describes one sensor axes
+ * struct scmi_sensor_axis_info  - describes one sensor axes
  * @id: The axes ID.
  * @type: Axes type. Chosen amongst one of @enum scmi_sensor_class.
  * @scale: Power-of-10 multiplier applied to the axis unit.
@@ -205,8 +209,8 @@ struct scmi_sensor_axis_info {
 };
 
 /**
- * scmi_sensor_intervals_info  - describes number and type of available update
- * intervals
+ * struct scmi_sensor_intervals_info  - describes number and type of available
+ *	update intervals
  * @segmented: Flag for segmented intervals' representation. When True there
  *	       will be exactly 3 intervals in @desc, with each entry
  *	       representing a member of a segment in this order:
-- 
cgit v1.2.3


From 5ff6319d46cee22c9cd6f39a377e32c444f9a7b0 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Mon, 12 Jul 2021 11:27:48 +0100
Subject: firmware: arm_scpi: Fix kernel doc warnings

Kernel doc validation script is unhappy and complains with the below set
of warnings.

 | Function parameter or member 'device_domain_id' not described in 'scpi_ops'
 | Function parameter or member 'get_transition_latency' not described in 'scpi_ops'
 | Function parameter or member 'add_opps_to_device' not described in 'scpi_ops'
 | Function parameter or member 'sensor_get_capability' not described in 'scpi_ops'
 | Function parameter or member 'sensor_get_info' not described in 'scpi_ops'
 | Function parameter or member 'sensor_get_value' not described in 'scpi_ops'
 | Function parameter or member 'device_get_power_state' not described in 'scpi_ops'
 | Function parameter or member 'device_set_power_state' not described in 'scpi_ops'

Fix them adding appropriate documents or missing keywords.

Link: https://lore.kernel.org/r/20210712130801.2436492-1-sudeep.holla@arm.com
Reviewed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/linux/scpi_protocol.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h
index afbf8037d8db..d2176a56828a 100644
--- a/include/linux/scpi_protocol.h
+++ b/include/linux/scpi_protocol.h
@@ -51,6 +51,14 @@ struct scpi_sensor_info {
  *	OPP is an index to the list return by @dvfs_get_info
  * @dvfs_get_info: returns the DVFS capabilities of the given power
  *	domain. It includes the OPP list and the latency information
+ * @device_domain_id: gets the scpi domain id for a given device
+ * @get_transition_latency: gets the DVFS transition latency for a given device
+ * @add_opps_to_device: adds all the OPPs for a given device
+ * @sensor_get_capability: get the list of capabilities for the sensors
+ * @sensor_get_info: get the information of the specified sensor
+ * @sensor_get_value: gets the current value of the sensor
+ * @device_get_power_state: gets the power state of a power domain
+ * @device_set_power_state: sets the power state of a power domain
  */
 struct scpi_ops {
 	u32 (*get_version)(void);
-- 
cgit v1.2.3


From f336a009f8e3dd0b47168565584608a4a62cbbb4 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Tue, 13 Jul 2021 13:09:06 -0500
Subject: math-emu: Fix fall-through warning

Fix the following fallthrough warning (nds32-randconfig with GCC):

include/math-emu/op-common.h:332:8: warning: this statement may fall through [-Wimplicit-fallthrough=]

Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/lkml/60edca25.k00ut905IFBjPyt5%25lkp@intel.com/
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
---
 include/math-emu/op-common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/math-emu/op-common.h b/include/math-emu/op-common.h
index 143568d64b20..4b57bbba588a 100644
--- a/include/math-emu/op-common.h
+++ b/include/math-emu/op-common.h
@@ -338,7 +338,7 @@ do {									     \
 	FP_SET_EXCEPTION(FP_EX_INVALID | FP_EX_INVALID_ISI);		     \
 	break;								     \
       }									     \
-    /* FALLTHRU */							     \
+    fallthrough;							     \
 									     \
   case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_NORMAL):			     \
   case _FP_CLS_COMBINE(FP_CLS_INF,FP_CLS_ZERO):				     \
-- 
cgit v1.2.3


From d1d488d813703618f0dd93f0e4c4a05928114aa8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 14 Jul 2021 15:47:50 +0200
Subject: fs: add vfs_parse_fs_param_source() helper

Add a simple helper that filesystems can use in their parameter parser
to parse the "source" parameter. A few places open-coded this function
and that already caused a bug in the cgroup v1 parser that we fixed.
Let's make it harder to get this wrong by introducing a helper which
performs all necessary checks.

Link: https://syzkaller.appspot.com/bug?id=6312526aba5beae046fdae8f00399f87aab48b12
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fs_context.c            | 54 ++++++++++++++++++++++++++++++----------------
 include/linux/fs_context.h |  2 ++
 kernel/cgroup/cgroup-v1.c  | 14 +++++-------
 3 files changed, 43 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/fs/fs_context.c b/fs/fs_context.c
index 2834d1afa6e8..de1985eae535 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -79,6 +79,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key)
 	return -ENOPARAM;
 }
 
+/**
+ * vfs_parse_fs_param_source - Handle setting "source" via parameter
+ * @fc: The filesystem context to modify
+ * @param: The parameter
+ *
+ * This is a simple helper for filesystems to verify that the "source" they
+ * accept is sane.
+ *
+ * Returns 0 on success, -ENOPARAM if this is not  "source" parameter, and
+ * -EINVAL otherwise. In the event of failure, supplementary error information
+ *  is logged.
+ */
+int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param)
+{
+	if (strcmp(param->key, "source") != 0)
+		return -ENOPARAM;
+
+	if (param->type != fs_value_is_string)
+		return invalf(fc, "Non-string source");
+
+	if (fc->source)
+		return invalf(fc, "Multiple sources");
+
+	fc->source = param->string;
+	param->string = NULL;
+	return 0;
+}
+EXPORT_SYMBOL(vfs_parse_fs_param_source);
+
 /**
  * vfs_parse_fs_param - Add a single parameter to a superblock config
  * @fc: The filesystem context to modify
@@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param)
 	/* If the filesystem doesn't take any arguments, give it the
 	 * default handling of source.
 	 */
-	if (strcmp(param->key, "source") == 0) {
-		if (param->type != fs_value_is_string)
-			return invalf(fc, "VFS: Non-string source");
-		if (fc->source)
-			return invalf(fc, "VFS: Multiple sources");
-		fc->source = param->string;
-		param->string = NULL;
-		return 0;
-	}
+	ret = vfs_parse_fs_param_source(fc, param);
+	if (ret != -ENOPARAM)
+		return ret;
 
 	return invalf(fc, "%s: Unknown parameter '%s'",
 		      fc->fs_type->name, param->key);
@@ -504,16 +527,11 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
 	struct legacy_fs_context *ctx = fc->fs_private;
 	unsigned int size = ctx->data_size;
 	size_t len = 0;
+	int ret;
 
-	if (strcmp(param->key, "source") == 0) {
-		if (param->type != fs_value_is_string)
-			return invalf(fc, "VFS: Legacy: Non-string source");
-		if (fc->source)
-			return invalf(fc, "VFS: Legacy: Multiple sources");
-		fc->source = param->string;
-		param->string = NULL;
-		return 0;
-	}
+	ret = vfs_parse_fs_param_source(fc, param);
+	if (ret != -ENOPARAM)
+		return ret;
 
 	if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
 		return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 37e1e8f7f08d..e2bc16300c82 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
 extern int generic_parse_monolithic(struct fs_context *fc, void *data);
 extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
+extern int vfs_parse_fs_param_source(struct fs_context *fc,
+				     struct fs_parameter *param);
 
 /*
  * sget() wrappers to be called from the ->get_tree() op.
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 527917c0b30b..8d6bf56ed77a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -911,15 +911,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
 
 	opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
 	if (opt == -ENOPARAM) {
-		if (strcmp(param->key, "source") == 0) {
-			if (param->type != fs_value_is_string)
-				return invalf(fc, "Non-string source");
-			if (fc->source)
-				return invalf(fc, "Multiple sources not supported");
-			fc->source = param->string;
-			param->string = NULL;
-			return 0;
-		}
+		int ret;
+
+		ret = vfs_parse_fs_param_source(fc, param);
+		if (ret != -ENOPARAM)
+			return ret;
 		for_each_subsys(ss, i) {
 			if (strcmp(param->key, ss->legacy_name))
 				continue;
-- 
cgit v1.2.3


From 2db710cc846d3321a4dc0977fa13769bddba2351 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 14 Jul 2021 21:26:40 -0700
Subject: kasan: fix build by including kernel.h

The <linux/kasan.h> header relies on _RET_IP_ being defined, and had been
receiving that definition via inclusion of bug.h which includes kernel.h.
However, since f39650de687e ("kernel.h: split out panic and oops helpers")
that is no longer the case and get the following build error when building
CONFIG_KASAN_HW_TAGS on arm64:

  In file included from arch/arm64/mm/kasan_init.c:10:
  include/linux/kasan.h: In function 'kasan_slab_free':
  include/linux/kasan.h:230:39: error: '_RET_IP_' undeclared (first use in this function)
    230 |   return __kasan_slab_free(s, object, _RET_IP_, init);

Fix it by including kernel.h from kasan.h.

Link: https://lkml.kernel.org/r/20210705072716.2125074-1-elver@google.com
Fixes: f39650de687e ("kernel.h: split out panic and oops helpers")
Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 5310e217bd74..dd874a1ee862 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -3,6 +3,7 @@
 #define _LINUX_KASAN_H
 
 #include <linux/bug.h>
+#include <linux/kernel.h>
 #include <linux/static_key.h>
 #include <linux/types.h>
 
-- 
cgit v1.2.3


From ab7965de1725cd8514f0edbced5c2fb793846078 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Jul 2021 21:26:55 -0700
Subject: mm: fix the try_to_unmap prototype for !CONFIG_MMU

Adjust the nommu stub of try_to_unmap to match the changed protype for the
full version.  Turn it into an inline instead of a macro to generally
improve the type checking.

Link: https://lkml.kernel.org/r/20210705053944.885828-1-hch@lst.de
Fixes: 1fb08ac63bee ("mm: rmap: make try_to_unmap() void function")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rmap.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 83fb86133fe1..c976cc6de257 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -291,7 +291,9 @@ static inline int page_referenced(struct page *page, int is_locked,
 	return 0;
 }
 
-#define try_to_unmap(page, refs) false
+static inline void try_to_unmap(struct page *page, enum ttu_flags flags)
+{
+}
 
 static inline int page_mkclean(struct page *page)
 {
-- 
cgit v1.2.3


From 65875073eddd24d7b3968c1501ef29277398dc7b Mon Sep 17 00:00:00 2001
From: Qitao Xu <qitao.xu@bytedance.com>
Date: Wed, 14 Jul 2021 22:59:23 -0700
Subject: net: use %px to print skb address in trace_netif_receive_skb

The print format of skb adress in tracepoint class net_dev_template
is changed to %px from %p, because we want to use skb address
as a quick way to identify a packet.

Note, trace ring buffer is only accessible to privileged users,
it is safe to use a real kernel address here.

Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 2399073c3afc..78c448c6ab4c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -136,7 +136,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
 		__assign_str(name, skb->dev->name);
 	),
 
-	TP_printk("dev=%s skbaddr=%p len=%u",
+	TP_printk("dev=%s skbaddr=%px len=%u",
 		__get_str(name), __entry->skbaddr, __entry->len)
 )
 
-- 
cgit v1.2.3


From 851f36e40962408309ad2665bf0056c19a97881c Mon Sep 17 00:00:00 2001
From: Qitao Xu <qitao.xu@bytedance.com>
Date: Wed, 14 Jul 2021 23:00:21 -0700
Subject: net_sched: use %px to print skb address in trace_qdisc_dequeue()

Print format of skbaddr is changed to %px from %p, because we want
to use skb address as a quick way to identify a packet.

Note, trace ring buffer is only accessible to privileged users,
it is safe to use a real kernel address here.

Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/qdisc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
index 330d32d84485..58209557cb3a 100644
--- a/include/trace/events/qdisc.h
+++ b/include/trace/events/qdisc.h
@@ -41,7 +41,7 @@ TRACE_EVENT(qdisc_dequeue,
 		__entry->txq_state	= txq->state;
 	),
 
-	TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p",
+	TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%px",
 		  __entry->ifindex, __entry->handle, __entry->parent,
 		  __entry->txq_state, __entry->packets, __entry->skbaddr )
 );
-- 
cgit v1.2.3


From 70713dddf3d25a02d1952f8c5d2688c986d2f2fb Mon Sep 17 00:00:00 2001
From: Qitao Xu <qitao.xu@bytedance.com>
Date: Wed, 14 Jul 2021 23:03:24 -0700
Subject: net_sched: introduce tracepoint trace_qdisc_enqueue()

Tracepoint trace_qdisc_enqueue() is introduced to trace skb at
the entrance of TC layer on TX side. This is similar to
trace_qdisc_dequeue():

1. For both we only trace successful cases. The failure cases
   can be traced via trace_kfree_skb().

2. They are called at entrance or exit of TC layer, not for each
   ->enqueue() or ->dequeue(). This is intentional, because
   we want to make trace_qdisc_enqueue() symmetric to
   trace_qdisc_dequeue(), which is easier to use.

The return value of qdisc_enqueue() is not interesting here,
we have Qdisc's drop packets in ->dequeue(), it is impossible to
trace them even if we have the return value, the only way to trace
them is tracing kfree_skb().

We only add information we need to trace ring buffer. If any other
information is needed, it is easy to extend it without breaking ABI,
see commit 3dd344ea84e1 ("net: tracepoint: exposing sk_family in all
tcp:tracepoints").

Reviewed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Qitao Xu <qitao.xu@bytedance.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/qdisc.h | 26 ++++++++++++++++++++++++++
 net/core/dev.c               | 20 ++++++++++++++++----
 2 files changed, 42 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
index 58209557cb3a..c3006c6b4a87 100644
--- a/include/trace/events/qdisc.h
+++ b/include/trace/events/qdisc.h
@@ -46,6 +46,32 @@ TRACE_EVENT(qdisc_dequeue,
 		  __entry->txq_state, __entry->packets, __entry->skbaddr )
 );
 
+TRACE_EVENT(qdisc_enqueue,
+
+	TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb),
+
+	TP_ARGS(qdisc, txq, skb),
+
+	TP_STRUCT__entry(
+		__field(struct Qdisc *, qdisc)
+		__field(void *,	skbaddr)
+		__field(int, ifindex)
+		__field(u32, handle)
+		__field(u32, parent)
+	),
+
+	TP_fast_assign(
+		__entry->qdisc = qdisc;
+		__entry->skbaddr = skb;
+		__entry->ifindex = txq->dev ? txq->dev->ifindex : 0;
+		__entry->handle	 = qdisc->handle;
+		__entry->parent	 = qdisc->parent;
+	),
+
+	TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%px",
+		  __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr)
+);
+
 TRACE_EVENT(qdisc_reset,
 
 	TP_PROTO(struct Qdisc *q),
diff --git a/net/core/dev.c b/net/core/dev.c
index 64b21f0a2048..7aeefc467ddc 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -131,6 +131,7 @@
 #include <trace/events/napi.h>
 #include <trace/events/net.h>
 #include <trace/events/skb.h>
+#include <trace/events/qdisc.h>
 #include <linux/inetdevice.h>
 #include <linux/cpu_rmap.h>
 #include <linux/static_key.h>
@@ -3844,6 +3845,18 @@ static void qdisc_pkt_len_init(struct sk_buff *skb)
 	}
 }
 
+static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
+			     struct sk_buff **to_free,
+			     struct netdev_queue *txq)
+{
+	int rc;
+
+	rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;
+	if (rc == NET_XMIT_SUCCESS)
+		trace_qdisc_enqueue(q, txq, skb);
+	return rc;
+}
+
 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct net_device *dev,
 				 struct netdev_queue *txq)
@@ -3862,8 +3875,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			 * of q->seqlock to protect from racing with requeuing.
 			 */
 			if (unlikely(!nolock_qdisc_is_empty(q))) {
-				rc = q->enqueue(skb, q, &to_free) &
-					NET_XMIT_MASK;
+				rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 				__qdisc_run(q);
 				qdisc_run_end(q);
 
@@ -3879,7 +3891,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 			return NET_XMIT_SUCCESS;
 		}
 
-		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 		qdisc_run(q);
 
 no_lock_out:
@@ -3923,7 +3935,7 @@ no_lock_out:
 		qdisc_run_end(q);
 		rc = NET_XMIT_SUCCESS;
 	} else {
-		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+		rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
-- 
cgit v1.2.3


From e48bf29cf9d6d60d810e2af71e54b71a324094e0 Mon Sep 17 00:00:00 2001
From: Ye Xiang <xiang.ye@intel.com>
Date: Sun, 13 Jun 2021 11:25:07 +0800
Subject: HID: intel-ish-hid: use async resume function

ISH IPC driver uses asynchronous workqueue to do resume now, but there is
a potential timing issue: when child devices resume before bus driver, it
will cause child devices resume failed and cannot be recovered until
reboot. The current implementation in this case do wait for IPC to resume
but fail to accommodate for a case when there is no ISH reboot and soft
resume is taking time. This issue is apparent on Tiger Lake platform with
5.11.13 kernel when doing suspend to idle then resume(s0ix) test. To
resolve this issue, we change ISHTP HID client to use asynchronous resume
callback too. In the asynchronous resume callback, it waits for the ISHTP
resume done event, and then notify ISHTP HID client link ready.

Signed-off-by: Ye Xiang <xiang.ye@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/hid/intel-ish-hid/ishtp-hid-client.c | 15 +++++++++++++-
 drivers/hid/intel-ish-hid/ishtp-hid.h        |  1 +
 drivers/hid/intel-ish-hid/ishtp/bus.c        | 29 +++++++++++++++++++++-------
 include/linux/intel-ish-client-if.h          |  2 ++
 4 files changed, 39 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
index 6b1fa971b33e..91bf4d01e91a 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c
+++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c
@@ -784,6 +784,17 @@ static void hid_ishtp_cl_reset_handler(struct work_struct *work)
 	}
 }
 
+static void hid_ishtp_cl_resume_handler(struct work_struct *work)
+{
+	struct ishtp_cl_data *client_data = container_of(work, struct ishtp_cl_data, resume_work);
+	struct ishtp_cl *hid_ishtp_cl = client_data->hid_ishtp_cl;
+
+	if (ishtp_wait_resume(ishtp_get_ishtp_device(hid_ishtp_cl))) {
+		client_data->suspended = false;
+		wake_up_interruptible(&client_data->ishtp_resume_wait);
+	}
+}
+
 ishtp_print_log ishtp_hid_print_trace;
 
 /**
@@ -822,6 +833,8 @@ static int hid_ishtp_cl_probe(struct ishtp_cl_device *cl_device)
 	init_waitqueue_head(&client_data->ishtp_resume_wait);
 
 	INIT_WORK(&client_data->work, hid_ishtp_cl_reset_handler);
+	INIT_WORK(&client_data->resume_work, hid_ishtp_cl_resume_handler);
+
 
 	ishtp_hid_print_trace = ishtp_trace_callback(cl_device);
 
@@ -921,7 +934,7 @@ static int hid_ishtp_cl_resume(struct device *device)
 
 	hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__,
 			hid_ishtp_cl);
-	client_data->suspended = false;
+	schedule_work(&client_data->resume_work);
 	return 0;
 }
 
diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.h b/drivers/hid/intel-ish-hid/ishtp-hid.h
index f88443a7d935..6a5cc11aefd8 100644
--- a/drivers/hid/intel-ish-hid/ishtp-hid.h
+++ b/drivers/hid/intel-ish-hid/ishtp-hid.h
@@ -135,6 +135,7 @@ struct ishtp_cl_data {
 	int multi_packet_cnt;
 
 	struct work_struct work;
+	struct work_struct resume_work;
 	struct ishtp_cl_device *cl_device;
 };
 
diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c
index f0802b047ed8..aa2c51624012 100644
--- a/drivers/hid/intel-ish-hid/ishtp/bus.c
+++ b/drivers/hid/intel-ish-hid/ishtp/bus.c
@@ -314,13 +314,6 @@ static int ishtp_cl_device_resume(struct device *dev)
 	if (!device)
 		return 0;
 
-	/*
-	 * When ISH needs hard reset, it is done asynchrnously, hence bus
-	 * resume will  be called before full ISH resume
-	 */
-	if (device->ishtp_dev->resume_flag)
-		return 0;
-
 	driver = to_ishtp_cl_driver(dev->driver);
 	if (driver && driver->driver.pm) {
 		if (driver->driver.pm->resume)
@@ -849,6 +842,28 @@ struct device *ishtp_device(struct ishtp_cl_device *device)
 }
 EXPORT_SYMBOL(ishtp_device);
 
+/**
+ * ishtp_wait_resume() - Wait for IPC resume
+ *
+ * Wait for IPC resume
+ *
+ * Return: resume complete or not
+ */
+bool ishtp_wait_resume(struct ishtp_device *dev)
+{
+	/* 50ms to get resume response */
+	#define WAIT_FOR_RESUME_ACK_MS		50
+
+	/* Waiting to get resume response */
+	if (dev->resume_flag)
+		wait_event_interruptible_timeout(dev->resume_wait,
+						 !dev->resume_flag,
+						 msecs_to_jiffies(WAIT_FOR_RESUME_ACK_MS));
+
+	return (!dev->resume_flag);
+}
+EXPORT_SYMBOL_GPL(ishtp_wait_resume);
+
 /**
  * ishtp_get_pci_device() - Return PCI device dev pointer
  * This interface is used to return PCI device pointer
diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h
index 25e2b4e80502..aee8ff4739b1 100644
--- a/include/linux/intel-ish-client-if.h
+++ b/include/linux/intel-ish-client-if.h
@@ -81,6 +81,8 @@ int ishtp_register_event_cb(struct ishtp_cl_device *device,
 
 /* Get the device * from ishtp device instance */
 struct device *ishtp_device(struct ishtp_cl_device *cl_device);
+/* wait for IPC resume */
+bool ishtp_wait_resume(struct ishtp_device *dev);
 /* Trace interface for clients */
 ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device);
 /* Get device pointer of PCI device for DMA acces */
-- 
cgit v1.2.3


From e042aa532c84d18ff13291d00620502ce7a38dda Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 16 Jul 2021 09:18:21 +0000
Subject: bpf: Fix pointer arithmetic mask tightening under state pruning

In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we
narrowed the offset mask for unprivileged pointer arithmetic in order to
mitigate a corner case where in the speculative domain it is possible to
advance, for example, the map value pointer by up to value_size-1 out-of-
bounds in order to leak kernel memory via side-channel to user space.

The verifier's state pruning for scalars leaves one corner case open
where in the first verification path R_x holds an unknown scalar with an
aux->alu_limit of e.g. 7, and in a second verification path that same
register R_x, here denoted as R_x', holds an unknown scalar which has
tighter bounds and would thus satisfy range_within(R_x, R_x') as well as
tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3:
Given the second path fits the register constraints for pruning, the final
generated mask from aux->alu_limit will remain at 7. While technically
not wrong for the non-speculative domain, it would however be possible
to craft similar cases where the mask would be too wide as in 7fedb63a8307.

One way to fix it is to detect the presence of unknown scalar map pointer
arithmetic and force a deeper search on unknown scalars to ensure that
we do not run into a masking mismatch.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/verifier.c        | 27 +++++++++++++++++----------
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e774ecc1cd1f..7ba7e800d472 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -414,6 +414,7 @@ struct bpf_verifier_env {
 	u32 used_map_cnt;		/* number of used maps */
 	u32 used_btf_cnt;		/* number of used BTF objects */
 	u32 id_gen;			/* used to generate unique reg IDs */
+	bool explore_alu_limits;
 	bool allow_ptr_leaks;
 	bool allow_uninit_stack;
 	bool allow_ptr_to_map_access;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8a7a28b4cfb9..657062cb4d85 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6561,6 +6561,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 		alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
 		alu_state |= ptr_is_dst_reg ?
 			     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+		/* Limit pruning on unknown scalars to enable deep search for
+		 * potential masking differences from other program paths.
+		 */
+		if (!off_is_imm)
+			env->explore_alu_limits = true;
 	}
 
 	err = update_alu_sanitation_state(aux, alu_state, alu_limit);
@@ -9936,8 +9942,8 @@ next:
 }
 
 /* Returns true if (rold safe implies rcur safe) */
-static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
-		    struct bpf_id_pair *idmap)
+static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+		    struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
 {
 	bool equal;
 
@@ -9963,6 +9969,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 		return false;
 	switch (rold->type) {
 	case SCALAR_VALUE:
+		if (env->explore_alu_limits)
+			return false;
 		if (rcur->type == SCALAR_VALUE) {
 			if (!rold->precise && !rcur->precise)
 				return true;
@@ -10053,9 +10061,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	return false;
 }
 
-static bool stacksafe(struct bpf_func_state *old,
-		      struct bpf_func_state *cur,
-		      struct bpf_id_pair *idmap)
+static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
+		      struct bpf_func_state *cur, struct bpf_id_pair *idmap)
 {
 	int i, spi;
 
@@ -10100,9 +10107,8 @@ static bool stacksafe(struct bpf_func_state *old,
 			continue;
 		if (old->stack[spi].slot_type[0] != STACK_SPILL)
 			continue;
-		if (!regsafe(&old->stack[spi].spilled_ptr,
-			     &cur->stack[spi].spilled_ptr,
-			     idmap))
+		if (!regsafe(env, &old->stack[spi].spilled_ptr,
+			     &cur->stack[spi].spilled_ptr, idmap))
 			/* when explored and current stack slot are both storing
 			 * spilled registers, check that stored pointers types
 			 * are the same as well.
@@ -10159,10 +10165,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
 
 	memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch))
+		if (!regsafe(env, &old->regs[i], &cur->regs[i],
+			     env->idmap_scratch))
 			return false;
 
-	if (!stacksafe(old, cur, env->idmap_scratch))
+	if (!stacksafe(env, old, cur, env->idmap_scratch))
 		return false;
 
 	if (!refsafe(old, cur))
-- 
cgit v1.2.3


From ec645dc96699ea6c37b6de86c84d7288ea9a4ddf Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@natalenko.name>
Date: Sat, 17 Jul 2021 14:33:28 +0200
Subject: block: increase BLKCG_MAX_POLS

After mq-deadline learned to deal with cgroups, the BLKCG_MAX_POLS value
became too small for all the elevators to be registered properly. The
following issue is seen:

```
calling  bfq_init+0x0/0x8b @ 1
blkcg_policy_register: BLKCG_MAX_POLS too small
initcall bfq_init+0x0/0x8b returned -28 after 507 usecs
```

which renders BFQ non-functional.

Increase BLKCG_MAX_POLS to allow enough space for everyone.

Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support")
Link: https://lore.kernel.org/lkml/8988303.mDXGIdCtx8@natalenko.name/
Signed-off-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Link: https://lore.kernel.org/r/20210717123328.945810-1-oleksandr@natalenko.name
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c454fb446fd0..2e12320cb121 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -57,7 +57,7 @@ struct blk_keyslot_manager;
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		5
+#define BLKCG_MAX_POLS		6
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-- 
cgit v1.2.3


From 71f6428332844f38c7cb10461d9f29e9c9b983a0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andy.shevchenko@gmail.com>
Date: Mon, 12 Jul 2021 21:21:21 +0300
Subject: ACPI: utils: Fix reference counting in for_each_acpi_dev_match()

Currently it's possible to iterate over the dangling pointer in case the device
suddenly disappears. This may happen becase callers put it at the end of a loop.

Instead, let's move that call inside acpi_dev_get_next_match_dev().

Fixes: 803abec64ef9 ("media: ipu3-cio2: Add cio2-bridge to ipu3-cio2 driver")
Fixes: bf263f64e804 ("media: ACPI / bus: Add acpi_dev_get_next_match_dev() and helper macro")
Fixes: edbd1bc4951e ("efi/dev-path-parser: Switch to use for_each_acpi_dev_match()")
Signed-off-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Daniel Scally <djrscally@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c                       | 7 +++----
 drivers/firmware/efi/dev-path-parser.c     | 1 -
 drivers/media/pci/intel/ipu3/cio2-bridge.c | 6 ++----
 include/acpi/acpi_bus.h                    | 5 -----
 4 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index e7ddd281afff..d5cedffeeff9 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -860,11 +860,9 @@ EXPORT_SYMBOL(acpi_dev_present);
  * Return the next match of ACPI device if another matching device was present
  * at the moment of invocation, or NULL otherwise.
  *
- * FIXME: The function does not tolerate the sudden disappearance of @adev, e.g.
- * in the case of a hotplug event. That said, the caller should ensure that
- * this will never happen.
- *
  * The caller is responsible for invoking acpi_dev_put() on the returned device.
+ * On the other hand the function invokes  acpi_dev_put() on the given @adev
+ * assuming that its reference counter had been increased beforehand.
  *
  * See additional information in acpi_dev_present() as well.
  */
@@ -880,6 +878,7 @@ acpi_dev_get_next_match_dev(struct acpi_device *adev, const char *hid, const cha
 	match.hrv = hrv;
 
 	dev = bus_find_device(&acpi_bus_type, start, &match, acpi_dev_match_cb);
+	acpi_dev_put(adev);
 	return dev ? to_acpi_device(dev) : NULL;
 }
 EXPORT_SYMBOL(acpi_dev_get_next_match_dev);
diff --git a/drivers/firmware/efi/dev-path-parser.c b/drivers/firmware/efi/dev-path-parser.c
index 10d4457417a4..eb9c65f97841 100644
--- a/drivers/firmware/efi/dev-path-parser.c
+++ b/drivers/firmware/efi/dev-path-parser.c
@@ -34,7 +34,6 @@ static long __init parse_acpi_path(const struct efi_dev_path *node,
 			break;
 		if (!adev->pnp.unique_id && node->acpi.uid == 0)
 			break;
-		acpi_dev_put(adev);
 	}
 	if (!adev)
 		return -ENODEV;
diff --git a/drivers/media/pci/intel/ipu3/cio2-bridge.c b/drivers/media/pci/intel/ipu3/cio2-bridge.c
index 4657e99df033..59a36f922675 100644
--- a/drivers/media/pci/intel/ipu3/cio2-bridge.c
+++ b/drivers/media/pci/intel/ipu3/cio2-bridge.c
@@ -173,10 +173,8 @@ static int cio2_bridge_connect_sensor(const struct cio2_sensor_config *cfg,
 	int ret;
 
 	for_each_acpi_dev_match(adev, cfg->hid, NULL, -1) {
-		if (!adev->status.enabled) {
-			acpi_dev_put(adev);
+		if (!adev->status.enabled)
 			continue;
-		}
 
 		if (bridge->n_sensors >= CIO2_NUM_PORTS) {
 			acpi_dev_put(adev);
@@ -185,7 +183,6 @@ static int cio2_bridge_connect_sensor(const struct cio2_sensor_config *cfg,
 		}
 
 		sensor = &bridge->sensors[bridge->n_sensors];
-		sensor->adev = adev;
 		strscpy(sensor->name, cfg->hid, sizeof(sensor->name));
 
 		ret = cio2_bridge_read_acpi_buffer(adev, "SSDB",
@@ -215,6 +212,7 @@ static int cio2_bridge_connect_sensor(const struct cio2_sensor_config *cfg,
 			goto err_free_swnodes;
 		}
 
+		sensor->adev = acpi_dev_get(adev);
 		adev->fwnode.secondary = fwnode;
 
 		dev_info(&cio2->dev, "Found supported sensor %s\n",
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 1ae993fee4a5..b9d434a93632 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -707,11 +707,6 @@ acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv);
  * @hrv: Hardware Revision of the device, pass -1 to not check _HRV
  *
  * The caller is responsible for invoking acpi_dev_put() on the returned device.
- *
- * FIXME: Due to above requirement there is a window that may invalidate @adev
- * and next iteration will use a dangling pointer, e.g. in the case of a
- * hotplug event. That said, the caller should ensure that this will never
- * happen.
  */
 #define for_each_acpi_dev_match(adev, hid, uid, hrv)			\
 	for (adev = acpi_dev_get_first_match_dev(hid, uid, hrv);	\
-- 
cgit v1.2.3


From d6371c76e20d7d3f61b05fd67b596af4d14a8886 Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Mon, 19 Jul 2021 09:51:34 +0100
Subject: bpf: Fix OOB read when printing XDP link fdinfo

We got the following UBSAN report on one of our testing machines:

    ================================================================================
    UBSAN: array-index-out-of-bounds in kernel/bpf/syscall.c:2389:24
    index 6 is out of range for type 'char *[6]'
    CPU: 43 PID: 930921 Comm: systemd-coredum Tainted: G           O      5.10.48-cloudflare-kasan-2021.7.0 #1
    Hardware name: <snip>
    Call Trace:
     dump_stack+0x7d/0xa3
     ubsan_epilogue+0x5/0x40
     __ubsan_handle_out_of_bounds.cold+0x43/0x48
     ? seq_printf+0x17d/0x250
     bpf_link_show_fdinfo+0x329/0x380
     ? bpf_map_value_size+0xe0/0xe0
     ? put_files_struct+0x20/0x2d0
     ? __kasan_kmalloc.constprop.0+0xc2/0xd0
     seq_show+0x3f7/0x540
     seq_read_iter+0x3f8/0x1040
     seq_read+0x329/0x500
     ? seq_read_iter+0x1040/0x1040
     ? __fsnotify_parent+0x80/0x820
     ? __fsnotify_update_child_dentry_flags+0x380/0x380
     vfs_read+0x123/0x460
     ksys_read+0xed/0x1c0
     ? __x64_sys_pwrite64+0x1f0/0x1f0
     do_syscall_64+0x33/0x40
     entry_SYSCALL_64_after_hwframe+0x44/0xa9
    <snip>
    ================================================================================
    ================================================================================
    UBSAN: object-size-mismatch in kernel/bpf/syscall.c:2384:2

From the report, we can infer that some array access in bpf_link_show_fdinfo at index 6
is out of bounds. The obvious candidate is bpf_link_type_strs[BPF_LINK_TYPE_XDP] with
BPF_LINK_TYPE_XDP == 6. It turns out that BPF_LINK_TYPE_XDP is missing from bpf_types.h
and therefore doesn't have an entry in bpf_link_type_strs:

    pos:	0
    flags:	02000000
    mnt_id:	13
    link_type:	(null)
    link_id:	4
    prog_tag:	bcf7977d3b93787c
    prog_id:	4
    ifindex:	1

Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API")
Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210719085134.43325-2-lmb@cloudflare.com
---
 include/linux/bpf_types.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a9db1eae6796..ae3ac3a2018c 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -134,4 +134,5 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup)
 BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter)
 #ifdef CONFIG_NET
 BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns)
+BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp)
 #endif
-- 
cgit v1.2.3


From 749468760b952e555529ca8a71256b991455101e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Jul 2021 02:20:28 -0700
Subject: net/tcp_fastopen: remove obsolete extern

After cited commit, sysctl_tcp_fastopen_blackhole_timeout is no longer
a global variable.

Fixes: 3733be14a32b ("ipv4: Namespaceify tcp_fastopen_blackhole_timeout knob")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Link: https://lore.kernel.org/r/20210719092028.3016745-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 17df9b047ee4..784d5c3ef1c5 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1709,7 +1709,6 @@ struct tcp_fastopen_context {
 	struct rcu_head	rcu;
 };
 
-extern unsigned int sysctl_tcp_fastopen_blackhole_timeout;
 void tcp_fastopen_active_disable(struct sock *sk);
 bool tcp_fastopen_active_should_disable(struct sock *sk);
 void tcp_fastopen_active_disable_ofo_check(struct sock *sk);
-- 
cgit v1.2.3


From 3abab27c322e0f2acf981595aa8040c9164dc9fb Mon Sep 17 00:00:00 2001
From: Charles Baylis <cb-kernel@fishzet.co.uk>
Date: Fri, 16 Jul 2021 17:43:12 +0100
Subject: drm: Return -ENOTTY for non-drm ioctls

drm: Return -ENOTTY for non-drm ioctls

Return -ENOTTY from drm_ioctl() when userspace passes in a cmd number
which doesn't relate to the drm subsystem.

Glibc uses the TCGETS ioctl to implement isatty(), and without this
change isatty() returns it incorrectly returns true for drm devices.

To test run this command:
$ if [ -t 0 ]; then echo is a tty; fi < /dev/dri/card0
which shows "is a tty" without this patch.

This may also modify memory which the userspace application is not
expecting.

Signed-off-by: Charles Baylis <cb-kernel@fishzet.co.uk>
Cc: stable@vger.kernel.org
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: https://patchwork.freedesktop.org/patch/msgid/YPG3IBlzaMhfPqCr@stando.fishzet.co.uk
---
 drivers/gpu/drm/drm_ioctl.c | 3 +++
 include/drm/drm_ioctl.h     | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 98ae00661656..f454e0424086 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -834,6 +834,9 @@ long drm_ioctl(struct file *filp,
 	if (drm_dev_is_unplugged(dev))
 		return -ENODEV;
 
+       if (DRM_IOCTL_TYPE(cmd) != DRM_IOCTL_BASE)
+               return -ENOTTY;
+
 	is_driver_ioctl = nr >= DRM_COMMAND_BASE && nr < DRM_COMMAND_END;
 
 	if (is_driver_ioctl) {
diff --git a/include/drm/drm_ioctl.h b/include/drm/drm_ioctl.h
index 10100a4bbe2a..afb27cb6a7bd 100644
--- a/include/drm/drm_ioctl.h
+++ b/include/drm/drm_ioctl.h
@@ -68,6 +68,7 @@ typedef int drm_ioctl_compat_t(struct file *filp, unsigned int cmd,
 			       unsigned long arg);
 
 #define DRM_IOCTL_NR(n)                _IOC_NR(n)
+#define DRM_IOCTL_TYPE(n)              _IOC_TYPE(n)
 #define DRM_MAJOR       226
 
 /**
-- 
cgit v1.2.3


From dc7019b7d0e188d4093b34bd0747ed0d668c63bf Mon Sep 17 00:00:00 2001
From: Jens Wiklander <jens.wiklander@linaro.org>
Date: Mon, 14 Jun 2021 17:33:14 -0500
Subject: tee: add tee_shm_alloc_kernel_buf()

Adds a new function tee_shm_alloc_kernel_buf() to allocate shared memory
from a kernel driver. This function can later be made more lightweight
by unnecessary dma-buf export.

Cc: stable@vger.kernel.org
Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/tee_shm.c   | 18 ++++++++++++++++++
 include/linux/tee_drv.h |  1 +
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 00472f5ce22e..c65e44707cd6 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -193,6 +193,24 @@ err_dev_put:
 }
 EXPORT_SYMBOL_GPL(tee_shm_alloc);
 
+/**
+ * tee_shm_alloc_kernel_buf() - Allocate shared memory for kernel buffer
+ * @ctx:	Context that allocates the shared memory
+ * @size:	Requested size of shared memory
+ *
+ * The returned memory registered in secure world and is suitable to be
+ * passed as a memory buffer in parameter argument to
+ * tee_client_invoke_func(). The memory allocated is later freed with a
+ * call to tee_shm_free().
+ *
+ * @returns a pointer to 'struct tee_shm'
+ */
+struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size)
+{
+	return tee_shm_alloc(ctx, size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF);
+}
+EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf);
+
 struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
 				 size_t length, u32 flags)
 {
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index 54269e47ac9a..8990f7628387 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -332,6 +332,7 @@ void *tee_get_drvdata(struct tee_device *teedev);
  * @returns a pointer to 'struct tee_shm'
  */
 struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags);
+struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size);
 
 /**
  * tee_shm_register() - Register shared memory buffer
-- 
cgit v1.2.3


From 376e4199e327a5cf29b8ec8fb0f64f3d8b429819 Mon Sep 17 00:00:00 2001
From: Sumit Garg <sumit.garg@linaro.org>
Date: Mon, 14 Jun 2021 17:33:15 -0500
Subject: tee: Correct inappropriate usage of TEE_SHM_DMA_BUF flag

Currently TEE_SHM_DMA_BUF flag has been inappropriately used to not
register shared memory allocated for private usage by underlying TEE
driver: OP-TEE in this case. So rather add a new flag as TEE_SHM_PRIV
that can be utilized by underlying TEE drivers for private allocation
and usage of shared memory.

With this corrected, allow tee_shm_alloc_kernel_buf() to allocate a
shared memory region without the backing of dma-buf.

Cc: stable@vger.kernel.org
Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Co-developed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.microsoft.com>
Reviewed-by: Jens Wiklander <jens.wiklander@linaro.org>
Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/optee/call.c     | 2 +-
 drivers/tee/optee/core.c     | 3 ++-
 drivers/tee/optee/rpc.c      | 5 +++--
 drivers/tee/optee/shm_pool.c | 8 ++++++--
 drivers/tee/tee_shm.c        | 4 ++--
 include/linux/tee_drv.h      | 1 +
 6 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c
index 387e94768182..945f03da0223 100644
--- a/drivers/tee/optee/call.c
+++ b/drivers/tee/optee/call.c
@@ -184,7 +184,7 @@ static struct tee_shm *get_msg_arg(struct tee_context *ctx, size_t num_params,
 	struct optee_msg_arg *ma;
 
 	shm = tee_shm_alloc(ctx, OPTEE_MSG_GET_ARG_SIZE(num_params),
-			    TEE_SHM_MAPPED);
+			    TEE_SHM_MAPPED | TEE_SHM_PRIV);
 	if (IS_ERR(shm))
 		return shm;
 
diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c
index 651d49b53d3b..5ce13b099d7d 100644
--- a/drivers/tee/optee/core.c
+++ b/drivers/tee/optee/core.c
@@ -278,7 +278,8 @@ static void optee_release(struct tee_context *ctx)
 	if (!ctxdata)
 		return;
 
-	shm = tee_shm_alloc(ctx, sizeof(struct optee_msg_arg), TEE_SHM_MAPPED);
+	shm = tee_shm_alloc(ctx, sizeof(struct optee_msg_arg),
+			    TEE_SHM_MAPPED | TEE_SHM_PRIV);
 	if (!IS_ERR(shm)) {
 		arg = tee_shm_get_va(shm, 0);
 		/*
diff --git a/drivers/tee/optee/rpc.c b/drivers/tee/optee/rpc.c
index 1849180b0278..efbaff7ad7e5 100644
--- a/drivers/tee/optee/rpc.c
+++ b/drivers/tee/optee/rpc.c
@@ -314,7 +314,7 @@ static void handle_rpc_func_cmd_shm_alloc(struct tee_context *ctx,
 		shm = cmd_alloc_suppl(ctx, sz);
 		break;
 	case OPTEE_RPC_SHM_TYPE_KERNEL:
-		shm = tee_shm_alloc(ctx, sz, TEE_SHM_MAPPED);
+		shm = tee_shm_alloc(ctx, sz, TEE_SHM_MAPPED | TEE_SHM_PRIV);
 		break;
 	default:
 		arg->ret = TEEC_ERROR_BAD_PARAMETERS;
@@ -502,7 +502,8 @@ void optee_handle_rpc(struct tee_context *ctx, struct optee_rpc_param *param,
 
 	switch (OPTEE_SMC_RETURN_GET_RPC_FUNC(param->a0)) {
 	case OPTEE_SMC_RPC_FUNC_ALLOC:
-		shm = tee_shm_alloc(ctx, param->a1, TEE_SHM_MAPPED);
+		shm = tee_shm_alloc(ctx, param->a1,
+				    TEE_SHM_MAPPED | TEE_SHM_PRIV);
 		if (!IS_ERR(shm) && !tee_shm_get_pa(shm, 0, &pa)) {
 			reg_pair_from_64(&param->a1, &param->a2, pa);
 			reg_pair_from_64(&param->a4, &param->a5,
diff --git a/drivers/tee/optee/shm_pool.c b/drivers/tee/optee/shm_pool.c
index da06ce9b9313..c41a9a501a6e 100644
--- a/drivers/tee/optee/shm_pool.c
+++ b/drivers/tee/optee/shm_pool.c
@@ -27,7 +27,11 @@ static int pool_op_alloc(struct tee_shm_pool_mgr *poolm,
 	shm->paddr = page_to_phys(page);
 	shm->size = PAGE_SIZE << order;
 
-	if (shm->flags & TEE_SHM_DMA_BUF) {
+	/*
+	 * Shared memory private to the OP-TEE driver doesn't need
+	 * to be registered with OP-TEE.
+	 */
+	if (!(shm->flags & TEE_SHM_PRIV)) {
 		unsigned int nr_pages = 1 << order, i;
 		struct page **pages;
 
@@ -60,7 +64,7 @@ err:
 static void pool_op_free(struct tee_shm_pool_mgr *poolm,
 			 struct tee_shm *shm)
 {
-	if (shm->flags & TEE_SHM_DMA_BUF)
+	if (!(shm->flags & TEE_SHM_PRIV))
 		optee_shm_unregister(shm->ctx, shm);
 
 	free_pages((unsigned long)shm->kaddr, get_order(shm->size));
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index c65e44707cd6..8a9384a64f3e 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -117,7 +117,7 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if ((flags & ~(TEE_SHM_MAPPED | TEE_SHM_DMA_BUF))) {
+	if ((flags & ~(TEE_SHM_MAPPED | TEE_SHM_DMA_BUF | TEE_SHM_PRIV))) {
 		dev_err(teedev->dev.parent, "invalid shm flags 0x%x", flags);
 		return ERR_PTR(-EINVAL);
 	}
@@ -207,7 +207,7 @@ EXPORT_SYMBOL_GPL(tee_shm_alloc);
  */
 struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size)
 {
-	return tee_shm_alloc(ctx, size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF);
+	return tee_shm_alloc(ctx, size, TEE_SHM_MAPPED);
 }
 EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf);
 
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index 8990f7628387..3ebfea0781f1 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -27,6 +27,7 @@
 #define TEE_SHM_USER_MAPPED	BIT(4)  /* Memory mapped in user space */
 #define TEE_SHM_POOL		BIT(5)  /* Memory allocated from pool */
 #define TEE_SHM_KERNEL_MAPPED	BIT(6)  /* Memory mapped in kernel space */
+#define TEE_SHM_PRIV		BIT(7)  /* Memory private to TEE driver */
 
 struct device;
 struct tee_device;
-- 
cgit v1.2.3


From d8a719059b9dc963aa190598778ac804ff3e6a87 Mon Sep 17 00:00:00 2001
From: Jonathan Marek <jonathan@marek.ca>
Date: Wed, 21 Jul 2021 17:02:13 +1000
Subject: Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge"

This reverts commit c742199a014de23ee92055c2473d91fe5561ffdf.

c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge")
breaks arm64 in at least two ways for configurations where PUD or PMD
folding occur:

  1. We no longer install huge-vmap mappings and silently fall back to
     page-granular entries, despite being able to install block entries
     at what is effectively the PGD level.

  2. If the linear map is backed with block mappings, these will now
     silently fail to be created in alloc_init_pud(), causing a panic
     early during boot.

The pgtable selftests caught this, although a fix has not been
forthcoming and Christophe is AWOL at the moment, so just revert the
change for now to get a working -rc3 on which we can queue patches for
5.15.

A simple revert breaks the build for 32-bit PowerPC 8xx machines, which
rely on the default function definitions when the corresponding
page-table levels are folded, since commit a6a8f7c4aa7e ("powerpc/8xx:
add support for huge pages on VMAP and VMALLOC"), eg:

  powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range':
  linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge'

To avoid that, add stubs for pud_clear_huge() and pmd_clear_huge() in
arch/powerpc/mm/nohash/8xx.c as suggested by Christophe.

Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Fixes: c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge")
Signed-off-by: Jonathan Marek <jonathan@marek.ca>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Marc Zyngier <maz@kernel.org>
[mpe: Fold in 8xx.c changes from Christophe and mention in change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/linux-arm-kernel/CAMuHMdXShORDox-xxaeUfDW3wx2PeggFSqhVSHVZNKCGK-y_vQ@mail.gmail.com/
Link: https://lore.kernel.org/r/20210717160118.9855-1-jonathan@marek.ca
Link: https://lore.kernel.org/r/87r1fs1762.fsf@mpe.ellerman.id.au
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/mm/mmu.c          | 20 ++++++++------------
 arch/powerpc/mm/nohash/8xx.c | 10 ++++++++++
 arch/x86/mm/pgtable.c        | 34 +++++++++++++++-------------------
 include/linux/pgtable.h      | 26 +-------------------------
 4 files changed, 34 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index d74586508448..9ff0de1b2b93 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1339,7 +1339,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
 	return dt_virt;
 }
 
-#if CONFIG_PGTABLE_LEVELS > 3
 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 {
 	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));
@@ -1354,16 +1353,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
 	return 1;
 }
 
-int pud_clear_huge(pud_t *pudp)
-{
-	if (!pud_sect(READ_ONCE(*pudp)))
-		return 0;
-	pud_clear(pudp);
-	return 1;
-}
-#endif
-
-#if CONFIG_PGTABLE_LEVELS > 2
 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 {
 	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));
@@ -1378,6 +1367,14 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
 	return 1;
 }
 
+int pud_clear_huge(pud_t *pudp)
+{
+	if (!pud_sect(READ_ONCE(*pudp)))
+		return 0;
+	pud_clear(pudp);
+	return 1;
+}
+
 int pmd_clear_huge(pmd_t *pmdp)
 {
 	if (!pmd_sect(READ_ONCE(*pmdp)))
@@ -1385,7 +1382,6 @@ int pmd_clear_huge(pmd_t *pmdp)
 	pmd_clear(pmdp);
 	return 1;
 }
-#endif
 
 int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
 {
diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c
index 60780e089118..0df9fe29dd56 100644
--- a/arch/powerpc/mm/nohash/8xx.c
+++ b/arch/powerpc/mm/nohash/8xx.c
@@ -240,3 +240,13 @@ void __init setup_kuap(bool disabled)
 	mtspr(SPRN_MD_AP, MD_APG_KUAP);
 }
 #endif
+
+int pud_clear_huge(pud_t *pud)
+{
+	 return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+	 return 0;
+}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3364fe62b903..3481b35cb4ec 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -682,7 +682,6 @@ int p4d_clear_huge(p4d_t *p4d)
 }
 #endif
 
-#if CONFIG_PGTABLE_LEVELS > 3
 /**
  * pud_set_huge - setup kernel PUD mapping
  *
@@ -721,23 +720,6 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
-/**
- * pud_clear_huge - clear kernel PUD mapping when it is set
- *
- * Returns 1 on success and 0 on failure (no PUD map is found).
- */
-int pud_clear_huge(pud_t *pud)
-{
-	if (pud_large(*pud)) {
-		pud_clear(pud);
-		return 1;
-	}
-
-	return 0;
-}
-#endif
-
-#if CONFIG_PGTABLE_LEVELS > 2
 /**
  * pmd_set_huge - setup kernel PMD mapping
  *
@@ -768,6 +750,21 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
+int pud_clear_huge(pud_t *pud)
+{
+	if (pud_large(*pud)) {
+		pud_clear(pud);
+		return 1;
+	}
+
+	return 0;
+}
+
 /**
  * pmd_clear_huge - clear kernel PMD mapping when it is set
  *
@@ -782,7 +779,6 @@ int pmd_clear_huge(pmd_t *pmd)
 
 	return 0;
 }
-#endif
 
 #ifdef CONFIG_X86_64
 /**
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index d147480cdefc..e24d2c992b11 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1397,34 +1397,10 @@ static inline int p4d_clear_huge(p4d_t *p4d)
 }
 #endif /* !__PAGETABLE_P4D_FOLDED */
 
-#ifndef __PAGETABLE_PUD_FOLDED
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
-int pud_clear_huge(pud_t *pud);
-#else
-static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
-{
-	return 0;
-}
-static inline int pud_clear_huge(pud_t *pud)
-{
-	return 0;
-}
-#endif /* !__PAGETABLE_PUD_FOLDED */
-
-#ifndef __PAGETABLE_PMD_FOLDED
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
+int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
-#else
-static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
-{
-	return 0;
-}
-static inline int pmd_clear_huge(pmd_t *pmd)
-{
-	return 0;
-}
-#endif /* !__PAGETABLE_PMD_FOLDED */
-
 int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
 int pud_free_pmd_page(pud_t *pud, unsigned long addr);
 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
-- 
cgit v1.2.3


From 853a9ae29e978d37f5dfa72622a68c9ae3d7fa89 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan@kernel.org>
Date: Wed, 14 Jul 2021 10:04:27 +0200
Subject: serial: 8250: fix handle_irq locking

The 8250 handle_irq callback is not just called from the interrupt
handler but also from a timer callback when polling (e.g. for ports
without an interrupt line). Consequently the callback must explicitly
disable interrupts to avoid a potential deadlock with another interrupt
in polled mode.

Add back an irqrestore-version of the sysrq port-unlock helper and use
it in the 8250 callbacks that need it.

Fixes: 75f4e830fa9c ("serial: do not restore interrupt state in sysrq helper")
Cc: stable@vger.kernel.org	# 5.13
Cc: Joel Stanley <joel@jms.id.au>
Cc: Andrew Jeffery <andrew@aj.id.au>
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Johan Hovold <johan@kernel.org>
Link: https://lore.kernel.org/r/20210714080427.28164-1-johan@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_aspeed_vuart.c |  5 +++--
 drivers/tty/serial/8250/8250_fsl.c          |  5 +++--
 drivers/tty/serial/8250/8250_port.c         |  5 +++--
 include/linux/serial_core.h                 | 24 ++++++++++++++++++++++++
 4 files changed, 33 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c
index 4caab8714e2c..2350fb3bb5e4 100644
--- a/drivers/tty/serial/8250/8250_aspeed_vuart.c
+++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c
@@ -329,6 +329,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
 	unsigned int iir, lsr;
+	unsigned long flags;
 	unsigned int space, count;
 
 	iir = serial_port_in(port, UART_IIR);
@@ -336,7 +337,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 	if (iir & UART_IIR_NO_INT)
 		return 0;
 
-	spin_lock(&port->lock);
+	spin_lock_irqsave(&port->lock, flags);
 
 	lsr = serial_port_in(port, UART_LSR);
 
@@ -370,7 +371,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port)
 	if (lsr & UART_LSR_THRE)
 		serial8250_tx_chars(up);
 
-	uart_unlock_and_check_sysrq(port);
+	uart_unlock_and_check_sysrq_irqrestore(port, flags);
 
 	return 1;
 }
diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c
index 4e75d2e4f87c..fc65a2293ce9 100644
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -30,10 +30,11 @@ struct fsl8250_data {
 int fsl8250_handle_irq(struct uart_port *port)
 {
 	unsigned char lsr, orig_lsr;
+	unsigned long flags;
 	unsigned int iir;
 	struct uart_8250_port *up = up_to_u8250p(port);
 
-	spin_lock(&up->port.lock);
+	spin_lock_irqsave(&up->port.lock, flags);
 
 	iir = port->serial_in(port, UART_IIR);
 	if (iir & UART_IIR_NO_INT) {
@@ -82,7 +83,7 @@ int fsl8250_handle_irq(struct uart_port *port)
 
 	up->lsr_saved_flags = orig_lsr;
 
-	uart_unlock_and_check_sysrq(&up->port);
+	uart_unlock_and_check_sysrq_irqrestore(&up->port, flags);
 
 	return 1;
 }
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 2e7000f79b03..1da29a219842 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1899,11 +1899,12 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
 	unsigned char status;
 	struct uart_8250_port *up = up_to_u8250p(port);
 	bool skip_rx = false;
+	unsigned long flags;
 
 	if (iir & UART_IIR_NO_INT)
 		return 0;
 
-	spin_lock(&port->lock);
+	spin_lock_irqsave(&port->lock, flags);
 
 	status = serial_port_in(port, UART_LSR);
 
@@ -1929,7 +1930,7 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
 		(up->ier & UART_IER_THRI))
 		serial8250_tx_chars(up);
 
-	uart_unlock_and_check_sysrq(port);
+	uart_unlock_and_check_sysrq_irqrestore(port, flags);
 
 	return 1;
 }
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 52d7fb92a69d..c58cc142d23f 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -518,6 +518,25 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 	if (sysrq_ch)
 		handle_sysrq(sysrq_ch);
 }
+
+static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
+		unsigned long flags)
+{
+	int sysrq_ch;
+
+	if (!port->has_sysrq) {
+		spin_unlock_irqrestore(&port->lock, flags);
+		return;
+	}
+
+	sysrq_ch = port->sysrq_ch;
+	port->sysrq_ch = 0;
+
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	if (sysrq_ch)
+		handle_sysrq(sysrq_ch);
+}
 #else	/* CONFIG_MAGIC_SYSRQ_SERIAL */
 static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch)
 {
@@ -531,6 +550,11 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port)
 {
 	spin_unlock(&port->lock);
 }
+static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port,
+		unsigned long flags)
+{
+	spin_unlock_irqrestore(&port->lock, flags);
+}
 #endif	/* CONFIG_MAGIC_SYSRQ_SERIAL */
 
 /*
-- 
cgit v1.2.3


From 6c881ca0b3040f3e724eae513117ba4ddef86057 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 15 Jun 2021 11:57:26 +0100
Subject: afs: Fix tracepoint string placement with built-in AFS

To quote Alexey[1]:

    I was adding custom tracepoint to the kernel, grabbed full F34 kernel
    .config, disabled modules and booted whole shebang as VM kernel.

    Then did

	perf record -a -e ...

    It crashed:

	general protection fault, probably for non-canonical address 0x435f5346592e4243: 0000 [#1] SMP PTI
	CPU: 1 PID: 842 Comm: cat Not tainted 5.12.6+ #26
	Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014
	RIP: 0010:t_show+0x22/0xd0

    Then reproducer was narrowed to

	# cat /sys/kernel/tracing/printk_formats

    Original F34 kernel with modules didn't crash.

    So I started to disable options and after disabling AFS everything
    started working again.

    The root cause is that AFS was placing char arrays content into a
    section full of _pointers_ to strings with predictable consequences.

    Non canonical address 435f5346592e4243 is "CB.YFS_" which came from
    CM_NAME macro.

    Steps to reproduce:

	CONFIG_AFS=y
	CONFIG_TRACING=y

	# cat /sys/kernel/tracing/printk_formats

Fix this by the following means:

 (1) Add enum->string translation tables in the event header with the AFS
     and YFS cache/callback manager operations listed by RPC operation ID.

 (2) Modify the afs_cb_call tracepoint to print the string from the
     translation table rather than using the string at the afs_call name
     pointer.

 (3) Switch translation table depending on the service we're being accessed
     as (AFS or YFS) in the tracepoint print clause.  Will this cause
     problems to userspace utilities?

     Note that the symbolic representation of the YFS service ID isn't
     available to this header, so I've put it in as a number.  I'm not sure
     if this is the best way to do this.

 (4) Remove the name wrangling (CM_NAME) macro and put the names directly
     into the afs_call_type structs in cmservice.c.

Fixes: 8e8d7f13b6d5a9 ("afs: Add some tracepoints")
Reported-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: linux-afs@lists.infradead.org
Link: https://lore.kernel.org/r/YLAXfvZ+rObEOdc%2F@localhost.localdomain/ [1]
Link: https://lore.kernel.org/r/643721.1623754699@warthog.procyon.org.uk/
Link: https://lore.kernel.org/r/162430903582.2896199.6098150063997983353.stgit@warthog.procyon.org.uk/ # v1
Link: https://lore.kernel.org/r/162609463957.3133237.15916579353149746363.stgit@warthog.procyon.org.uk/ # v1 (repost)
Link: https://lore.kernel.org/r/162610726860.3408253.445207609466288531.stgit@warthog.procyon.org.uk/ # v2
---
 fs/afs/cmservice.c         | 25 +++++------------
 include/trace/events/afs.h | 67 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 69 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d3c6bb22c5f4..a3f5de28be79 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -29,16 +29,11 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
 
 static int afs_deliver_yfs_cb_callback(struct afs_call *);
 
-#define CM_NAME(name) \
-	char afs_SRXCB##name##_name[] __tracepoint_string =	\
-		"CB." #name
-
 /*
  * CB.CallBack operation type
  */
-static CM_NAME(CallBack);
 static const struct afs_call_type afs_SRXCBCallBack = {
-	.name		= afs_SRXCBCallBack_name,
+	.name		= "CB.CallBack",
 	.deliver	= afs_deliver_cb_callback,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_CallBack,
@@ -47,9 +42,8 @@ static const struct afs_call_type afs_SRXCBCallBack = {
 /*
  * CB.InitCallBackState operation type
  */
-static CM_NAME(InitCallBackState);
 static const struct afs_call_type afs_SRXCBInitCallBackState = {
-	.name		= afs_SRXCBInitCallBackState_name,
+	.name		= "CB.InitCallBackState",
 	.deliver	= afs_deliver_cb_init_call_back_state,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_InitCallBackState,
@@ -58,9 +52,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState = {
 /*
  * CB.InitCallBackState3 operation type
  */
-static CM_NAME(InitCallBackState3);
 static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
-	.name		= afs_SRXCBInitCallBackState3_name,
+	.name		= "CB.InitCallBackState3",
 	.deliver	= afs_deliver_cb_init_call_back_state3,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_InitCallBackState,
@@ -69,9 +62,8 @@ static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
 /*
  * CB.Probe operation type
  */
-static CM_NAME(Probe);
 static const struct afs_call_type afs_SRXCBProbe = {
-	.name		= afs_SRXCBProbe_name,
+	.name		= "CB.Probe",
 	.deliver	= afs_deliver_cb_probe,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_Probe,
@@ -80,9 +72,8 @@ static const struct afs_call_type afs_SRXCBProbe = {
 /*
  * CB.ProbeUuid operation type
  */
-static CM_NAME(ProbeUuid);
 static const struct afs_call_type afs_SRXCBProbeUuid = {
-	.name		= afs_SRXCBProbeUuid_name,
+	.name		= "CB.ProbeUuid",
 	.deliver	= afs_deliver_cb_probe_uuid,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_ProbeUuid,
@@ -91,9 +82,8 @@ static const struct afs_call_type afs_SRXCBProbeUuid = {
 /*
  * CB.TellMeAboutYourself operation type
  */
-static CM_NAME(TellMeAboutYourself);
 static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
-	.name		= afs_SRXCBTellMeAboutYourself_name,
+	.name		= "CB.TellMeAboutYourself",
 	.deliver	= afs_deliver_cb_tell_me_about_yourself,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_TellMeAboutYourself,
@@ -102,9 +92,8 @@ static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
 /*
  * YFS CB.CallBack operation type
  */
-static CM_NAME(YFS_CallBack);
 static const struct afs_call_type afs_SRXYFSCB_CallBack = {
-	.name		= afs_SRXCBYFS_CallBack_name,
+	.name		= "YFSCB.CallBack",
 	.deliver	= afs_deliver_yfs_cb_callback,
 	.destructor	= afs_cm_destructor,
 	.work		= SRXAFSCB_CallBack,
diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
index 3ccf591b2374..9f73ed2cf061 100644
--- a/include/trace/events/afs.h
+++ b/include/trace/events/afs.h
@@ -174,6 +174,34 @@ enum afs_vl_operation {
 	afs_VL_GetCapabilities	= 65537,	/* AFS Get VL server capabilities */
 };
 
+enum afs_cm_operation {
+	afs_CB_CallBack			= 204,	/* AFS break callback promises */
+	afs_CB_InitCallBackState	= 205,	/* AFS initialise callback state */
+	afs_CB_Probe			= 206,	/* AFS probe client */
+	afs_CB_GetLock			= 207,	/* AFS get contents of CM lock table */
+	afs_CB_GetCE			= 208,	/* AFS get cache file description */
+	afs_CB_GetXStatsVersion		= 209,	/* AFS get version of extended statistics */
+	afs_CB_GetXStats		= 210,	/* AFS get contents of extended statistics data */
+	afs_CB_InitCallBackState3	= 213,	/* AFS initialise callback state, version 3 */
+	afs_CB_ProbeUuid		= 214,	/* AFS check the client hasn't rebooted */
+};
+
+enum yfs_cm_operation {
+	yfs_CB_Probe			= 206,	/* YFS probe client */
+	yfs_CB_GetLock			= 207,	/* YFS get contents of CM lock table */
+	yfs_CB_XStatsVersion		= 209,	/* YFS get version of extended statistics */
+	yfs_CB_GetXStats		= 210,	/* YFS get contents of extended statistics data */
+	yfs_CB_InitCallBackState3	= 213,	/* YFS initialise callback state, version 3 */
+	yfs_CB_ProbeUuid		= 214,	/* YFS check the client hasn't rebooted */
+	yfs_CB_GetServerPrefs		= 215,
+	yfs_CB_GetCellServDV		= 216,
+	yfs_CB_GetLocalCell		= 217,
+	yfs_CB_GetCacheConfig		= 218,
+	yfs_CB_GetCellByNum		= 65537,
+	yfs_CB_TellMeAboutYourself	= 65538, /* get client capabilities */
+	yfs_CB_CallBack			= 64204,
+};
+
 enum afs_edit_dir_op {
 	afs_edit_dir_create,
 	afs_edit_dir_create_error,
@@ -436,6 +464,32 @@ enum afs_cb_break_reason {
 	EM(afs_YFSVL_GetCellName,		"YFSVL.GetCellName") \
 	E_(afs_VL_GetCapabilities,		"VL.GetCapabilities")
 
+#define afs_cm_operations \
+	EM(afs_CB_CallBack,			"CB.CallBack") \
+	EM(afs_CB_InitCallBackState,		"CB.InitCallBackState") \
+	EM(afs_CB_Probe,			"CB.Probe") \
+	EM(afs_CB_GetLock,			"CB.GetLock") \
+	EM(afs_CB_GetCE,			"CB.GetCE") \
+	EM(afs_CB_GetXStatsVersion,		"CB.GetXStatsVersion") \
+	EM(afs_CB_GetXStats,			"CB.GetXStats") \
+	EM(afs_CB_InitCallBackState3,		"CB.InitCallBackState3") \
+	E_(afs_CB_ProbeUuid,			"CB.ProbeUuid")
+
+#define yfs_cm_operations \
+	EM(yfs_CB_Probe,			"YFSCB.Probe") \
+	EM(yfs_CB_GetLock,			"YFSCB.GetLock") \
+	EM(yfs_CB_XStatsVersion,		"YFSCB.XStatsVersion") \
+	EM(yfs_CB_GetXStats,			"YFSCB.GetXStats") \
+	EM(yfs_CB_InitCallBackState3,		"YFSCB.InitCallBackState3") \
+	EM(yfs_CB_ProbeUuid,			"YFSCB.ProbeUuid") \
+	EM(yfs_CB_GetServerPrefs,		"YFSCB.GetServerPrefs") \
+	EM(yfs_CB_GetCellServDV,		"YFSCB.GetCellServDV") \
+	EM(yfs_CB_GetLocalCell,			"YFSCB.GetLocalCell") \
+	EM(yfs_CB_GetCacheConfig,		"YFSCB.GetCacheConfig") \
+	EM(yfs_CB_GetCellByNum,			"YFSCB.GetCellByNum") \
+	EM(yfs_CB_TellMeAboutYourself,		"YFSCB.TellMeAboutYourself") \
+	E_(yfs_CB_CallBack,			"YFSCB.CallBack")
+
 #define afs_edit_dir_ops				  \
 	EM(afs_edit_dir_create,			"create") \
 	EM(afs_edit_dir_create_error,		"c_fail") \
@@ -569,6 +623,8 @@ afs_server_traces;
 afs_cell_traces;
 afs_fs_operations;
 afs_vl_operations;
+afs_cm_operations;
+yfs_cm_operations;
 afs_edit_dir_ops;
 afs_edit_dir_reasons;
 afs_eproto_causes;
@@ -649,20 +705,21 @@ TRACE_EVENT(afs_cb_call,
 
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		call		)
-		    __field(const char *,		name		)
 		    __field(u32,			op		)
+		    __field(u16,			service_id	)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->call	= call->debug_id;
-		    __entry->name	= call->type->name;
 		    __entry->op		= call->operation_ID;
+		    __entry->service_id	= call->service_id;
 			   ),
 
-	    TP_printk("c=%08x %s o=%u",
+	    TP_printk("c=%08x %s",
 		      __entry->call,
-		      __entry->name,
-		      __entry->op)
+		      __entry->service_id == 2501 ?
+		      __print_symbolic(__entry->op, yfs_cm_operations) :
+		      __print_symbolic(__entry->op, afs_cm_operations))
 	    );
 
 TRACE_EVENT(afs_call,
-- 
cgit v1.2.3


From 1e7107c5ef44431bc1ebbd4c353f1d7c22e5f2ec Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 16 Jun 2021 08:51:57 -0400
Subject: cgroup1: fix leaked context root causing sporadic NULL deref in LTP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Richard reported sporadic (roughly one in 10 or so) null dereferences and
other strange behaviour for a set of automated LTP tests.  Things like:

   BUG: kernel NULL pointer dereference, address: 0000000000000008
   #PF: supervisor read access in kernel mode
   #PF: error_code(0x0000) - not-present page
   PGD 0 P4D 0
   Oops: 0000 [#1] PREEMPT SMP PTI
   CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1
   Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014
   RIP: 0010:kernfs_sop_show_path+0x1b/0x60

...or these others:

   RIP: 0010:do_mkdirat+0x6a/0xf0
   RIP: 0010:d_alloc_parallel+0x98/0x510
   RIP: 0010:do_readlinkat+0x86/0x120

There were other less common instances of some kind of a general scribble
but the common theme was mount and cgroup and a dubious dentry triggering
the NULL dereference.  I was only able to reproduce it under qemu by
replicating Richard's setup as closely as possible - I never did get it
to happen on bare metal, even while keeping everything else the same.

In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions")
we see this as a part of the overall change:

   --------------
           struct cgroup_subsys *ss;
   -       struct dentry *dentry;

   [...]

   -       dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root,
   -                                CGROUP_SUPER_MAGIC, ns);

   [...]

   -       if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
   -               struct super_block *sb = dentry->d_sb;
   -               dput(dentry);
   +       ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns);
   +       if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
   +               struct super_block *sb = fc->root->d_sb;
   +               dput(fc->root);
                   deactivate_locked_super(sb);
                   msleep(10);
                   return restart_syscall();
           }
   --------------

In changing from the local "*dentry" variable to using fc->root, we now
export/leave that dentry pointer in the file context after doing the dput()
in the unlikely "is_dying" case.   With LTP doing a crazy amount of back to
back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely
becomes slightly likely and then bad things happen.

A fix would be to not leave the stale reference in fc->root as follows:

   --------------
                  dput(fc->root);
  +               fc->root = NULL;
                  deactivate_locked_super(sb);
   --------------

...but then we are just open-coding a duplicate of fc_drop_locked() so we
simply use that instead.

Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: stable@vger.kernel.org      # v5.1+
Reported-by: Richard Purdie <richard.purdie@linuxfoundation.org>
Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions")
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 fs/internal.h              | 1 -
 include/linux/fs_context.h | 1 +
 kernel/cgroup/cgroup-v1.c  | 4 +---
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/internal.h b/fs/internal.h
index 3ce8edbaa3ca..82e8eb32ff3d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,7 +61,6 @@ extern void __init chrdev_init(void);
  */
 extern const struct fs_context_operations legacy_fs_context_ops;
 extern int parse_monolithic_mount_data(struct fs_context *, void *);
-extern void fc_drop_locked(struct fs_context *);
 extern void vfs_clean_context(struct fs_context *fc);
 extern int finish_clean_context(struct fs_context *fc);
 
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index e2bc16300c82..6b54982fc5f3 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -141,6 +141,7 @@ extern int vfs_get_tree(struct fs_context *fc);
 extern void put_fs_context(struct fs_context *fc);
 extern int vfs_parse_fs_param_source(struct fs_context *fc,
 				     struct fs_parameter *param);
+extern void fc_drop_locked(struct fs_context *fc);
 
 /*
  * sget() wrappers to be called from the ->get_tree() op.
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 8d6bf56ed77a..de2c432dee20 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1221,9 +1221,7 @@ int cgroup1_get_tree(struct fs_context *fc)
 		ret = cgroup_do_get_tree(fc);
 
 	if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
-		struct super_block *sb = fc->root->d_sb;
-		dput(fc->root);
-		deactivate_locked_super(sb);
+		fc_drop_locked(fc);
 		ret = 1;
 	}
 
-- 
cgit v1.2.3


From 8dad53a11f8d94dceb540a5f8f153484f42be84b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 23 Jul 2021 15:50:17 -0700
Subject: mm: call flush_dcache_page() in memcpy_to_page() and memzero_page()

memcpy_to_page and memzero_page can write to arbitrary pages, which
could be in the page cache or in high memory, so call
flush_kernel_dcache_pages to flush the dcache.

This is a problem when using these helpers on dcache challeneged
architectures.  Right now there are just a few users, chances are no one
used the PC floppy driver, the aha1542 driver for an ISA SCSI HBA, and a
few advanced and optional btrfs and ext4 features on those platforms yet
since the conversion.

Link: https://lkml.kernel.org/r/20210713055231.137602-2-hch@lst.de
Fixes: bb90d4bc7b6a ("mm/highmem: Lift memcpy_[to|from]_page to core")
Fixes: 28961998f858 ("iov_iter: lift memzero_page() to highmem.h")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Cc: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 8c6e8e996c87..8e7e50a53a12 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -318,6 +318,7 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
 
 	VM_BUG_ON(offset + len > PAGE_SIZE);
 	memcpy(to + offset, from, len);
+	flush_dcache_page(page);
 	kunmap_local(to);
 }
 
@@ -325,6 +326,7 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len)
 {
 	char *addr = kmap_atomic(page);
 	memset(addr + offset, 0, len);
+	flush_dcache_page(page);
 	kunmap_atomic(addr);
 }
 
-- 
cgit v1.2.3


From d9a42b53bdf7b0329dc09a59fc1b092640b6da19 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 23 Jul 2021 15:50:20 -0700
Subject: mm: use kmap_local_page in memzero_page

The commit message introducing the global memzero_page explicitly
mentions switching to kmap_local_page in the commit log but doesn't
actually do that.

Link: https://lkml.kernel.org/r/20210713055231.137602-3-hch@lst.de
Fixes: 28961998f858 ("iov_iter: lift memzero_page() to highmem.h")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/highmem.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 8e7e50a53a12..d9a606a9fc64 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -324,10 +324,10 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
 
 static inline void memzero_page(struct page *page, size_t offset, size_t len)
 {
-	char *addr = kmap_atomic(page);
+	char *addr = kmap_local_page(page);
 	memset(addr + offset, 0, len);
 	flush_dcache_page(page);
-	kunmap_atomic(addr);
+	kunmap_local(addr);
 }
 
 #endif /* _LINUX_HIGHMEM_H */
-- 
cgit v1.2.3


From 79e482e9c3ae86e849c701c846592e72baddda5a Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Fri, 23 Jul 2021 15:50:26 -0700
Subject: memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions

Commit b10d6bca8720 ("arch, drivers: replace for_each_membock() with
for_each_mem_range()") didn't take into account that when there is
movable_node parameter in the kernel command line, for_each_mem_range()
would skip ranges marked with MEMBLOCK_HOTPLUG.

The page table setup code in POWER uses for_each_mem_range() to create
the linear mapping of the physical memory and since the regions marked
as MEMORY_HOTPLUG are skipped, they never make it to the linear map.

A later access to the memory in those ranges will fail:

  BUG: Unable to handle kernel data access on write at 0xc000000400000000
  Faulting instruction address: 0xc00000000008a3c0
  Oops: Kernel access of bad area, sig: 11 [#1]
  LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
  Modules linked in:
  CPU: 0 PID: 53 Comm: kworker/u2:0 Not tainted 5.13.0 #7
  NIP:  c00000000008a3c0 LR: c0000000003c1ed8 CTR: 0000000000000040
  REGS: c000000008a57770 TRAP: 0300   Not tainted  (5.13.0)
  MSR:  8000000002009033 <SF,VEC,EE,ME,IR,DR,RI,LE>  CR: 84222202  XER: 20040000
  CFAR: c0000000003c1ed4 DAR: c000000400000000 DSISR: 42000000 IRQMASK: 0
  GPR00: c0000000003c1ed8 c000000008a57a10 c0000000019da700 c000000400000000
  GPR04: 0000000000000280 0000000000000180 0000000000000400 0000000000000200
  GPR08: 0000000000000100 0000000000000080 0000000000000040 0000000000000300
  GPR12: 0000000000000380 c000000001bc0000 c0000000001660c8 c000000006337e00
  GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
  GPR20: 0000000040000000 0000000020000000 c000000001a81990 c000000008c30000
  GPR24: c000000008c20000 c000000001a81998 000fffffffff0000 c000000001a819a0
  GPR28: c000000001a81908 c00c000001000000 c000000008c40000 c000000008a64680
  NIP clear_user_page+0x50/0x80
  LR __handle_mm_fault+0xc88/0x1910
  Call Trace:
    __handle_mm_fault+0xc44/0x1910 (unreliable)
    handle_mm_fault+0x130/0x2a0
    __get_user_pages+0x248/0x610
    __get_user_pages_remote+0x12c/0x3e0
    get_arg_page+0x54/0xf0
    copy_string_kernel+0x11c/0x210
    kernel_execve+0x16c/0x220
    call_usermodehelper_exec_async+0x1b0/0x2f0
    ret_from_kernel_thread+0x5c/0x70
  Instruction dump:
  79280fa4 79271764 79261f24 794ae8e2 7ca94214 7d683a14 7c893a14 7d893050
  7d4903a6 60000000 60000000 60000000 <7c001fec> 7c091fec 7c081fec 7c051fec
  ---[ end trace 490b8c67e6075e09 ]---

Making for_each_mem_range() include MEMBLOCK_HOTPLUG regions in the
traversal fixes this issue.

Link: https://bugzilla.redhat.com/show_bug.cgi?id=1976100
Link: https://lkml.kernel.org/r/20210712071132.20902-1-rppt@kernel.org
Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()")
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Tested-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: <stable@vger.kernel.org>	[5.10+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memblock.h | 4 ++--
 mm/memblock.c            | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index cbf46f56d105..4a53c3ca86bd 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -209,7 +209,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
  */
 #define for_each_mem_range(i, p_start, p_end) \
 	__for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,	\
-			     MEMBLOCK_NONE, p_start, p_end, NULL)
+			     MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
 
 /**
  * for_each_mem_range_rev - reverse iterate through memblock areas from
@@ -220,7 +220,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
  */
 #define for_each_mem_range_rev(i, p_start, p_end)			\
 	__for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
-				 MEMBLOCK_NONE, p_start, p_end, NULL)
+				 MEMBLOCK_HOTPLUG, p_start, p_end, NULL)
 
 /**
  * for_each_reserved_mem_range - iterate over all reserved memblock areas
diff --git a/mm/memblock.c b/mm/memblock.c
index 0041ff62c584..de7b553baa50 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -947,7 +947,8 @@ static bool should_skip_region(struct memblock_type *type,
 		return true;
 
 	/* skip hotpluggable memory regions if needed */
-	if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+	if (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
+	    !(flags & MEMBLOCK_HOTPLUG))
 		return true;
 
 	/* if we want mirror memory skip non-mirror memory regions */
-- 
cgit v1.2.3


From fc68f42aa737dc15e7665a4101d4168aadb8e4c4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 24 Jul 2021 15:25:54 -0700
Subject: ACPI: fix NULL pointer dereference

Commit 71f642833284 ("ACPI: utils: Fix reference counting in
for_each_acpi_dev_match()") started doing "acpi_dev_put()" on a pointer
that was possibly NULL.  That fails miserably, because that helper
inline function is not set up to handle that case.

Just make acpi_dev_put() silently accept a NULL pointer, rather than
calling down to put_device() with an invalid offset off that NULL
pointer.

Link: https://lore.kernel.org/lkml/a607c149-6bf6-0fd0-0e31-100378504da2@kernel.dk/
Reported-and-tested-by: Jens Axboe <axboe@kernel.dk>
Tested-by: Daniel Scally <djrscally@gmail.com>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/acpi/acpi_bus.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index b9d434a93632..13d93371790e 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -720,7 +720,8 @@ static inline struct acpi_device *acpi_dev_get(struct acpi_device *adev)
 
 static inline void acpi_dev_put(struct acpi_device *adev)
 {
-	put_device(&adev->dev);
+	if (adev)
+		put_device(&adev->dev);
 }
 
 struct acpi_device *acpi_bus_get_acpi_device(acpi_handle handle);
-- 
cgit v1.2.3


From 058e6e0ed0eace43401c945082dec1d669b5b231 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 25 Jul 2021 13:42:50 -0400
Subject: sctp: improve the code for pmtu probe send and recv update

This patch does 3 things:

  - make sctp_transport_pl_send() and sctp_transport_pl_recv()
    return bool type to decide if more probe is needed to send.

  - pr_debug() only when probe is really needed to send.

  - count pl.raise_count in sctp_transport_pl_send() instead of
    sctp_transport_pl_recv(), and it's only incremented for the
    1st probe for the same size.

These are preparations for the next patch to make probes happen
only when there's packet loss in Search Complete state.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h |  4 ++--
 net/sctp/sm_statefuns.c    | 15 +++++++--------
 net/sctp/transport.c       | 41 +++++++++++++++++++++++------------------
 3 files changed, 32 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 32fc4a309df5..f3d414ed208e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1024,8 +1024,8 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu);
 void sctp_transport_immediate_rtx(struct sctp_transport *);
 void sctp_transport_dst_release(struct sctp_transport *t);
 void sctp_transport_dst_confirm(struct sctp_transport *t);
-void sctp_transport_pl_send(struct sctp_transport *t);
-void sctp_transport_pl_recv(struct sctp_transport *t);
+bool sctp_transport_pl_send(struct sctp_transport *t);
+bool sctp_transport_pl_recv(struct sctp_transport *t);
 
 
 /* This is the structure we use to queue packets as they come into
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 09a8f23ec709..32df65f68c12 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -1109,12 +1109,12 @@ enum sctp_disposition sctp_sf_send_probe(struct net *net,
 	if (!sctp_transport_pl_enabled(transport))
 		return SCTP_DISPOSITION_CONSUME;
 
-	sctp_transport_pl_send(transport);
-
-	reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size);
-	if (!reply)
-		return SCTP_DISPOSITION_NOMEM;
-	sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+	if (sctp_transport_pl_send(transport)) {
+		reply = sctp_make_heartbeat(asoc, transport, transport->pl.probe_size);
+		if (!reply)
+			return SCTP_DISPOSITION_NOMEM;
+		sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply));
+	}
 	sctp_add_cmd_sf(commands, SCTP_CMD_PROBE_TIMER_UPDATE,
 			SCTP_TRANSPORT(transport));
 
@@ -1274,8 +1274,7 @@ enum sctp_disposition sctp_sf_backbeat_8_3(struct net *net,
 		    !sctp_transport_pl_enabled(link))
 			return SCTP_DISPOSITION_DISCARD;
 
-		sctp_transport_pl_recv(link);
-		if (link->pl.state == SCTP_PL_COMPLETE)
+		if (sctp_transport_pl_recv(link))
 			return SCTP_DISPOSITION_CONSUME;
 
 		return sctp_sf_send_probe(net, ep, asoc, type, link, commands);
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 397a6244dd97..23e7bd3e3bd4 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -258,16 +258,12 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
 	sctp_transport_pl_update(transport);
 }
 
-void sctp_transport_pl_send(struct sctp_transport *t)
+bool sctp_transport_pl_send(struct sctp_transport *t)
 {
-	pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
-		 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);
-
-	if (t->pl.probe_count < SCTP_MAX_PROBES) {
-		t->pl.probe_count++;
-		return;
-	}
+	if (t->pl.probe_count < SCTP_MAX_PROBES)
+		goto out;
 
+	t->pl.probe_count = 0;
 	if (t->pl.state == SCTP_PL_BASE) {
 		if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */
 			t->pl.state = SCTP_PL_ERROR; /* Base -> Error */
@@ -299,10 +295,20 @@ void sctp_transport_pl_send(struct sctp_transport *t)
 			sctp_assoc_sync_pmtu(t->asoc);
 		}
 	}
-	t->pl.probe_count = 1;
+
+out:
+	if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count < 30 &&
+	    !t->pl.probe_count)
+		t->pl.raise_count++;
+
+	pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
+		 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);
+
+	t->pl.probe_count++;
+	return true;
 }
 
-void sctp_transport_pl_recv(struct sctp_transport *t)
+bool sctp_transport_pl_recv(struct sctp_transport *t)
 {
 	pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
 		 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);
@@ -323,7 +329,7 @@ void sctp_transport_pl_recv(struct sctp_transport *t)
 		if (!t->pl.probe_high) {
 			t->pl.probe_size = min(t->pl.probe_size + SCTP_PL_BIG_STEP,
 					       SCTP_MAX_PLPMTU);
-			return;
+			return false;
 		}
 		t->pl.probe_size += SCTP_PL_MIN_STEP;
 		if (t->pl.probe_size >= t->pl.probe_high) {
@@ -335,14 +341,13 @@ void sctp_transport_pl_recv(struct sctp_transport *t)
 			t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t);
 			sctp_assoc_sync_pmtu(t->asoc);
 		}
-	} else if (t->pl.state == SCTP_PL_COMPLETE) {
-		t->pl.raise_count++;
-		if (t->pl.raise_count == 30) {
-			/* Raise probe_size again after 30 * interval in Search Complete */
-			t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
-			t->pl.probe_size += SCTP_PL_MIN_STEP;
-		}
+	} else if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count == 30) {
+		/* Raise probe_size again after 30 * interval in Search Complete */
+		t->pl.state = SCTP_PL_SEARCH; /* Search Complete -> Search */
+		t->pl.probe_size += SCTP_PL_MIN_STEP;
 	}
+
+	return t->pl.state == SCTP_PL_COMPLETE;
 }
 
 static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu)
-- 
cgit v1.2.3


From eacf078cf4c7aa23e9591738511f142cc39b5186 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 25 Jul 2021 13:42:51 -0400
Subject: sctp: send pmtu probe only if packet loss in Search Complete state

This patch is to introduce last_rtx_chunks into sctp_transport to detect
if there's any packet retransmission/loss happened by checking against
asoc's rtx_data_chunks in sctp_transport_pl_send().

If there is, namely, transport->last_rtx_chunks != asoc->rtx_data_chunks,
the pmtu probe will be sent out. Otherwise, increment the pl.raise_count
and return when it's in Search Complete state.

With this patch, if in Search Complete state, which is a long period, it
doesn't need to keep probing the current pmtu unless there's data packet
loss. This will save quite some traffic.

v1->v2:
  - add the missing Fixes tag.

Fixes: 0dac127c0557 ("sctp: do black hole detection in search complete state")
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sctp/structs.h | 1 +
 net/sctp/transport.c       | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index f3d414ed208e..651bba654d77 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -984,6 +984,7 @@ struct sctp_transport {
 	} cacc;
 
 	struct {
+		__u32 last_rtx_chunks;
 		__u16 pmtu;
 		__u16 probe_size;
 		__u16 probe_high;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 23e7bd3e3bd4..a3d3ca6dd63d 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -263,6 +263,7 @@ bool sctp_transport_pl_send(struct sctp_transport *t)
 	if (t->pl.probe_count < SCTP_MAX_PROBES)
 		goto out;
 
+	t->pl.last_rtx_chunks = t->asoc->rtx_data_chunks;
 	t->pl.probe_count = 0;
 	if (t->pl.state == SCTP_PL_BASE) {
 		if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */
@@ -298,8 +299,10 @@ bool sctp_transport_pl_send(struct sctp_transport *t)
 
 out:
 	if (t->pl.state == SCTP_PL_COMPLETE && t->pl.raise_count < 30 &&
-	    !t->pl.probe_count)
+	    !t->pl.probe_count && t->pl.last_rtx_chunks == t->asoc->rtx_data_chunks) {
 		t->pl.raise_count++;
+		return false;
+	}
 
 	pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
 		 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);
@@ -313,6 +316,7 @@ bool sctp_transport_pl_recv(struct sctp_transport *t)
 	pr_debug("%s: PLPMTUD: transport: %p, state: %d, pmtu: %d, size: %d, high: %d\n",
 		 __func__, t, t->pl.state, t->pl.pmtu, t->pl.probe_size, t->pl.probe_high);
 
+	t->pl.last_rtx_chunks = t->asoc->rtx_data_chunks;
 	t->pl.pmtu = t->pl.probe_size;
 	t->pl.probe_count = 0;
 	if (t->pl.state == SCTP_PL_BASE) {
-- 
cgit v1.2.3


From c7c9d2102c9c098916ab9e0ab248006107d00d6c Mon Sep 17 00:00:00 2001
From: Pavel Skripkin <paskripkin@gmail.com>
Date: Sun, 25 Jul 2021 00:11:59 +0300
Subject: net: llc: fix skb_over_panic

Syzbot reported skb_over_panic() in llc_pdu_init_as_xid_cmd(). The
problem was in wrong LCC header manipulations.

Syzbot's reproducer tries to send XID packet. llc_ui_sendmsg() is
doing following steps:

	1. skb allocation with size = len + header size
		len is passed from userpace and header size
		is 3 since addr->sllc_xid is set.

	2. skb_reserve() for header_len = 3
	3. filling all other space with memcpy_from_msg()

Ok, at this moment we have fully loaded skb, only headers needs to be
filled.

Then code comes to llc_sap_action_send_xid_c(). This function pushes 3
bytes for LLC PDU header and initializes it. Then comes
llc_pdu_init_as_xid_cmd(). It initalizes next 3 bytes *AFTER* LLC PDU
header and call skb_push(skb, 3). This looks wrong for 2 reasons:

	1. Bytes rigth after LLC header are user data, so this function
	   was overwriting payload.

	2. skb_push(skb, 3) call can cause skb_over_panic() since
	   all free space was filled in llc_ui_sendmsg(). (This can
	   happen is user passed 686 len: 686 + 14 (eth header) + 3 (LLC
	   header) = 703. SKB_DATA_ALIGN(703) = 704)

So, in this patch I added 2 new private constansts: LLC_PDU_TYPE_U_XID
and LLC_PDU_LEN_U_XID. LLC_PDU_LEN_U_XID is used to correctly reserve
header size to handle LLC + XID case. LLC_PDU_TYPE_U_XID is used by
llc_pdu_header_init() function to push 6 bytes instead of 3. And finally
I removed skb_push() call from llc_pdu_init_as_xid_cmd().

This changes should not affect other parts of LLC, since after
all steps we just transmit buffer.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-and-tested-by: syzbot+5e5a981ad7cc54c4b2b4@syzkaller.appspotmail.com
Signed-off-by: Pavel Skripkin <paskripkin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/llc_pdu.h | 31 +++++++++++++++++++++++--------
 net/llc/af_llc.c      | 10 +++++++++-
 net/llc/llc_s_ac.c    |  2 +-
 3 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h
index c0f0a13ed818..49aa79c7b278 100644
--- a/include/net/llc_pdu.h
+++ b/include/net/llc_pdu.h
@@ -15,9 +15,11 @@
 #include <linux/if_ether.h>
 
 /* Lengths of frame formats */
-#define LLC_PDU_LEN_I	4       /* header and 2 control bytes */
-#define LLC_PDU_LEN_S	4
-#define LLC_PDU_LEN_U	3       /* header and 1 control byte */
+#define LLC_PDU_LEN_I		4       /* header and 2 control bytes */
+#define LLC_PDU_LEN_S		4
+#define LLC_PDU_LEN_U		3       /* header and 1 control byte */
+/* header and 1 control byte and XID info */
+#define LLC_PDU_LEN_U_XID	(LLC_PDU_LEN_U + sizeof(struct llc_xid_info))
 /* Known SAP addresses */
 #define LLC_GLOBAL_SAP	0xFF
 #define LLC_NULL_SAP	0x00	/* not network-layer visible */
@@ -50,9 +52,10 @@
 #define LLC_PDU_TYPE_U_MASK    0x03	/* 8-bit control field */
 #define LLC_PDU_TYPE_MASK      0x03
 
-#define LLC_PDU_TYPE_I	0	/* first bit */
-#define LLC_PDU_TYPE_S	1	/* first two bits */
-#define LLC_PDU_TYPE_U	3	/* first two bits */
+#define LLC_PDU_TYPE_I		0	/* first bit */
+#define LLC_PDU_TYPE_S		1	/* first two bits */
+#define LLC_PDU_TYPE_U		3	/* first two bits */
+#define LLC_PDU_TYPE_U_XID	4	/* private type for detecting XID commands */
 
 #define LLC_PDU_TYPE_IS_I(pdu) \
 	((!(pdu->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0)
@@ -230,9 +233,18 @@ static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb)
 static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type,
 				       u8 ssap, u8 dsap, u8 cr)
 {
-	const int hlen = type == LLC_PDU_TYPE_U ? 3 : 4;
+	int hlen = 4; /* default value for I and S types */
 	struct llc_pdu_un *pdu;
 
+	switch (type) {
+	case LLC_PDU_TYPE_U:
+		hlen = 3;
+		break;
+	case LLC_PDU_TYPE_U_XID:
+		hlen = 6;
+		break;
+	}
+
 	skb_push(skb, hlen);
 	skb_reset_network_header(skb);
 	pdu = llc_pdu_un_hdr(skb);
@@ -374,7 +386,10 @@ static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb,
 	xid_info->fmt_id = LLC_XID_FMT_ID;	/* 0x81 */
 	xid_info->type	 = svcs_supported;
 	xid_info->rw	 = rx_window << 1;	/* size of receive window */
-	skb_put(skb, sizeof(struct llc_xid_info));
+
+	/* no need to push/put since llc_pdu_header_init() has already
+	 * pushed 3 + 3 bytes
+	 */
 }
 
 /**
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 7180979114e4..ac5cadd02cfa 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -98,8 +98,16 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr)
 {
 	u8 rc = LLC_PDU_LEN_U;
 
-	if (addr->sllc_test || addr->sllc_xid)
+	if (addr->sllc_test)
 		rc = LLC_PDU_LEN_U;
+	else if (addr->sllc_xid)
+		/* We need to expand header to sizeof(struct llc_xid_info)
+		 * since llc_pdu_init_as_xid_cmd() sets 4,5,6 bytes of LLC header
+		 * as XID PDU. In llc_ui_sendmsg() we reserved header size and then
+		 * filled all other space with user data. If we won't reserve this
+		 * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data
+		 */
+		rc = LLC_PDU_LEN_U_XID;
 	else if (sk->sk_type == SOCK_STREAM)
 		rc = LLC_PDU_LEN_I;
 	return rc;
diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
index b554f26c68ee..79d1cef8f15a 100644
--- a/net/llc/llc_s_ac.c
+++ b/net/llc/llc_s_ac.c
@@ -79,7 +79,7 @@ int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb)
 	struct llc_sap_state_ev *ev = llc_sap_ev(skb);
 	int rc;
 
-	llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+	llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap,
 			    ev->daddr.lsap, LLC_PDU_CMD);
 	llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
 	rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
-- 
cgit v1.2.3


From bf88fef0b6f1488abeca594d377991171c00e52a Mon Sep 17 00:00:00 2001
From: Dmitry Osipenko <digetx@gmail.com>
Date: Sat, 17 Jul 2021 21:21:27 +0300
Subject: usb: otg-fsm: Fix hrtimer list corruption

The HNP work can be re-scheduled while it's still in-fly. This results in
re-initialization of the busy work, resetting the hrtimer's list node of
the work and crashing kernel with null dereference within kernel/timer
once work's timer is expired. It's very easy to trigger this problem by
re-plugging USB cable quickly. Initialize HNP work only once to fix this
trouble.

 Unable to handle kernel NULL pointer dereference at virtual address 00000126)
 ...
 PC is at __run_timers.part.0+0x150/0x228
 LR is at __next_timer_interrupt+0x51/0x9c
 ...
 (__run_timers.part.0) from [<c0187a2b>] (run_timer_softirq+0x2f/0x50)
 (run_timer_softirq) from [<c01013ad>] (__do_softirq+0xd5/0x2f0)
 (__do_softirq) from [<c012589b>] (irq_exit+0xab/0xb8)
 (irq_exit) from [<c0170341>] (handle_domain_irq+0x45/0x60)
 (handle_domain_irq) from [<c04c4a43>] (gic_handle_irq+0x6b/0x7c)
 (gic_handle_irq) from [<c0100b65>] (__irq_svc+0x65/0xac)

Cc: stable@vger.kernel.org
Acked-by: Peter Chen <peter.chen@kernel.org>
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
Link: https://lore.kernel.org/r/20210717182134.30262-6-digetx@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/common/usb-otg-fsm.c | 6 +++++-
 include/linux/usb/otg-fsm.h      | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c
index 3740cf95560e..0697fde51d00 100644
--- a/drivers/usb/common/usb-otg-fsm.c
+++ b/drivers/usb/common/usb-otg-fsm.c
@@ -193,7 +193,11 @@ static void otg_start_hnp_polling(struct otg_fsm *fsm)
 	if (!fsm->host_req_flag)
 		return;
 
-	INIT_DELAYED_WORK(&fsm->hnp_polling_work, otg_hnp_polling_work);
+	if (!fsm->hnp_work_inited) {
+		INIT_DELAYED_WORK(&fsm->hnp_polling_work, otg_hnp_polling_work);
+		fsm->hnp_work_inited = true;
+	}
+
 	schedule_delayed_work(&fsm->hnp_polling_work,
 					msecs_to_jiffies(T_HOST_REQ_POLL));
 }
diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h
index 3aee78dda16d..784659d4dc99 100644
--- a/include/linux/usb/otg-fsm.h
+++ b/include/linux/usb/otg-fsm.h
@@ -196,6 +196,7 @@ struct otg_fsm {
 	struct mutex lock;
 	u8 *host_req_flag;
 	struct delayed_work hnp_polling_work;
+	bool hnp_work_inited;
 	bool state_changed;
 };
 
-- 
cgit v1.2.3


From 9635720b7c88592214562cb72605bdab6708006c Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Tue, 27 Jul 2021 09:05:00 -0700
Subject: bpf, sockmap: Fix memleak on ingress msg enqueue

If backlog handler is running during a tear down operation we may enqueue
data on the ingress msg queue while tear down is trying to free it.

 sk_psock_backlog()
   sk_psock_handle_skb()
     skb_psock_skb_ingress()
       sk_psock_skb_ingress_enqueue()
         sk_psock_queue_msg(psock,msg)
                                           spin_lock(ingress_lock)
                                            sk_psock_zap_ingress()
                                             _sk_psock_purge_ingerss_msg()
                                              _sk_psock_purge_ingress_msg()
                                            -- free ingress_msg list --
                                           spin_unlock(ingress_lock)
           spin_lock(ingress_lock)
           list_add_tail(msg,ingress_msg) <- entry on list with no one
                                             left to free it.
           spin_unlock(ingress_lock)

To fix we only enqueue from backlog if the ENABLED bit is set. The tear
down logic clears the bit with ingress_lock set so we wont enqueue the
msg in the last step.

Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210727160500.1713554-4-john.fastabend@gmail.com
---
 include/linux/skmsg.h | 54 +++++++++++++++++++++++++++++++++------------------
 net/core/skmsg.c      |  6 ------
 2 files changed, 35 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 96f319099744..14ab0c0bc924 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -285,11 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+static inline void sk_psock_set_state(struct sk_psock *psock,
+				      enum sk_psock_state_bits bit)
+{
+	set_bit(bit, &psock->state);
+}
+
+static inline void sk_psock_clear_state(struct sk_psock *psock,
+					enum sk_psock_state_bits bit)
+{
+	clear_bit(bit, &psock->state);
+}
+
+static inline bool sk_psock_test_state(const struct sk_psock *psock,
+				       enum sk_psock_state_bits bit)
+{
+	return test_bit(bit, &psock->state);
+}
+
+static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
+{
+	sk_drops_add(sk, skb);
+	kfree_skb(skb);
+}
+
+static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg)
+{
+	if (msg->skb)
+		sock_drop(psock->sk, msg->skb);
+	kfree(msg);
+}
+
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
 	spin_lock_bh(&psock->ingress_lock);
-	list_add_tail(&msg->list, &psock->ingress_msg);
+	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+		list_add_tail(&msg->list, &psock->ingress_msg);
+	else
+		drop_sk_msg(psock, msg);
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
@@ -406,24 +440,6 @@ static inline void sk_psock_restore_proto(struct sock *sk,
 		psock->psock_update_sk_prot(sk, psock, true);
 }
 
-static inline void sk_psock_set_state(struct sk_psock *psock,
-				      enum sk_psock_state_bits bit)
-{
-	set_bit(bit, &psock->state);
-}
-
-static inline void sk_psock_clear_state(struct sk_psock *psock,
-					enum sk_psock_state_bits bit)
-{
-	clear_bit(bit, &psock->state);
-}
-
-static inline bool sk_psock_test_state(const struct sk_psock *psock,
-				       enum sk_psock_state_bits bit)
-{
-	return test_bit(bit, &psock->state);
-}
-
 static inline struct sk_psock *sk_psock_get(struct sock *sk)
 {
 	struct sk_psock *psock;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 036cdb33a94a..2d6249b28928 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -584,12 +584,6 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	return sk_psock_skb_ingress(psock, skb);
 }
 
-static void sock_drop(struct sock *sk, struct sk_buff *skb)
-{
-	sk_drops_add(sk, skb);
-	kfree_skb(skb);
-}
-
 static void sk_psock_skb_state(struct sk_psock *psock,
 			       struct sk_psock_work_state *state,
 			       struct sk_buff *skb,
-- 
cgit v1.2.3


From 25905f602fdb0cfa147017056636768a7aa1ff6f Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Wed, 21 Jul 2021 12:25:20 -0700
Subject: dmaengine: idxd: Change license on idxd.h to LGPL

This file was given GPL-2.0 license. But LGPL-2.1 makes more sense
as it needs to be used by libraries outside of the kernel source tree.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/idxd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/idxd.h b/include/uapi/linux/idxd.h
index e33997b4d750..edc346a77c91 100644
--- a/include/uapi/linux/idxd.h
+++ b/include/uapi/linux/idxd.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: LGPL-2.1 WITH Linux-syscall-note */
 /* Copyright(c) 2019 Intel Corporation. All rights rsvd. */
 #ifndef _USR_IDXD_H_
 #define _USR_IDXD_H_
-- 
cgit v1.2.3


From f5e81d1117501546b7be050c5fbafa6efd2c722c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 13 Jul 2021 08:18:31 +0000
Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4

In case of JITs, each of the JIT backends compiles the BPF nospec instruction
/either/ to a machine instruction which emits a speculation barrier /or/ to
/no/ machine instruction in case the underlying architecture is not affected
by Speculative Store Bypass or has different mitigations in place already.

This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence'
instruction for mitigation. In case of arm64, we rely on the firmware mitigation
as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled,
it works for all of the kernel code with no need to provide any additional
instructions here (hence only comment in arm64 JIT). Other archs can follow
as needed. The BPF nospec instruction is specifically targeting Spectre v4
since i) we don't use a serialization barrier for the Spectre v1 case, and
ii) mitigation instructions for v1 and v4 might be different on some archs.

The BPF nospec is required for a future commit, where the BPF verifier does
annotate intermediate BPF programs with speculation barriers.

Co-developed-by: Piotr Krysiuk <piotras@gmail.com>
Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Piotr Krysiuk <piotras@gmail.com>
Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm/net/bpf_jit_32.c         |  3 +++
 arch/arm64/net/bpf_jit_comp.c     | 13 +++++++++++++
 arch/mips/net/ebpf_jit.c          |  3 +++
 arch/powerpc/net/bpf_jit_comp32.c |  6 ++++++
 arch/powerpc/net/bpf_jit_comp64.c |  6 ++++++
 arch/riscv/net/bpf_jit_comp32.c   |  4 ++++
 arch/riscv/net/bpf_jit_comp64.c   |  4 ++++
 arch/s390/net/bpf_jit_comp.c      |  5 +++++
 arch/sparc/net/bpf_jit_comp_64.c  |  3 +++
 arch/x86/net/bpf_jit_comp.c       |  7 +++++++
 arch/x86/net/bpf_jit_comp32.c     |  6 ++++++
 include/linux/filter.h            | 15 +++++++++++++++
 kernel/bpf/core.c                 | 19 ++++++++++++++++++-
 kernel/bpf/disasm.c               | 16 +++++++++-------
 14 files changed, 102 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 897634d0a67c..a951276f0547 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -1602,6 +1602,9 @@ exit:
 		rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
 		emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
 		break;
+	/* speculation barrier */
+	case BPF_ST | BPF_NOSPEC:
+		break;
 	/* ST: *(size *)(dst + off) = imm */
 	case BPF_ST | BPF_MEM | BPF_W:
 	case BPF_ST | BPF_MEM | BPF_H:
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index dccf98a37283..41c23f474ea6 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -823,6 +823,19 @@ emit_cond_jmp:
 			return ret;
 		break;
 
+	/* speculation barrier */
+	case BPF_ST | BPF_NOSPEC:
+		/*
+		 * Nothing required here.
+		 *
+		 * In case of arm64, we rely on the firmware mitigation of
+		 * Speculative Store Bypass as controlled via the ssbd kernel
+		 * parameter. Whenever the mitigation is enabled, it works
+		 * for all of the kernel code with no need to provide any
+		 * additional instructions.
+		 */
+		break;
+
 	/* ST: *(size *)(dst + off) = imm */
 	case BPF_ST | BPF_MEM | BPF_W:
 	case BPF_ST | BPF_MEM | BPF_H:
diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c
index 939dd06764bc..3a73e9375712 100644
--- a/arch/mips/net/ebpf_jit.c
+++ b/arch/mips/net/ebpf_jit.c
@@ -1355,6 +1355,9 @@ jeq_common:
 		}
 		break;
 
+	case BPF_ST | BPF_NOSPEC: /* speculation barrier */
+		break;
+
 	case BPF_ST | BPF_B | BPF_MEM:
 	case BPF_ST | BPF_H | BPF_MEM:
 	case BPF_ST | BPF_W | BPF_MEM:
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index 34bb1583fc0c..beb12cbc8c29 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -737,6 +737,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
 			}
 			break;
 
+		/*
+		 * BPF_ST NOSPEC (speculation barrier)
+		 */
+		case BPF_ST | BPF_NOSPEC:
+			break;
+
 		/*
 		 * BPF_ST(X)
 		 */
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index de8595880fee..b87a63dba9c8 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -627,6 +627,12 @@ emit_clear:
 			}
 			break;
 
+		/*
+		 * BPF_ST NOSPEC (speculation barrier)
+		 */
+		case BPF_ST | BPF_NOSPEC:
+			break;
+
 		/*
 		 * BPF_ST(X)
 		 */
diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c
index 81de865f4c7c..e6497424cbf6 100644
--- a/arch/riscv/net/bpf_jit_comp32.c
+++ b/arch/riscv/net/bpf_jit_comp32.c
@@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 			return -1;
 		break;
 
+	/* speculation barrier */
+	case BPF_ST | BPF_NOSPEC:
+		break;
+
 	case BPF_ST | BPF_MEM | BPF_B:
 	case BPF_ST | BPF_MEM | BPF_H:
 	case BPF_ST | BPF_MEM | BPF_W:
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 87e3bf5b9086..3af4131c22c7 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -939,6 +939,10 @@ out_be:
 		emit_ld(rd, 0, RV_REG_T1, ctx);
 		break;
 
+	/* speculation barrier */
+	case BPF_ST | BPF_NOSPEC:
+		break;
+
 	/* ST: *(size *)(dst + off) = imm */
 	case BPF_ST | BPF_MEM | BPF_B:
 		emit_imm(RV_REG_T1, imm, ctx);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 2ae419f5115a..88419263a89a 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 			break;
 		}
 		break;
+	/*
+	 * BPF_NOSPEC (speculation barrier)
+	 */
+	case BPF_ST | BPF_NOSPEC:
+		break;
 	/*
 	 * BPF_ST(X)
 	 */
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index 4b8d3c65d266..9a2f20cbd48b 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 			return 1;
 		break;
 	}
+	/* speculation barrier */
+	case BPF_ST | BPF_NOSPEC:
+		break;
 	/* ST: *(size *)(dst + off) = imm */
 	case BPF_ST | BPF_MEM | BPF_W:
 	case BPF_ST | BPF_MEM | BPF_H:
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4b951458c9fc..16d76f814e9b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1219,6 +1219,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			}
 			break;
 
+			/* speculation barrier */
+		case BPF_ST | BPF_NOSPEC:
+			if (boot_cpu_has(X86_FEATURE_XMM2))
+				/* Emit 'lfence' */
+				EMIT3(0x0F, 0xAE, 0xE8);
+			break;
+
 			/* ST: *(u8*)(dst_reg + off) = imm */
 		case BPF_ST | BPF_MEM | BPF_B:
 			if (is_ereg(dst_reg))
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 3da88ded6ee3..3bfda5f502cb 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1886,6 +1886,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			i++;
 			break;
 		}
+		/* speculation barrier */
+		case BPF_ST | BPF_NOSPEC:
+			if (boot_cpu_has(X86_FEATURE_XMM2))
+				/* Emit 'lfence' */
+				EMIT3(0x0F, 0xAE, 0xE8);
+			break;
 		/* ST: *(u8*)(dst_reg + off) = imm */
 		case BPF_ST | BPF_MEM | BPF_H:
 		case BPF_ST | BPF_MEM | BPF_B:
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 472f97074da0..83b896044e79 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -73,6 +73,11 @@ struct ctl_table_header;
 /* unused opcode to mark call to interpreter with arguments */
 #define BPF_CALL_ARGS	0xe0
 
+/* unused opcode to mark speculation barrier for mitigating
+ * Speculative Store Bypass
+ */
+#define BPF_NOSPEC	0xc0
+
 /* As per nm, we expose JITed images as text (code) section for
  * kallsyms. That way, tools like perf can find it to match
  * addresses.
@@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 		.off   = 0,					\
 		.imm   = 0 })
 
+/* Speculation barrier */
+
+#define BPF_ST_NOSPEC()						\
+	((struct bpf_insn) {					\
+		.code  = BPF_ST | BPF_NOSPEC,			\
+		.dst_reg = 0,					\
+		.src_reg = 0,					\
+		.off   = 0,					\
+		.imm   = 0 })
+
 /* Internal classic blocks for direct assignment */
 
 #define __BPF_STMT(CODE, K)					\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9b1577498373..b1a5fc04492b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -32,6 +32,8 @@
 #include <linux/perf_event.h>
 #include <linux/extable.h>
 #include <linux/log2.h>
+
+#include <asm/barrier.h>
 #include <asm/unaligned.h>
 
 /* Registers */
@@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
 		/* Non-UAPI available opcodes. */
 		[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
 		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+		[BPF_ST  | BPF_NOSPEC] = &&ST_NOSPEC,
 		[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
 		[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
 		[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1621,7 +1624,21 @@ out:
 	COND_JMP(s, JSGE, >=)
 	COND_JMP(s, JSLE, <=)
 #undef COND_JMP
-	/* STX and ST and LDX*/
+	/* ST, STX and LDX*/
+	ST_NOSPEC:
+		/* Speculation barrier for mitigating Speculative Store Bypass.
+		 * In case of arm64, we rely on the firmware mitigation as
+		 * controlled via the ssbd kernel parameter. Whenever the
+		 * mitigation is enabled, it works for all of the kernel code
+		 * with no need to provide any additional instructions here.
+		 * In case of x86, we use 'lfence' insn for mitigation. We
+		 * reuse preexisting logic from Spectre v1 mitigation that
+		 * happens to produce the required code on x86 for v4 as well.
+		 */
+#ifdef CONFIG_X86
+		barrier_nospec();
+#endif
+		CONT;
 #define LDST(SIZEOP, SIZE)						\
 	STX_MEM_##SIZEOP:						\
 		*(SIZE *)(unsigned long) (DST + insn->off) = SRC;	\
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bbfc6bb79240..ca3cd9aaa6ce 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 			verbose(cbs->private_data, "BUG_%02x\n", insn->code);
 		}
 	} else if (class == BPF_ST) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
+		if (BPF_MODE(insn->code) == BPF_MEM) {
+			verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->imm);
+		} else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+			verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+		} else {
 			verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
-			return;
 		}
-		verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
-			insn->code,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->dst_reg,
-			insn->off, insn->imm);
 	} else if (class == BPF_LDX) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
 			verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
-- 
cgit v1.2.3


From 2039f26f3aca5b0e419b98f65dd36481337b86ee Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 13 Jul 2021 08:18:31 +0000
Subject: bpf: Fix leakage due to insufficient speculative store bypass
 mitigation

Spectre v4 gadgets make use of memory disambiguation, which is a set of
techniques that execute memory access instructions, that is, loads and
stores, out of program order; Intel's optimization manual, section 2.4.4.5:

  A load instruction micro-op may depend on a preceding store. Many
  microarchitectures block loads until all preceding store addresses are
  known. The memory disambiguator predicts which loads will not depend on
  any previous stores. When the disambiguator predicts that a load does
  not have such a dependency, the load takes its data from the L1 data
  cache. Eventually, the prediction is verified. If an actual conflict is
  detected, the load and all succeeding instructions are re-executed.

af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate
this attack by sanitizing the memory locations through preemptive "fast"
(low latency) stores of zero prior to the actual "slow" (high latency) store
of a pointer value such that upon dependency misprediction the CPU then
speculatively executes the load of the pointer value and retrieves the zero
value instead of the attacker controlled scalar value previously stored at
that location, meaning, subsequent access in the speculative domain is then
redirected to the "zero page".

The sanitized preemptive store of zero prior to the actual "slow" store is
done through a simple ST instruction based on r10 (frame pointer) with
relative offset to the stack location that the verifier has been tracking
on the original used register for STX, which does not have to be r10. Thus,
there are no memory dependencies for this store, since it's only using r10
and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency
operation.

However, a recent attack demonstrated that this mitigation is not sufficient
since the preemptive store of zero could also be turned into a "slow" store
and is thus bypassed as well:

  [...]
  // r2 = oob address (e.g. scalar)
  // r7 = pointer to map value
  31: (7b) *(u64 *)(r10 -16) = r2
  // r9 will remain "fast" register, r10 will become "slow" register below
  32: (bf) r9 = r10
  // JIT maps BPF reg to x86 reg:
  //  r9  -> r15 (callee saved)
  //  r10 -> rbp
  // train store forward prediction to break dependency link between both r9
  // and r10 by evicting them from the predictor's LRU table.
  33: (61) r0 = *(u32 *)(r7 +24576)
  34: (63) *(u32 *)(r7 +29696) = r0
  35: (61) r0 = *(u32 *)(r7 +24580)
  36: (63) *(u32 *)(r7 +29700) = r0
  37: (61) r0 = *(u32 *)(r7 +24584)
  38: (63) *(u32 *)(r7 +29704) = r0
  39: (61) r0 = *(u32 *)(r7 +24588)
  40: (63) *(u32 *)(r7 +29708) = r0
  [...]
  543: (61) r0 = *(u32 *)(r7 +25596)
  544: (63) *(u32 *)(r7 +30716) = r0
  // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp
  // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain
  // in hardware registers. rbp becomes slow due to push/pop latency. below is
  // disasm of bpf_ringbuf_output() helper for better visual context:
  //
  // ffffffff8117ee20: 41 54                 push   r12
  // ffffffff8117ee22: 55                    push   rbp
  // ffffffff8117ee23: 53                    push   rbx
  // ffffffff8117ee24: 48 f7 c1 fc ff ff ff  test   rcx,0xfffffffffffffffc
  // ffffffff8117ee2b: 0f 85 af 00 00 00     jne    ffffffff8117eee0 <-- jump taken
  // [...]
  // ffffffff8117eee0: 49 c7 c4 ea ff ff ff  mov    r12,0xffffffffffffffea
  // ffffffff8117eee7: 5b                    pop    rbx
  // ffffffff8117eee8: 5d                    pop    rbp
  // ffffffff8117eee9: 4c 89 e0              mov    rax,r12
  // ffffffff8117eeec: 41 5c                 pop    r12
  // ffffffff8117eeee: c3                    ret
  545: (18) r1 = map[id:4]
  547: (bf) r2 = r7
  548: (b7) r3 = 0
  549: (b7) r4 = 4
  550: (85) call bpf_ringbuf_output#194288
  // instruction 551 inserted by verifier    \
  551: (7a) *(u64 *)(r10 -16) = 0            | /both/ are now slow stores here
  // storing map value pointer r7 at fp-16   | since value of r10 is "slow".
  552: (7b) *(u64 *)(r10 -16) = r7           /
  // following "fast" read to the same memory location, but due to dependency
  // misprediction it will speculatively execute before insn 551/552 completes.
  553: (79) r2 = *(u64 *)(r9 -16)
  // in speculative domain contains attacker controlled r2. in non-speculative
  // domain this contains r7, and thus accesses r7 +0 below.
  554: (71) r3 = *(u8 *)(r2 +0)
  // leak r3

As can be seen, the current speculative store bypass mitigation which the
verifier inserts at line 551 is insufficient since /both/, the write of
the zero sanitation as well as the map value pointer are a high latency
instruction due to prior memory access via push/pop of r10 (rbp) in contrast
to the low latency read in line 553 as r9 (r15) which stays in hardware
registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally,
fp-16 can still be r2.

Initial thoughts to address this issue was to track spilled pointer loads
from stack and enforce their load via LDX through r10 as well so that /both/
the preemptive store of zero /as well as/ the load use the /same/ register
such that a dependency is created between the store and load. However, this
option is not sufficient either since it can be bypassed as well under
speculation. An updated attack with pointer spill/fills now _all_ based on
r10 would look as follows:

  [...]
  // r2 = oob address (e.g. scalar)
  // r7 = pointer to map value
  [...]
  // longer store forward prediction training sequence than before.
  2062: (61) r0 = *(u32 *)(r7 +25588)
  2063: (63) *(u32 *)(r7 +30708) = r0
  2064: (61) r0 = *(u32 *)(r7 +25592)
  2065: (63) *(u32 *)(r7 +30712) = r0
  2066: (61) r0 = *(u32 *)(r7 +25596)
  2067: (63) *(u32 *)(r7 +30716) = r0
  // store the speculative load address (scalar) this time after the store
  // forward prediction training.
  2068: (7b) *(u64 *)(r10 -16) = r2
  // preoccupy the CPU store port by running sequence of dummy stores.
  2069: (63) *(u32 *)(r7 +29696) = r0
  2070: (63) *(u32 *)(r7 +29700) = r0
  2071: (63) *(u32 *)(r7 +29704) = r0
  2072: (63) *(u32 *)(r7 +29708) = r0
  2073: (63) *(u32 *)(r7 +29712) = r0
  2074: (63) *(u32 *)(r7 +29716) = r0
  2075: (63) *(u32 *)(r7 +29720) = r0
  2076: (63) *(u32 *)(r7 +29724) = r0
  2077: (63) *(u32 *)(r7 +29728) = r0
  2078: (63) *(u32 *)(r7 +29732) = r0
  2079: (63) *(u32 *)(r7 +29736) = r0
  2080: (63) *(u32 *)(r7 +29740) = r0
  2081: (63) *(u32 *)(r7 +29744) = r0
  2082: (63) *(u32 *)(r7 +29748) = r0
  2083: (63) *(u32 *)(r7 +29752) = r0
  2084: (63) *(u32 *)(r7 +29756) = r0
  2085: (63) *(u32 *)(r7 +29760) = r0
  2086: (63) *(u32 *)(r7 +29764) = r0
  2087: (63) *(u32 *)(r7 +29768) = r0
  2088: (63) *(u32 *)(r7 +29772) = r0
  2089: (63) *(u32 *)(r7 +29776) = r0
  2090: (63) *(u32 *)(r7 +29780) = r0
  2091: (63) *(u32 *)(r7 +29784) = r0
  2092: (63) *(u32 *)(r7 +29788) = r0
  2093: (63) *(u32 *)(r7 +29792) = r0
  2094: (63) *(u32 *)(r7 +29796) = r0
  2095: (63) *(u32 *)(r7 +29800) = r0
  2096: (63) *(u32 *)(r7 +29804) = r0
  2097: (63) *(u32 *)(r7 +29808) = r0
  2098: (63) *(u32 *)(r7 +29812) = r0
  // overwrite scalar with dummy pointer; same as before, also including the
  // sanitation store with 0 from the current mitigation by the verifier.
  2099: (7a) *(u64 *)(r10 -16) = 0         | /both/ are now slow stores here
  2100: (7b) *(u64 *)(r10 -16) = r7        | since store unit is still busy.
  // load from stack intended to bypass stores.
  2101: (79) r2 = *(u64 *)(r10 -16)
  2102: (71) r3 = *(u8 *)(r2 +0)
  // leak r3
  [...]

Looking at the CPU microarchitecture, the scheduler might issue loads (such
as seen in line 2101) before stores (line 2099,2100) because the load execution
units become available while the store execution unit is still busy with the
sequence of dummy stores (line 2069-2098). And so the load may use the prior
stored scalar from r2 at address r10 -16 for speculation. The updated attack
may work less reliable on CPU microarchitectures where loads and stores share
execution resources.

This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf:
Prevent memory disambiguation attack") is insufficient. Moreover, the detection
of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been
written to a given stack slot where a pointer value is now to be stored does
not have sufficient coverage as precondition for the mitigation either; for
several reasons outlined as follows:

 1) Stack content from prior program runs could still be preserved and is
    therefore not "random", best example is to split a speculative store
    bypass attack between tail calls, program A would prepare and store the
    oob address at a given stack slot and then tail call into program B which
    does the "slow" store of a pointer to the stack with subsequent "fast"
    read. From program B PoV such stack slot type is STACK_INVALID, and
    therefore also must be subject to mitigation.

 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr)
    condition, for example, the previous content of that memory location could
    also be a pointer to map or map value. Without the fix, a speculative
    store bypass is not mitigated in such precondition and can then lead to
    a type confusion in the speculative domain leaking kernel memory near
    these pointer types.

While brainstorming on various alternative mitigation possibilities, we also
stumbled upon a retrospective from Chrome developers [0]:

  [...] For variant 4, we implemented a mitigation to zero the unused memory
  of the heap prior to allocation, which cost about 1% when done concurrently
  and 4% for scavenging. Variant 4 defeats everything we could think of. We
  explored more mitigations for variant 4 but the threat proved to be more
  pervasive and dangerous than we anticipated. For example, stack slots used
  by the register allocator in the optimizing compiler could be subject to
  type confusion, leading to pointer crafting. Mitigating type confusion for
  stack slots alone would have required a complete redesign of the backend of
  the optimizing compiler, perhaps man years of work, without a guarantee of
  completeness. [...]

From BPF side, the problem space is reduced, however, options are rather
limited. One idea that has been explored was to xor-obfuscate pointer spills
to the BPF stack:

  [...]
  // preoccupy the CPU store port by running sequence of dummy stores.
  [...]
  2106: (63) *(u32 *)(r7 +29796) = r0
  2107: (63) *(u32 *)(r7 +29800) = r0
  2108: (63) *(u32 *)(r7 +29804) = r0
  2109: (63) *(u32 *)(r7 +29808) = r0
  2110: (63) *(u32 *)(r7 +29812) = r0
  // overwrite scalar with dummy pointer; xored with random 'secret' value
  // of 943576462 before store ...
  2111: (b4) w11 = 943576462
  2112: (af) r11 ^= r7
  2113: (7b) *(u64 *)(r10 -16) = r11
  2114: (79) r11 = *(u64 *)(r10 -16)
  2115: (b4) w2 = 943576462
  2116: (af) r2 ^= r11
  // ... and restored with the same 'secret' value with the help of AX reg.
  2117: (71) r3 = *(u8 *)(r2 +0)
  [...]

While the above would not prevent speculation, it would make data leakage
infeasible by directing it to random locations. In order to be effective
and prevent type confusion under speculation, such random secret would have
to be regenerated for each store. The additional complexity involved for a
tracking mechanism that prevents jumps such that restoring spilled pointers
would not get corrupted is not worth the gain for unprivileged. Hence, the
fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC
instruction which the x86 JIT translates into a lfence opcode. Inserting the
latter in between the store and load instruction is one of the mitigations
options [1]. The x86 instruction manual notes:

  [...] An LFENCE that follows an instruction that stores to memory might
  complete before the data being stored have become globally visible. [...]

The latter meaning that the preceding store instruction finished execution
and the store is at minimum guaranteed to be in the CPU's store queue, but
it's not guaranteed to be in that CPU's L1 cache at that point (globally
visible). The latter would only be guaranteed via sfence. So the load which
is guaranteed to execute after the lfence for that local CPU would have to
rely on store-to-load forwarding. [2], in section 2.3 on store buffers says:

  [...] For every store operation that is added to the ROB, an entry is
  allocated in the store buffer. This entry requires both the virtual and
  physical address of the target. Only if there is no free entry in the store
  buffer, the frontend stalls until there is an empty slot available in the
  store buffer again. Otherwise, the CPU can immediately continue adding
  subsequent instructions to the ROB and execute them out of order. On Intel
  CPUs, the store buffer has up to 56 entries. [...]

One small upside on the fix is that it lifts constraints from af86ca4e3088
where the sanitize_stack_off relative to r10 must be the same when coming
from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX
or BPF_ST instruction. This happens either when we store a pointer or data
value to the BPF stack for the first time, or upon later pointer spills.
The former needs to be enforced since otherwise stale stack data could be
leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST |
BPF_NOSPEC mapping is currently optimized away, but others could emit a
speculation barrier as well if necessary. For real-world unprivileged
programs e.g. generated by LLVM, pointer spill/fill is only generated upon
register pressure and LLVM only tries to do that for pointers which are not
used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC
sanitation for the STACK_INVALID case when the first write to a stack slot
occurs e.g. upon map lookup. In future we might refine ways to mitigate
the latter cost.

  [0] https://arxiv.org/pdf/1902.05178.pdf
  [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/
  [2] https://arxiv.org/pdf/1905.05725.pdf

Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack")
Fixes: f7cf25b2026d ("bpf: track spill/fill of constants")
Co-developed-by: Piotr Krysiuk <piotras@gmail.com>
Co-developed-by: Benedict Schlueter <benedict.schlueter@rub.de>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Piotr Krysiuk <piotras@gmail.com>
Signed-off-by: Benedict Schlueter <benedict.schlueter@rub.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 87 ++++++++++++++++----------------------------
 2 files changed, 33 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7ba7e800d472..828d08afeee0 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -340,8 +340,8 @@ struct bpf_insn_aux_data {
 	};
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
-	int sanitize_stack_off; /* stack slot to be cleared */
 	u32 seen; /* this insn was processed by the verifier at env->pass_cnt */
+	bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */
 	bool zext_dst; /* this insn zero extends dst reg */
 	u8 alu_state; /* used in combination with alu_limit */
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 657062cb4d85..f9bda5476ea5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 	cur = env->cur_state->frame[env->cur_state->curframe];
 	if (value_regno >= 0)
 		reg = &cur->regs[value_regno];
+	if (!env->bypass_spec_v4) {
+		bool sanitize = reg && is_spillable_regtype(reg->type);
+
+		for (i = 0; i < size; i++) {
+			if (state->stack[spi].slot_type[i] == STACK_INVALID) {
+				sanitize = true;
+				break;
+			}
+		}
+
+		if (sanitize)
+			env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
+	}
 
 	if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
 	    !register_is_null(reg) && env->bpf_capable) {
@@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
 			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
-
 		if (state != cur && reg->type == PTR_TO_STACK) {
 			verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
 			return -EINVAL;
 		}
-
-		if (!env->bypass_spec_v4) {
-			bool sanitize = false;
-
-			if (state->stack[spi].slot_type[0] == STACK_SPILL &&
-			    register_is_const(&state->stack[spi].spilled_ptr))
-				sanitize = true;
-			for (i = 0; i < BPF_REG_SIZE; i++)
-				if (state->stack[spi].slot_type[i] == STACK_MISC) {
-					sanitize = true;
-					break;
-				}
-			if (sanitize) {
-				int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
-				int soff = (-spi - 1) * BPF_REG_SIZE;
-
-				/* detected reuse of integer stack slot with a pointer
-				 * which means either llvm is reusing stack slot or
-				 * an attacker is trying to exploit CVE-2018-3639
-				 * (speculative store bypass)
-				 * Have to sanitize that slot with preemptive
-				 * store of zero.
-				 */
-				if (*poff && *poff != soff) {
-					/* disallow programs where single insn stores
-					 * into two different stack slots, since verifier
-					 * cannot sanitize them
-					 */
-					verbose(env,
-						"insn %d cannot access two stack slots fp%d and fp%d",
-						insn_idx, *poff, soff);
-					return -EINVAL;
-				}
-				*poff = soff;
-			}
-		}
 		save_register_state(state, spi, reg);
 	} else {
 		u8 type = STACK_MISC;
@@ -11913,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		bpf_convert_ctx_access_t convert_ctx_access;
+		bool ctx_access;
 
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
-		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
 			type = BPF_READ;
-		else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
-			 insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
-			 insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
-			 insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+			ctx_access = true;
+		} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+			   insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+			   insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+			   insn->code == (BPF_STX | BPF_MEM | BPF_DW) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_B) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_H) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
+			   insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
 			type = BPF_WRITE;
-		else
+			ctx_access = BPF_CLASS(insn->code) == BPF_STX;
+		} else {
 			continue;
+		}
 
 		if (type == BPF_WRITE &&
-		    env->insn_aux_data[i + delta].sanitize_stack_off) {
+		    env->insn_aux_data[i + delta].sanitize_stack_spill) {
 			struct bpf_insn patch[] = {
-				/* Sanitize suspicious stack slot with zero.
-				 * There are no memory dependencies for this store,
-				 * since it's only using frame pointer and immediate
-				 * constant of zero
-				 */
-				BPF_ST_MEM(BPF_DW, BPF_REG_FP,
-					   env->insn_aux_data[i + delta].sanitize_stack_off,
-					   0),
-				/* the original STX instruction will immediately
-				 * overwrite the same stack slot with appropriate value
-				 */
 				*insn,
+				BPF_ST_NOSPEC(),
 			};
 
 			cnt = ARRAY_SIZE(patch);
@@ -11955,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			continue;
 		}
 
+		if (!ctx_access)
+			continue;
+
 		switch (env->insn_aux_data[i + delta].ptr_type) {
 		case PTR_TO_CTX:
 			if (!ops->convert_ctx_access)
-- 
cgit v1.2.3


From 40e159403896f7d55c98f858d0b20fee1d941fa4 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 2 Aug 2021 12:21:30 +0100
Subject: mhi: Fix networking tree build.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mhi.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 944aa3aa3035..5e08468854db 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -719,8 +719,13 @@ void mhi_device_put(struct mhi_device *mhi_dev);
  *                            host and device execution environments match and
  *                            channels are in a DISABLED state.
  * @mhi_dev: Device associated with the channels
+ * @flags: MHI channel flags
  */
-int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
+int mhi_prepare_for_transfer(struct mhi_device *mhi_dev,
+			     unsigned int flags);
+
+/* Automatically allocate and queue inbound buffers */
+#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0)
 
 /**
  * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer.
-- 
cgit v1.2.3


From 1c69d7cf4a8b6b6cfd920a1e809f1cd33ae4369c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 2 Aug 2021 07:30:29 -0700
Subject: Revert "mhi: Fix networking tree build."

This reverts commit 40e159403896f7d55c98f858d0b20fee1d941fa4.

Looks like this commit breaks the build for me.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/mhi.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 5e08468854db..944aa3aa3035 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -719,13 +719,8 @@ void mhi_device_put(struct mhi_device *mhi_dev);
  *                            host and device execution environments match and
  *                            channels are in a DISABLED state.
  * @mhi_dev: Device associated with the channels
- * @flags: MHI channel flags
  */
-int mhi_prepare_for_transfer(struct mhi_device *mhi_dev,
-			     unsigned int flags);
-
-/* Automatically allocate and queue inbound buffers */
-#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0)
+int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
 
 /**
  * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer.
-- 
cgit v1.2.3


From f01639589e252a6f72c04716e1b5f9bb10e2debc Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Fri, 23 Jul 2021 19:54:46 +0800
Subject: soundwire: move intel sdw register definitions to sdw_intel.h

Those Intel sdw registers will be used by ASoC SOF drivers in the
following commits. So move those definitions to sdw_intel.h and it can
be visible to SOF drivers.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Acked-By: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210723115451.7245-2-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soundwire/intel.c           | 74 ----------------------------------
 drivers/soundwire/intel_init.c      |  6 ---
 include/linux/soundwire/sdw_intel.h | 79 +++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+), 80 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index c11e3d8cd308..15668d6fecd6 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -40,80 +40,6 @@ static int md_flags;
 module_param_named(sdw_md_flags, md_flags, int, 0444);
 MODULE_PARM_DESC(sdw_md_flags, "SoundWire Intel Master device flags (0x0 all off)");
 
-/* Intel SHIM Registers Definition */
-#define SDW_SHIM_LCAP			0x0
-#define SDW_SHIM_LCTL			0x4
-#define SDW_SHIM_IPPTR			0x8
-#define SDW_SHIM_SYNC			0xC
-
-#define SDW_SHIM_CTLSCAP(x)		(0x010 + 0x60 * (x))
-#define SDW_SHIM_CTLS0CM(x)		(0x012 + 0x60 * (x))
-#define SDW_SHIM_CTLS1CM(x)		(0x014 + 0x60 * (x))
-#define SDW_SHIM_CTLS2CM(x)		(0x016 + 0x60 * (x))
-#define SDW_SHIM_CTLS3CM(x)		(0x018 + 0x60 * (x))
-#define SDW_SHIM_PCMSCAP(x)		(0x020 + 0x60 * (x))
-
-#define SDW_SHIM_PCMSYCHM(x, y)		(0x022 + (0x60 * (x)) + (0x2 * (y)))
-#define SDW_SHIM_PCMSYCHC(x, y)		(0x042 + (0x60 * (x)) + (0x2 * (y)))
-#define SDW_SHIM_PDMSCAP(x)		(0x062 + 0x60 * (x))
-#define SDW_SHIM_IOCTL(x)		(0x06C + 0x60 * (x))
-#define SDW_SHIM_CTMCTL(x)		(0x06E + 0x60 * (x))
-
-#define SDW_SHIM_WAKEEN			0x190
-#define SDW_SHIM_WAKESTS		0x192
-
-#define SDW_SHIM_LCTL_SPA		BIT(0)
-#define SDW_SHIM_LCTL_SPA_MASK		GENMASK(3, 0)
-#define SDW_SHIM_LCTL_CPA		BIT(8)
-#define SDW_SHIM_LCTL_CPA_MASK		GENMASK(11, 8)
-
-#define SDW_SHIM_SYNC_SYNCPRD_VAL_24	(24000 / SDW_CADENCE_GSYNC_KHZ - 1)
-#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4	(38400 / SDW_CADENCE_GSYNC_KHZ - 1)
-#define SDW_SHIM_SYNC_SYNCPRD		GENMASK(14, 0)
-#define SDW_SHIM_SYNC_SYNCCPU		BIT(15)
-#define SDW_SHIM_SYNC_CMDSYNC_MASK	GENMASK(19, 16)
-#define SDW_SHIM_SYNC_CMDSYNC		BIT(16)
-#define SDW_SHIM_SYNC_SYNCGO		BIT(24)
-
-#define SDW_SHIM_PCMSCAP_ISS		GENMASK(3, 0)
-#define SDW_SHIM_PCMSCAP_OSS		GENMASK(7, 4)
-#define SDW_SHIM_PCMSCAP_BSS		GENMASK(12, 8)
-
-#define SDW_SHIM_PCMSYCM_LCHN		GENMASK(3, 0)
-#define SDW_SHIM_PCMSYCM_HCHN		GENMASK(7, 4)
-#define SDW_SHIM_PCMSYCM_STREAM		GENMASK(13, 8)
-#define SDW_SHIM_PCMSYCM_DIR		BIT(15)
-
-#define SDW_SHIM_PDMSCAP_ISS		GENMASK(3, 0)
-#define SDW_SHIM_PDMSCAP_OSS		GENMASK(7, 4)
-#define SDW_SHIM_PDMSCAP_BSS		GENMASK(12, 8)
-#define SDW_SHIM_PDMSCAP_CPSS		GENMASK(15, 13)
-
-#define SDW_SHIM_IOCTL_MIF		BIT(0)
-#define SDW_SHIM_IOCTL_CO		BIT(1)
-#define SDW_SHIM_IOCTL_COE		BIT(2)
-#define SDW_SHIM_IOCTL_DO		BIT(3)
-#define SDW_SHIM_IOCTL_DOE		BIT(4)
-#define SDW_SHIM_IOCTL_BKE		BIT(5)
-#define SDW_SHIM_IOCTL_WPDD		BIT(6)
-#define SDW_SHIM_IOCTL_CIBD		BIT(8)
-#define SDW_SHIM_IOCTL_DIBD		BIT(9)
-
-#define SDW_SHIM_CTMCTL_DACTQE		BIT(0)
-#define SDW_SHIM_CTMCTL_DODS		BIT(1)
-#define SDW_SHIM_CTMCTL_DOAIS		GENMASK(4, 3)
-
-#define SDW_SHIM_WAKEEN_ENABLE		BIT(0)
-#define SDW_SHIM_WAKESTS_STATUS		BIT(0)
-
-/* Intel ALH Register definitions */
-#define SDW_ALH_STRMZCFG(x)		(0x000 + (0x4 * (x)))
-#define SDW_ALH_NUM_STREAMS		64
-
-#define SDW_ALH_STRMZCFG_DMAT_VAL	0x3
-#define SDW_ALH_STRMZCFG_DMAT		GENMASK(7, 0)
-#define SDW_ALH_STRMZCFG_CHN		GENMASK(19, 16)
-
 enum intel_pdi_type {
 	INTEL_PDI_IN = 0,
 	INTEL_PDI_OUT = 1,
diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c
index 9e283bef53d2..03ff69ab1797 100644
--- a/drivers/soundwire/intel_init.c
+++ b/drivers/soundwire/intel_init.c
@@ -18,12 +18,6 @@
 #include "cadence_master.h"
 #include "intel.h"
 
-#define SDW_SHIM_LCAP		0x0
-#define SDW_SHIM_BASE		0x2C000
-#define SDW_ALH_BASE		0x2C800
-#define SDW_LINK_BASE		0x30000
-#define SDW_LINK_SIZE		0x10000
-
 static void intel_link_dev_release(struct device *dev)
 {
 	struct auxiliary_device *auxdev = to_auxiliary_dev(dev);
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 1ebea7764011..7fce6aee0c36 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -7,6 +7,85 @@
 #include <linux/irqreturn.h>
 #include <linux/soundwire/sdw.h>
 
+#define SDW_SHIM_BASE			0x2C000
+#define SDW_ALH_BASE			0x2C800
+#define SDW_LINK_BASE			0x30000
+#define SDW_LINK_SIZE			0x10000
+
+/* Intel SHIM Registers Definition */
+#define SDW_SHIM_LCAP			0x0
+#define SDW_SHIM_LCTL			0x4
+#define SDW_SHIM_IPPTR			0x8
+#define SDW_SHIM_SYNC			0xC
+
+#define SDW_SHIM_CTLSCAP(x)		(0x010 + 0x60 * (x))
+#define SDW_SHIM_CTLS0CM(x)		(0x012 + 0x60 * (x))
+#define SDW_SHIM_CTLS1CM(x)		(0x014 + 0x60 * (x))
+#define SDW_SHIM_CTLS2CM(x)		(0x016 + 0x60 * (x))
+#define SDW_SHIM_CTLS3CM(x)		(0x018 + 0x60 * (x))
+#define SDW_SHIM_PCMSCAP(x)		(0x020 + 0x60 * (x))
+
+#define SDW_SHIM_PCMSYCHM(x, y)		(0x022 + (0x60 * (x)) + (0x2 * (y)))
+#define SDW_SHIM_PCMSYCHC(x, y)		(0x042 + (0x60 * (x)) + (0x2 * (y)))
+#define SDW_SHIM_PDMSCAP(x)		(0x062 + 0x60 * (x))
+#define SDW_SHIM_IOCTL(x)		(0x06C + 0x60 * (x))
+#define SDW_SHIM_CTMCTL(x)		(0x06E + 0x60 * (x))
+
+#define SDW_SHIM_WAKEEN			0x190
+#define SDW_SHIM_WAKESTS		0x192
+
+#define SDW_SHIM_LCTL_SPA		BIT(0)
+#define SDW_SHIM_LCTL_SPA_MASK		GENMASK(3, 0)
+#define SDW_SHIM_LCTL_CPA		BIT(8)
+#define SDW_SHIM_LCTL_CPA_MASK		GENMASK(11, 8)
+
+#define SDW_SHIM_SYNC_SYNCPRD_VAL_24	(24000 / SDW_CADENCE_GSYNC_KHZ - 1)
+#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4	(38400 / SDW_CADENCE_GSYNC_KHZ - 1)
+#define SDW_SHIM_SYNC_SYNCPRD		GENMASK(14, 0)
+#define SDW_SHIM_SYNC_SYNCCPU		BIT(15)
+#define SDW_SHIM_SYNC_CMDSYNC_MASK	GENMASK(19, 16)
+#define SDW_SHIM_SYNC_CMDSYNC		BIT(16)
+#define SDW_SHIM_SYNC_SYNCGO		BIT(24)
+
+#define SDW_SHIM_PCMSCAP_ISS		GENMASK(3, 0)
+#define SDW_SHIM_PCMSCAP_OSS		GENMASK(7, 4)
+#define SDW_SHIM_PCMSCAP_BSS		GENMASK(12, 8)
+
+#define SDW_SHIM_PCMSYCM_LCHN		GENMASK(3, 0)
+#define SDW_SHIM_PCMSYCM_HCHN		GENMASK(7, 4)
+#define SDW_SHIM_PCMSYCM_STREAM		GENMASK(13, 8)
+#define SDW_SHIM_PCMSYCM_DIR		BIT(15)
+
+#define SDW_SHIM_PDMSCAP_ISS		GENMASK(3, 0)
+#define SDW_SHIM_PDMSCAP_OSS		GENMASK(7, 4)
+#define SDW_SHIM_PDMSCAP_BSS		GENMASK(12, 8)
+#define SDW_SHIM_PDMSCAP_CPSS		GENMASK(15, 13)
+
+#define SDW_SHIM_IOCTL_MIF		BIT(0)
+#define SDW_SHIM_IOCTL_CO		BIT(1)
+#define SDW_SHIM_IOCTL_COE		BIT(2)
+#define SDW_SHIM_IOCTL_DO		BIT(3)
+#define SDW_SHIM_IOCTL_DOE		BIT(4)
+#define SDW_SHIM_IOCTL_BKE		BIT(5)
+#define SDW_SHIM_IOCTL_WPDD		BIT(6)
+#define SDW_SHIM_IOCTL_CIBD		BIT(8)
+#define SDW_SHIM_IOCTL_DIBD		BIT(9)
+
+#define SDW_SHIM_CTMCTL_DACTQE		BIT(0)
+#define SDW_SHIM_CTMCTL_DODS		BIT(1)
+#define SDW_SHIM_CTMCTL_DOAIS		GENMASK(4, 3)
+
+#define SDW_SHIM_WAKEEN_ENABLE		BIT(0)
+#define SDW_SHIM_WAKESTS_STATUS		BIT(0)
+
+/* Intel ALH Register definitions */
+#define SDW_ALH_STRMZCFG(x)		(0x000 + (0x4 * (x)))
+#define SDW_ALH_NUM_STREAMS		64
+
+#define SDW_ALH_STRMZCFG_DMAT_VAL	0x3
+#define SDW_ALH_STRMZCFG_DMAT		GENMASK(7, 0)
+#define SDW_ALH_STRMZCFG_CHN		GENMASK(19, 16)
+
 /**
  * struct sdw_intel_stream_params_data: configuration passed during
  * the @params_stream callback, e.g. for interaction with DSP
-- 
cgit v1.2.3


From 60e9feb781dfe84158b4ec7a4d61c5103e96e6f3 Mon Sep 17 00:00:00 2001
From: Bard Liao <yung-chuan.liao@linux.intel.com>
Date: Fri, 23 Jul 2021 19:54:51 +0800
Subject: soundwire: intel: introduce shim and alh base

shim base and alh base are platform-dependent. Adding these two
parameters allows us to use different shim/alh base for each
platform.

Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Ranjani Sridharan <ranjani.sridharan@linux.intel.com>
Acked-By: Vinod Koul <vkoul@kernel.org>
Link: https://lore.kernel.org/r/20210723115451.7245-7-yung-chuan.liao@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/soundwire/intel_init.c      | 8 +++++---
 include/linux/soundwire/sdw_intel.h | 8 ++++++++
 sound/soc/sof/intel/hda.c           | 2 ++
 3 files changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c
index 03ff69ab1797..e329022e1669 100644
--- a/drivers/soundwire/intel_init.c
+++ b/drivers/soundwire/intel_init.c
@@ -63,8 +63,8 @@ static struct sdw_intel_link_dev *intel_link_dev_register(struct sdw_intel_res *
 	link->mmio_base = res->mmio_base;
 	link->registers = res->mmio_base + SDW_LINK_BASE
 		+ (SDW_LINK_SIZE * link_id);
-	link->shim = res->mmio_base + SDW_SHIM_BASE;
-	link->alh = res->mmio_base + SDW_ALH_BASE;
+	link->shim = res->mmio_base + res->shim_base;
+	link->alh = res->mmio_base + res->alh_base;
 
 	link->ops = res->ops;
 	link->dev = res->dev;
@@ -214,6 +214,8 @@ static struct sdw_intel_ctx
 	}
 
 	ctx->mmio_base = res->mmio_base;
+	ctx->shim_base = res->shim_base;
+	ctx->alh_base = res->alh_base;
 	ctx->link_mask = res->link_mask;
 	ctx->handle = res->handle;
 	mutex_init(&ctx->shim_lock);
@@ -302,7 +304,7 @@ sdw_intel_startup_controller(struct sdw_intel_ctx *ctx)
 		return -EINVAL;
 
 	/* Check SNDWLCAP.LCOUNT */
-	caps = ioread32(ctx->mmio_base + SDW_SHIM_BASE + SDW_SHIM_LCAP);
+	caps = ioread32(ctx->mmio_base + ctx->shim_base + SDW_SHIM_LCAP);
 	caps &= GENMASK(2, 0);
 
 	/* Check HW supported vs property value */
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 7fce6aee0c36..8a463b8fc12a 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -195,6 +195,8 @@ struct sdw_intel_slave_id {
  * @link_list: list to handle interrupts across all links
  * @shim_lock: mutex to handle concurrent rmw access to shared SHIM registers.
  * @shim_mask: flags to track initialization of SHIM shared registers
+ * @shim_base: sdw shim base.
+ * @alh_base: sdw alh base.
  */
 struct sdw_intel_ctx {
 	int count;
@@ -207,6 +209,8 @@ struct sdw_intel_ctx {
 	struct list_head link_list;
 	struct mutex shim_lock; /* lock for access to shared SHIM registers */
 	u32 shim_mask;
+	u32 shim_base;
+	u32 alh_base;
 };
 
 /**
@@ -225,6 +229,8 @@ struct sdw_intel_ctx {
  * machine-specific quirks are handled in the DSP driver.
  * @clock_stop_quirks: mask array of possible behaviors requested by the
  * DSP driver. The quirks are common for all links for now.
+ * @shim_base: sdw shim base.
+ * @alh_base: sdw alh base.
  */
 struct sdw_intel_res {
 	int count;
@@ -236,6 +242,8 @@ struct sdw_intel_res {
 	struct device *dev;
 	u32 link_mask;
 	u32 clock_stop_quirks;
+	u32 shim_base;
+	u32 alh_base;
 };
 
 /*
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index c979581c6812..b4e35fbbe693 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -166,6 +166,8 @@ static int hda_sdw_probe(struct snd_sof_dev *sdev)
 	memset(&res, 0, sizeof(res));
 
 	res.mmio_base = sdev->bar[HDA_DSP_BAR];
+	res.shim_base = hdev->desc->sdw_shim_base;
+	res.alh_base = hdev->desc->sdw_alh_base;
 	res.irq = sdev->ipc_irq;
 	res.handle = hdev->info.handle;
 	res.parent = sdev->dev;
-- 
cgit v1.2.3


From ce78ffa3ef1681065ba451cfd545da6126f5ca88 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 3 Aug 2021 11:14:03 +0100
Subject: net: really fix the build...

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/bus/mhi/core/internal.h  |  2 +-
 drivers/bus/mhi/core/main.c      |  9 ++++++---
 drivers/net/mhi/net.c            |  2 +-
 drivers/net/wwan/mhi_wwan_ctrl.c |  2 +-
 include/linux/mhi.h              |  7 ++++++-
 net/qrtr/mhi.c                   | 16 +++++++++++++++-
 6 files changed, 30 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h
index 5b9ea66b92dc..bc239a11aa69 100644
--- a/drivers/bus/mhi/core/internal.h
+++ b/drivers/bus/mhi/core/internal.h
@@ -682,7 +682,7 @@ void mhi_rddm_prepare(struct mhi_controller *mhi_cntrl,
 		      struct image_info *img_info);
 void mhi_fw_load_handler(struct mhi_controller *mhi_cntrl);
 int mhi_prepare_channel(struct mhi_controller *mhi_cntrl,
-			struct mhi_chan *mhi_chan);
+			struct mhi_chan *mhi_chan, unsigned int flags);
 int mhi_init_chan_ctxt(struct mhi_controller *mhi_cntrl,
 		       struct mhi_chan *mhi_chan);
 void mhi_deinit_chan_ctxt(struct mhi_controller *mhi_cntrl,
diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c
index fc9196f11cb7..84448233f64c 100644
--- a/drivers/bus/mhi/core/main.c
+++ b/drivers/bus/mhi/core/main.c
@@ -1430,7 +1430,7 @@ exit_unprepare_channel:
 }
 
 int mhi_prepare_channel(struct mhi_controller *mhi_cntrl,
-			struct mhi_chan *mhi_chan)
+			struct mhi_chan *mhi_chan, unsigned int flags)
 {
 	int ret = 0;
 	struct device *dev = &mhi_chan->mhi_dev->dev;
@@ -1455,6 +1455,9 @@ int mhi_prepare_channel(struct mhi_controller *mhi_cntrl,
 	if (ret)
 		goto error_pm_state;
 
+	if (mhi_chan->dir == DMA_FROM_DEVICE)
+		mhi_chan->pre_alloc = !!(flags & MHI_CH_INBOUND_ALLOC_BUFS);
+	
 	/* Pre-allocate buffer for xfer ring */
 	if (mhi_chan->pre_alloc) {
 		int nr_el = get_nr_avail_ring_elements(mhi_cntrl,
@@ -1610,7 +1613,7 @@ void mhi_reset_chan(struct mhi_controller *mhi_cntrl, struct mhi_chan *mhi_chan)
 }
 
 /* Move channel to start state */
-int mhi_prepare_for_transfer(struct mhi_device *mhi_dev)
+int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, unsigned int flags)
 {
 	int ret, dir;
 	struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl;
@@ -1621,7 +1624,7 @@ int mhi_prepare_for_transfer(struct mhi_device *mhi_dev)
 		if (!mhi_chan)
 			continue;
 
-		ret = mhi_prepare_channel(mhi_cntrl, mhi_chan);
+		ret = mhi_prepare_channel(mhi_cntrl, mhi_chan, flags);
 		if (ret)
 			goto error_open_chan;
 	}
diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c
index e60e38c1f09d..11be6bcdd551 100644
--- a/drivers/net/mhi/net.c
+++ b/drivers/net/mhi/net.c
@@ -335,7 +335,7 @@ static int mhi_net_newlink(void *ctxt, struct net_device *ndev, u32 if_id,
 	u64_stats_init(&mhi_netdev->stats.tx_syncp);
 
 	/* Start MHI channels */
-	err = mhi_prepare_for_transfer(mhi_dev);
+	err = mhi_prepare_for_transfer(mhi_dev, 0);
 	if (err)
 		goto out_err;
 
diff --git a/drivers/net/wwan/mhi_wwan_ctrl.c b/drivers/net/wwan/mhi_wwan_ctrl.c
index 1bc6b69aa530..1e18420ce404 100644
--- a/drivers/net/wwan/mhi_wwan_ctrl.c
+++ b/drivers/net/wwan/mhi_wwan_ctrl.c
@@ -110,7 +110,7 @@ static int mhi_wwan_ctrl_start(struct wwan_port *port)
 	int ret;
 
 	/* Start mhi device's channel(s) */
-	ret = mhi_prepare_for_transfer(mhiwwan->mhi_dev);
+	ret = mhi_prepare_for_transfer(mhiwwan->mhi_dev, 0);
 	if (ret)
 		return ret;
 
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 944aa3aa3035..5e08468854db 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -719,8 +719,13 @@ void mhi_device_put(struct mhi_device *mhi_dev);
  *                            host and device execution environments match and
  *                            channels are in a DISABLED state.
  * @mhi_dev: Device associated with the channels
+ * @flags: MHI channel flags
  */
-int mhi_prepare_for_transfer(struct mhi_device *mhi_dev);
+int mhi_prepare_for_transfer(struct mhi_device *mhi_dev,
+			     unsigned int flags);
+
+/* Automatically allocate and queue inbound buffers */
+#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0)
 
 /**
  * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer.
diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c
index fa611678af05..1dc955ca57d3 100644
--- a/net/qrtr/mhi.c
+++ b/net/qrtr/mhi.c
@@ -15,6 +15,7 @@ struct qrtr_mhi_dev {
 	struct qrtr_endpoint ep;
 	struct mhi_device *mhi_dev;
 	struct device *dev;
+	struct completion ready;
 };
 
 /* From MHI to QRTR */
@@ -50,6 +51,10 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb)
 	struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep);
 	int rc;
 
+	rc = wait_for_completion_interruptible(&qdev->ready);
+	if (rc)
+		goto free_skb;
+
 	if (skb->sk)
 		sock_hold(skb->sk);
 
@@ -79,7 +84,7 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev,
 	int rc;
 
 	/* start channels */
-	rc = mhi_prepare_for_transfer(mhi_dev);
+	rc = mhi_prepare_for_transfer(mhi_dev, 0);
 	if (rc)
 		return rc;
 
@@ -96,6 +101,15 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev,
 	if (rc)
 		return rc;
 
+	/* start channels */
+	rc = mhi_prepare_for_transfer(mhi_dev, MHI_CH_INBOUND_ALLOC_BUFS);
+	if (rc) {
+		qrtr_endpoint_unregister(&qdev->ep);
+		dev_set_drvdata(&mhi_dev->dev, NULL);
+		return rc;
+	}
+
+	complete_all(&qdev->ready);
 	dev_dbg(qdev->dev, "Qualcomm MHI QRTR driver probed\n");
 
 	return 0;
-- 
cgit v1.2.3


From c87a4c542b5a796f795fec2b7a909c7d3067b11c Mon Sep 17 00:00:00 2001
From: Bijie Xu <bijie.xu@corigine.com>
Date: Tue, 3 Aug 2021 11:40:18 +0200
Subject: net: flow_offload: correct comments mismatch with code

Correct mismatch between the name of flow_offload_has_one_action()
and its kdoc entry.

Found using ./scripts/kernel-doc -Werror -none include/net/flow_offload.h

Signed-off-by: Bijie Xu <bijie.xu@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 69c9eabf8325..f3c2841566a0 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -293,7 +293,7 @@ static inline bool flow_action_has_entries(const struct flow_action *action)
 }
 
 /**
- * flow_action_has_one_action() - check if exactly one action is present
+ * flow_offload_has_one_action() - check if exactly one action is present
  * @action: tc filter flow offload action
  *
  * Returns true if exactly one action is present.
-- 
cgit v1.2.3


From 0161d151f3e36306219f5aa6f5f6b3877038afd3 Mon Sep 17 00:00:00 2001
From: Bijie Xu <bijie.xu@corigine.com>
Date: Tue, 3 Aug 2021 11:40:19 +0200
Subject: net: sched: provide missing kdoc for tcf_pkt_info and tcf_ematch_ops

Provide missing kdoc of fields of struct tcf_pkt_info and tcf_ematch_ops.

Found using ./scripts/kernel-doc -none -Werror include/net/pkt_cls.h

Signed-off-by: Bijie Xu <bijie.xu@corigine.com>
Signed-off-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index ec7823921bd2..298a8d10168b 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -337,6 +337,9 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
 
 /**
  * struct tcf_pkt_info - packet information
+ *
+ * @ptr: start of the pkt data
+ * @nexthdr: offset of the next header
  */
 struct tcf_pkt_info {
 	unsigned char *		ptr;
@@ -355,6 +358,7 @@ struct tcf_ematch_ops;
  * @ops: the operations lookup table of the corresponding ematch module
  * @datalen: length of the ematch specific configuration data
  * @data: ematch specific data
+ * @net: the network namespace
  */
 struct tcf_ematch {
 	struct tcf_ematch_ops * ops;
-- 
cgit v1.2.3


From 4039146777a91e1576da2bf38e0d8a1061a1ae47 Mon Sep 17 00:00:00 2001
From: Antoine Tenart <atenart@kernel.org>
Date: Tue, 3 Aug 2021 12:00:16 +0200
Subject: net: ipv6: fix returned variable type in ip6_skb_dst_mtu

The patch fixing the returned value of ip6_skb_dst_mtu (int -> unsigned
int) was rebased between its initial review and the version applied. In
the meantime fade56410c22 was applied, which added a new variable (int)
used as the returned value. This lead to a mismatch between the function
prototype and the variable used as the return value.

Fixes: 40fc3054b458 ("net: ipv6: fix return value of ip6_skb_dst_mtu")
Cc: Vadim Fedorenko <vfedorenko@novek.ru>
Signed-off-by: Antoine Tenart <atenart@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 625a38ccb5d9..0bf09a9bca4e 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -265,7 +265,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 
 static inline unsigned int ip6_skb_dst_mtu(struct sk_buff *skb)
 {
-	int mtu;
+	unsigned int mtu;
 
 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 				inet6_sk(skb->sk) : NULL;
-- 
cgit v1.2.3


From 5f7b51bf09baca8e4f80cbe879536842bafb5f31 Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Wed, 28 Jul 2021 17:01:15 +0200
Subject: netfilter: ipset: Limit the maximal range of consecutive elements to
 add/delete

The range size of consecutive elements were not limited. Thus one could
define a huge range which may result soft lockup errors due to the long
execution time. Now the range size is limited to 2^20 entries.

Reported-by: Brad Spengler <spender@grsecurity.net>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter/ipset/ip_set.h       |  3 +++
 net/netfilter/ipset/ip_set_hash_ip.c         |  9 ++++++++-
 net/netfilter/ipset/ip_set_hash_ipmark.c     | 10 +++++++++-
 net/netfilter/ipset/ip_set_hash_ipport.c     |  3 +++
 net/netfilter/ipset/ip_set_hash_ipportip.c   |  3 +++
 net/netfilter/ipset/ip_set_hash_ipportnet.c  |  3 +++
 net/netfilter/ipset/ip_set_hash_net.c        | 11 ++++++++++-
 net/netfilter/ipset/ip_set_hash_netiface.c   | 10 +++++++++-
 net/netfilter/ipset/ip_set_hash_netnet.c     | 16 +++++++++++++++-
 net/netfilter/ipset/ip_set_hash_netport.c    | 11 ++++++++++-
 net/netfilter/ipset/ip_set_hash_netportnet.c | 16 +++++++++++++++-
 11 files changed, 88 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index 10279c4830ac..ada1296c87d5 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -196,6 +196,9 @@ struct ip_set_region {
 	u32 elements;		/* Number of elements vs timeout */
 };
 
+/* Max range where every element is added/deleted in one step */
+#define IPSET_MAX_RANGE		(1<<20)
+
 /* The max revision number supported by any set type + 1 */
 #define IPSET_REVISION_MAX	9
 
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index d1bef23fd4f5..dd30c03d5a23 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -132,8 +132,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
 		if (ret)
 			return ret;
-		if (ip > ip_to)
+		if (ip > ip_to) {
+			if (ip_to == 0)
+				return -IPSET_ERR_HASH_ELEM;
 			swap(ip, ip_to);
+		}
 	} else if (tb[IPSET_ATTR_CIDR]) {
 		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
 
@@ -144,6 +147,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1);
 
+	/* 64bit division is not allowed on 32bit */
+	if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE)
+		return -ERANGE;
+
 	if (retried) {
 		ip = ntohl(h->next.ip);
 		e.ip = htonl(ip);
diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c
index 18346d18aa16..153de3457423 100644
--- a/net/netfilter/ipset/ip_set_hash_ipmark.c
+++ b/net/netfilter/ipset/ip_set_hash_ipmark.c
@@ -121,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 
 	e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK]));
 	e.mark &= h->markmask;
+	if (e.mark == 0 && e.ip == 0)
+		return -IPSET_ERR_HASH_ELEM;
 
 	if (adt == IPSET_TEST ||
 	    !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) {
@@ -133,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
 		if (ret)
 			return ret;
-		if (ip > ip_to)
+		if (ip > ip_to) {
+			if (e.mark == 0 && ip_to == 0)
+				return -IPSET_ERR_HASH_ELEM;
 			swap(ip, ip_to);
+		}
 	} else if (tb[IPSET_ATTR_CIDR]) {
 		u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
 
@@ -143,6 +148,9 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[],
 		ip_set_mask_from_to(ip, ip_to, cidr);
 	}
 
+	if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE)
+		return -ERANGE;
+
 	if (retried)
 		ip = ntohl(h->next.ip);
 	for (; ip <= ip_to; ip++) {
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index e1ca11196515..7303138e46be 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -173,6 +173,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(port, port_to);
 	}
 
+	if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE)
+		return -ERANGE;
+
 	if (retried)
 		ip = ntohl(h->next.ip);
 	for (; ip <= ip_to; ip++) {
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index ab179e064597..334fb1ad0e86 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -180,6 +180,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(port, port_to);
 	}
 
+	if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE)
+		return -ERANGE;
+
 	if (retried)
 		ip = ntohl(h->next.ip);
 	for (; ip <= ip_to; ip++) {
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 8f075b44cf64..7df94f437f60 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -253,6 +253,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 			swap(port, port_to);
 	}
 
+	if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE)
+		return -ERANGE;
+
 	ip2_to = ip2_from;
 	if (tb[IPSET_ATTR_IP2_TO]) {
 		ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to);
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index c1a11f041ac6..1422739d9aa2 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -140,7 +140,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_net4_elem e = { .cidr = HOST_MASK };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
-	u32 ip = 0, ip_to = 0;
+	u32 ip = 0, ip_to = 0, ipn, n = 0;
 	int ret;
 
 	if (tb[IPSET_ATTR_LINENO])
@@ -188,6 +188,15 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
 		if (ip + UINT_MAX == ip_to)
 			return -IPSET_ERR_HASH_RANGE;
 	}
+	ipn = ip;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr);
+		n++;
+	} while (ipn++ < ip_to);
+
+	if (n > IPSET_MAX_RANGE)
+		return -ERANGE;
+
 	if (retried)
 		ip = ntohl(h->next.ip);
 	do {
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index ddd51c2e1cb3..9810f5bf63f5 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
-	u32 ip = 0, ip_to = 0;
+	u32 ip = 0, ip_to = 0, ipn, n = 0;
 	int ret;
 
 	if (tb[IPSET_ATTR_LINENO])
@@ -256,6 +256,14 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
 	} else {
 		ip_set_mask_from_to(ip, ip_to, e.cidr);
 	}
+	ipn = ip;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr);
+		n++;
+	} while (ipn++ < ip_to);
+
+	if (n > IPSET_MAX_RANGE)
+		return -ERANGE;
 
 	if (retried)
 		ip = ntohl(h->next.ip);
diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c
index 6532f0505e66..3d09eefe998a 100644
--- a/net/netfilter/ipset/ip_set_hash_netnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netnet.c
@@ -168,7 +168,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	struct hash_netnet4_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
 	u32 ip = 0, ip_to = 0;
-	u32 ip2 = 0, ip2_from = 0, ip2_to = 0;
+	u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn;
+	u64 n = 0, m = 0;
 	int ret;
 
 	if (tb[IPSET_ATTR_LINENO])
@@ -244,6 +245,19 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	} else {
 		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
 	}
+	ipn = ip;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]);
+		n++;
+	} while (ipn++ < ip_to);
+	ipn = ip2_from;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]);
+		m++;
+	} while (ipn++ < ip2_to);
+
+	if (n*m > IPSET_MAX_RANGE)
+		return -ERANGE;
 
 	if (retried) {
 		ip = ntohl(h->next.ip[0]);
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index ec1564a1cb5a..09cf72eb37f8 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -158,7 +158,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
-	u32 port, port_to, p = 0, ip = 0, ip_to = 0;
+	u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn;
+	u64 n = 0;
 	bool with_ports = false;
 	u8 cidr;
 	int ret;
@@ -235,6 +236,14 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
 	} else {
 		ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
 	}
+	ipn = ip;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr);
+		n++;
+	} while (ipn++ < ip_to);
+
+	if (n*(port_to - port + 1) > IPSET_MAX_RANGE)
+		return -ERANGE;
 
 	if (retried) {
 		ip = ntohl(h->next.ip);
diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c
index 0e91d1e82f1c..19bcdb3141f6 100644
--- a/net/netfilter/ipset/ip_set_hash_netportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_netportnet.c
@@ -182,7 +182,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	struct hash_netportnet4_elem e = { };
 	struct ip_set_ext ext = IP_SET_INIT_UEXT(set);
 	u32 ip = 0, ip_to = 0, p = 0, port, port_to;
-	u32 ip2_from = 0, ip2_to = 0, ip2;
+	u32 ip2_from = 0, ip2_to = 0, ip2, ipn;
+	u64 n = 0, m = 0;
 	bool with_ports = false;
 	int ret;
 
@@ -284,6 +285,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
 	} else {
 		ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]);
 	}
+	ipn = ip;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]);
+		n++;
+	} while (ipn++ < ip_to);
+	ipn = ip2_from;
+	do {
+		ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]);
+		m++;
+	} while (ipn++ < ip2_to);
+
+	if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE)
+		return -ERANGE;
 
 	if (retried) {
 		ip = ntohl(h->next.ip[0]);
-- 
cgit v1.2.3


From e04480920d1eec9c061841399aa6f35b6f987d8b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Wed, 4 Aug 2021 19:26:56 +0900
Subject: Bluetooth: defer cleanup of resources in hci_unregister_dev()

syzbot is hitting might_sleep() warning at hci_sock_dev_event() due to
calling lock_sock() with rw spinlock held [1].

It seems that history of this locking problem is a trial and error.

Commit b40df5743ee8 ("[PATCH] bluetooth: fix socket locking in
hci_sock_dev_event()") in 2.6.21-rc4 changed bh_lock_sock() to
lock_sock() as an attempt to fix lockdep warning.

Then, commit 4ce61d1c7a8e ("[BLUETOOTH]: Fix locking in
hci_sock_dev_event().") in 2.6.22-rc2 changed lock_sock() to
local_bh_disable() + bh_lock_sock_nested() as an attempt to fix the
sleep in atomic context warning.

Then, commit 4b5dd696f81b ("Bluetooth: Remove local_bh_disable() from
hci_sock.c") in 3.3-rc1 removed local_bh_disable().

Then, commit e305509e678b ("Bluetooth: use correct lock to prevent UAF
of hdev object") in 5.13-rc5 again changed bh_lock_sock_nested() to
lock_sock() as an attempt to fix CVE-2021-3573.

This difficulty comes from current implementation that
hci_sock_dev_event(HCI_DEV_UNREG) is responsible for dropping all
references from sockets because hci_unregister_dev() immediately
reclaims resources as soon as returning from
hci_sock_dev_event(HCI_DEV_UNREG).

But the history suggests that hci_sock_dev_event(HCI_DEV_UNREG) was not
doing what it should do.

Therefore, instead of trying to detach sockets from device, let's accept
not detaching sockets from device at hci_sock_dev_event(HCI_DEV_UNREG),
by moving actual cleanup of resources from hci_unregister_dev() to
hci_cleanup_dev() which is called by bt_host_release() when all
references to this unregistered device (which is a kobject) are gone.

Since hci_sock_dev_event(HCI_DEV_UNREG) no longer resets
hci_pi(sk)->hdev, we need to check whether this device was unregistered
and return an error based on HCI_UNREGISTER flag.  There might be subtle
behavioral difference in "monitor the hdev" functionality; please report
if you found something went wrong due to this patch.

Link: https://syzkaller.appspot.com/bug?extid=a5df189917e79d5e59c9 [1]
Reported-by: syzbot <syzbot+a5df189917e79d5e59c9@syzkaller.appspotmail.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Fixes: e305509e678b ("Bluetooth: use correct lock to prevent UAF of hdev object")
Acked-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_core.c         | 16 ++++++-------
 net/bluetooth/hci_sock.c         | 49 +++++++++++++++++++++++++++-------------
 net/bluetooth/hci_sysfs.c        |  3 +++
 4 files changed, 45 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index a53e94459ecd..db4312e44d47 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1230,6 +1230,7 @@ struct hci_dev *hci_alloc_dev(void);
 void hci_free_dev(struct hci_dev *hdev);
 int hci_register_dev(struct hci_dev *hdev);
 void hci_unregister_dev(struct hci_dev *hdev);
+void hci_cleanup_dev(struct hci_dev *hdev);
 int hci_suspend_dev(struct hci_dev *hdev);
 int hci_resume_dev(struct hci_dev *hdev);
 int hci_reset_dev(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 2560ed2f144d..e1a545c8a69f 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3996,14 +3996,10 @@ EXPORT_SYMBOL(hci_register_dev);
 /* Unregister HCI device */
 void hci_unregister_dev(struct hci_dev *hdev)
 {
-	int id;
-
 	BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
 
 	hci_dev_set_flag(hdev, HCI_UNREGISTER);
 
-	id = hdev->id;
-
 	write_lock(&hci_dev_list_lock);
 	list_del(&hdev->list);
 	write_unlock(&hci_dev_list_lock);
@@ -4038,7 +4034,14 @@ void hci_unregister_dev(struct hci_dev *hdev)
 	}
 
 	device_del(&hdev->dev);
+	/* Actual cleanup is deferred until hci_cleanup_dev(). */
+	hci_dev_put(hdev);
+}
+EXPORT_SYMBOL(hci_unregister_dev);
 
+/* Cleanup HCI device */
+void hci_cleanup_dev(struct hci_dev *hdev)
+{
 	debugfs_remove_recursive(hdev->debugfs);
 	kfree_const(hdev->hw_info);
 	kfree_const(hdev->fw_info);
@@ -4063,11 +4066,8 @@ void hci_unregister_dev(struct hci_dev *hdev)
 	hci_blocked_keys_clear(hdev);
 	hci_dev_unlock(hdev);
 
-	hci_dev_put(hdev);
-
-	ida_simple_remove(&hci_index_ida, id);
+	ida_simple_remove(&hci_index_ida, hdev->id);
 }
-EXPORT_SYMBOL(hci_unregister_dev);
 
 /* Suspend HCI device */
 int hci_suspend_dev(struct hci_dev *hdev)
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index b04a5a02ecf3..f1128c2134f0 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -59,6 +59,17 @@ struct hci_pinfo {
 	char              comm[TASK_COMM_LEN];
 };
 
+static struct hci_dev *hci_hdev_from_sock(struct sock *sk)
+{
+	struct hci_dev *hdev = hci_pi(sk)->hdev;
+
+	if (!hdev)
+		return ERR_PTR(-EBADFD);
+	if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+		return ERR_PTR(-EPIPE);
+	return hdev;
+}
+
 void hci_sock_set_flag(struct sock *sk, int nr)
 {
 	set_bit(nr, &hci_pi(sk)->flags);
@@ -759,19 +770,13 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event)
 	if (event == HCI_DEV_UNREG) {
 		struct sock *sk;
 
-		/* Detach sockets from device */
+		/* Wake up sockets using this dead device */
 		read_lock(&hci_sk_list.lock);
 		sk_for_each(sk, &hci_sk_list.head) {
-			lock_sock(sk);
 			if (hci_pi(sk)->hdev == hdev) {
-				hci_pi(sk)->hdev = NULL;
 				sk->sk_err = EPIPE;
-				sk->sk_state = BT_OPEN;
 				sk->sk_state_change(sk);
-
-				hci_dev_put(hdev);
 			}
-			release_sock(sk);
 		}
 		read_unlock(&hci_sk_list.lock);
 	}
@@ -930,10 +935,10 @@ static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg)
 static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
 				unsigned long arg)
 {
-	struct hci_dev *hdev = hci_pi(sk)->hdev;
+	struct hci_dev *hdev = hci_hdev_from_sock(sk);
 
-	if (!hdev)
-		return -EBADFD;
+	if (IS_ERR(hdev))
+		return PTR_ERR(hdev);
 
 	if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
 		return -EBUSY;
@@ -1103,6 +1108,18 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
 
 	lock_sock(sk);
 
+	/* Allow detaching from dead device and attaching to alive device, if
+	 * the caller wants to re-bind (instead of close) this socket in
+	 * response to hci_sock_dev_event(HCI_DEV_UNREG) notification.
+	 */
+	hdev = hci_pi(sk)->hdev;
+	if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+		hci_pi(sk)->hdev = NULL;
+		sk->sk_state = BT_OPEN;
+		hci_dev_put(hdev);
+	}
+	hdev = NULL;
+
 	if (sk->sk_state == BT_BOUND) {
 		err = -EALREADY;
 		goto done;
@@ -1379,9 +1396,9 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
 
 	lock_sock(sk);
 
-	hdev = hci_pi(sk)->hdev;
-	if (!hdev) {
-		err = -EBADFD;
+	hdev = hci_hdev_from_sock(sk);
+	if (IS_ERR(hdev)) {
+		err = PTR_ERR(hdev);
 		goto done;
 	}
 
@@ -1743,9 +1760,9 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
 		goto done;
 	}
 
-	hdev = hci_pi(sk)->hdev;
-	if (!hdev) {
-		err = -EBADFD;
+	hdev = hci_hdev_from_sock(sk);
+	if (IS_ERR(hdev)) {
+		err = PTR_ERR(hdev);
 		goto done;
 	}
 
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 9874844a95a9..b69d88b88d2e 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -83,6 +83,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn)
 static void bt_host_release(struct device *dev)
 {
 	struct hci_dev *hdev = to_hci_dev(dev);
+
+	if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+		hci_cleanup_dev(hdev);
 	kfree(hdev);
 	module_put(THIS_MODULE);
 }
-- 
cgit v1.2.3


From a6e57c4af12bbacf927d7321c3aa894948653688 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 3 Aug 2021 00:15:54 +0200
Subject: netfilter: nfnetlink_hook: missing chain family

The family is relevant for pseudo-families like NFPROTO_INET
otherwise the user needs to rely on the hook function name to
differentiate it from NFPROTO_IPV4 and NFPROTO_IPV6 names.

Add nfnl_hook_chain_desc_attributes instead of using the existing
NFTA_CHAIN_* attributes, since these do not provide a family number.

Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_hook.h | 9 +++++++++
 net/netfilter/nfnetlink_hook.c                | 8 ++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h
index 912ec60b26b0..bbcd285b22e1 100644
--- a/include/uapi/linux/netfilter/nfnetlink_hook.h
+++ b/include/uapi/linux/netfilter/nfnetlink_hook.h
@@ -43,6 +43,15 @@ enum nfnl_hook_chain_info_attributes {
 };
 #define NFNLA_HOOK_INFO_MAX (__NFNLA_HOOK_INFO_MAX - 1)
 
+enum nfnl_hook_chain_desc_attributes {
+	NFNLA_CHAIN_UNSPEC,
+	NFNLA_CHAIN_TABLE,
+	NFNLA_CHAIN_FAMILY,
+	NFNLA_CHAIN_NAME,
+	__NFNLA_CHAIN_MAX,
+};
+#define NFNLA_CHAIN_MAX (__NFNLA_CHAIN_MAX - 1)
+
 /**
  * enum nfnl_hook_chaintype - chain type
  *
diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c
index ca453c61dbdf..e0ff2973fd14 100644
--- a/net/netfilter/nfnetlink_hook.c
+++ b/net/netfilter/nfnetlink_hook.c
@@ -89,11 +89,15 @@ static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb,
 	if (!nest2)
 		goto cancel_nest;
 
-	ret = nla_put_string(nlskb, NFTA_CHAIN_TABLE, chain->table->name);
+	ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name);
 	if (ret)
 		goto cancel_nest;
 
-	ret = nla_put_string(nlskb, NFTA_CHAIN_NAME, chain->name);
+	ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name);
+	if (ret)
+		goto cancel_nest;
+
+	ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family);
 	if (ret)
 		goto cancel_nest;
 
-- 
cgit v1.2.3


From 4592ee7f525c4683ec9e290381601fdee50ae110 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 4 Aug 2021 15:02:15 +0200
Subject: netfilter: conntrack: remove offload_pickup sysctl again

These two sysctls were added because the hardcoded defaults (2 minutes,
tcp, 30 seconds, udp) turned out to be too low for some setups.

They appeared in 5.14-rc1 so it should be fine to remove it again.

Marcelo convinced me that there should be no difference between a flow
that was offloaded vs. a flow that was not wrt. timeout handling.
Thus the default is changed to those for TCP established and UDP stream,
5 days and 120 seconds, respectively.

Marcelo also suggested to account for the timeout value used for the
offloading, this avoids increase beyond the value in the conntrack-sysctl
and will also instantly expire the conntrack entry with altered sysctls.

Example:
   nf_conntrack_udp_timeout_stream=60
   nf_flowtable_udp_timeout=60

This will remove offloaded udp flows after one minute, rather than two.

An earlier version of this patch also cleared the ASSURED bit to
allow nf_conntrack to evict the entry via early_drop (i.e., table full).
However, it looks like we can safely assume that connection timed out
via HW is still in established state, so this isn't needed.

Quoting Oz:
 [..] the hardware sends all packets with a set FIN flags to sw.
 [..] Connections that are aged in hardware are expected to be in the
 established state.

In case it turns out that back-to-sw-path transition can occur for
'dodgy' connections too (e.g., one side disappeared while software-path
would have been in RETRANS timeout), we can adjust this later.

Cc: Oz Shlomo <ozsh@nvidia.com>
Cc: Paul Blakey <paulb@nvidia.com>
Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Reviewed-by: Oz Shlomo <ozsh@nvidia.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/nf_conntrack-sysctl.rst | 10 ----------
 include/net/netns/conntrack.h                    |  2 --
 net/netfilter/nf_conntrack_proto_tcp.c           |  1 -
 net/netfilter/nf_conntrack_proto_udp.c           |  1 -
 net/netfilter/nf_conntrack_standalone.c          | 16 ----------------
 net/netfilter/nf_flow_table_core.c               | 11 ++++++++---
 6 files changed, 8 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index d31ed6c1cb0d..024d784157c8 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -191,19 +191,9 @@ nf_flowtable_tcp_timeout - INTEGER (seconds)
         TCP connections may be offloaded from nf conntrack to nf flow table.
         Once aged, the connection is returned to nf conntrack with tcp pickup timeout.
 
-nf_flowtable_tcp_pickup - INTEGER (seconds)
-        default 120
-
-        TCP connection timeout after being aged from nf flow table offload.
-
 nf_flowtable_udp_timeout - INTEGER (seconds)
         default 30
 
         Control offload timeout for udp connections.
         UDP connections may be offloaded from nf conntrack to nf flow table.
         Once aged, the connection is returned to nf conntrack with udp pickup timeout.
-
-nf_flowtable_udp_pickup - INTEGER (seconds)
-        default 30
-
-        UDP connection timeout after being aged from nf flow table offload.
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 37e5300c7e5a..fefd38db95b3 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -30,7 +30,6 @@ struct nf_tcp_net {
 	u8 tcp_ignore_invalid_rst;
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	unsigned int offload_timeout;
-	unsigned int offload_pickup;
 #endif
 };
 
@@ -44,7 +43,6 @@ struct nf_udp_net {
 	unsigned int timeouts[UDP_CT_MAX];
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	unsigned int offload_timeout;
-	unsigned int offload_pickup;
 #endif
 };
 
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3259416f2ea4..af5115e127cf 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1478,7 +1478,6 @@ void nf_conntrack_tcp_init_net(struct net *net)
 
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	tn->offload_timeout = 30 * HZ;
-	tn->offload_pickup = 120 * HZ;
 #endif
 }
 
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 698fee49e732..f8e3c0d2602f 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -271,7 +271,6 @@ void nf_conntrack_udp_init_net(struct net *net)
 
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	un->offload_timeout = 30 * HZ;
-	un->offload_pickup = 30 * HZ;
 #endif
 }
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 214d9f9e499b..e84b499b7bfa 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -575,7 +575,6 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK,
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP,
 #endif
 	NF_SYSCTL_CT_PROTO_TCP_LOOSE,
 	NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
@@ -585,7 +584,6 @@ enum nf_ct_sysctl_index {
 	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD,
-	NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP,
 #endif
 	NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
 	NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
@@ -776,12 +774,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP] = {
-		.procname	= "nf_flowtable_tcp_pickup",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
 #endif
 	[NF_SYSCTL_CT_PROTO_TCP_LOOSE] = {
 		.procname	= "nf_conntrack_tcp_loose",
@@ -832,12 +824,6 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
-	[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP] = {
-		.procname	= "nf_flowtable_udp_pickup",
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_jiffies,
-	},
 #endif
 	[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = {
 		.procname	= "nf_conntrack_icmp_timeout",
@@ -1018,7 +1004,6 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net,
 
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout;
-	table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP].data = &tn->offload_pickup;
 #endif
 
 }
@@ -1111,7 +1096,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 	table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED];
 #if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
 	table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout;
-	table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP].data = &un->offload_pickup;
 #endif
 
 	nf_conntrack_standalone_init_tcp_sysctl(net, table);
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 551976e4284c..8788b519255e 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -183,7 +183,7 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 	const struct nf_conntrack_l4proto *l4proto;
 	struct net *net = nf_ct_net(ct);
 	int l4num = nf_ct_protonum(ct);
-	unsigned int timeout;
+	s32 timeout;
 
 	l4proto = nf_ct_l4proto_find(l4num);
 	if (!l4proto)
@@ -192,15 +192,20 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
 	if (l4num == IPPROTO_TCP) {
 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
 
-		timeout = tn->offload_pickup;
+		timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
+		timeout -= tn->offload_timeout;
 	} else if (l4num == IPPROTO_UDP) {
 		struct nf_udp_net *tn = nf_udp_pernet(net);
 
-		timeout = tn->offload_pickup;
+		timeout = tn->timeouts[UDP_CT_REPLIED];
+		timeout -= tn->offload_timeout;
 	} else {
 		return;
 	}
 
+	if (timeout < 0)
+		timeout = 0;
+
 	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
 		ct->timeout = nfct_time_stamp + timeout;
 }
-- 
cgit v1.2.3


From 1027b96ec9d34f9abab69bc1a4dc5b1ad8ab1349 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Fri, 6 Aug 2021 16:21:24 +0800
Subject: once: Fix panic when module unload

DO_ONCE
DEFINE_STATIC_KEY_TRUE(___once_key);
__do_once_done
  once_disable_jump(once_key);
    INIT_WORK(&w->work, once_deferred);
    struct once_work *w;
    w->key = key;
    schedule_work(&w->work);                     module unload
                                                   //*the key is
destroy*
process_one_work
  once_deferred
    BUG_ON(!static_key_enabled(work->key));
       static_key_count((struct static_key *)x)    //*access key, crash*

When module uses DO_ONCE mechanism, it could crash due to the above
concurrency problem, we could reproduce it with link[1].

Fix it by add/put module refcount in the once work process.

[1] https://lore.kernel.org/netdev/eaa6c371-465e-57eb-6be9-f4b16b9d7cbf@huawei.com/

Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Reported-by: Minmin chen <chenmingmin@huawei.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/once.h |  4 ++--
 lib/once.c           | 11 ++++++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/once.h b/include/linux/once.h
index 9225ee6d96c7..ae6f4eb41cbe 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -7,7 +7,7 @@
 
 bool __do_once_start(bool *done, unsigned long *flags);
 void __do_once_done(bool *done, struct static_key_true *once_key,
-		    unsigned long *flags);
+		    unsigned long *flags, struct module *mod);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
  * a function call such as initialization of random seeds, etc, only
@@ -46,7 +46,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key,
 			if (unlikely(___ret)) {				     \
 				func(__VA_ARGS__);			     \
 				__do_once_done(&___done, &___once_key,	     \
-					       &___flags);		     \
+					       &___flags, THIS_MODULE);	     \
 			}						     \
 		}							     \
 		___ret;							     \
diff --git a/lib/once.c b/lib/once.c
index 8b7d6235217e..59149bf3bfb4 100644
--- a/lib/once.c
+++ b/lib/once.c
@@ -3,10 +3,12 @@
 #include <linux/spinlock.h>
 #include <linux/once.h>
 #include <linux/random.h>
+#include <linux/module.h>
 
 struct once_work {
 	struct work_struct work;
 	struct static_key_true *key;
+	struct module *module;
 };
 
 static void once_deferred(struct work_struct *w)
@@ -16,10 +18,11 @@ static void once_deferred(struct work_struct *w)
 	work = container_of(w, struct once_work, work);
 	BUG_ON(!static_key_enabled(work->key));
 	static_branch_disable(work->key);
+	module_put(work->module);
 	kfree(work);
 }
 
-static void once_disable_jump(struct static_key_true *key)
+static void once_disable_jump(struct static_key_true *key, struct module *mod)
 {
 	struct once_work *w;
 
@@ -29,6 +32,8 @@ static void once_disable_jump(struct static_key_true *key)
 
 	INIT_WORK(&w->work, once_deferred);
 	w->key = key;
+	w->module = mod;
+	__module_get(mod);
 	schedule_work(&w->work);
 }
 
@@ -53,11 +58,11 @@ bool __do_once_start(bool *done, unsigned long *flags)
 EXPORT_SYMBOL(__do_once_start);
 
 void __do_once_done(bool *done, struct static_key_true *once_key,
-		    unsigned long *flags)
+		    unsigned long *flags, struct module *mod)
 	__releases(once_lock)
 {
 	*done = true;
 	spin_unlock_irqrestore(&once_lock, *flags);
-	once_disable_jump(once_key);
+	once_disable_jump(once_key, mod);
 }
 EXPORT_SYMBOL(__do_once_done);
-- 
cgit v1.2.3


From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Aug 2021 21:45:32 +0200
Subject: bpf: Add _kernel suffix to internal lockdown_bpf_read

Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming
more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/security.h | 2 +-
 kernel/bpf/helpers.c     | 4 ++--
 kernel/trace/bpf_trace.c | 8 ++++----
 security/security.c      | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 24eda04221e9..724d7a4a0c91 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -123,7 +123,7 @@ enum lockdown_reason {
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
-	LOCKDOWN_BPF_READ,
+	LOCKDOWN_BPF_READ_KERNEL,
 	LOCKDOWN_PERF,
 	LOCKDOWN_TRACEFS,
 	LOCKDOWN_XMON_RW,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 62cf00383910..0b04553e8c44 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_probe_read_user:
 		return &bpf_probe_read_user_proto;
 	case BPF_FUNC_probe_read_kernel:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_proto;
 	case BPF_FUNC_probe_read_user_str:
 		return &bpf_probe_read_user_str_proto;
 	case BPF_FUNC_probe_read_kernel_str:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_str_proto;
 	case BPF_FUNC_snprintf_btf:
 		return &bpf_snprintf_btf_proto;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b4916ef388ad..1836591197a5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -999,19 +999,19 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_probe_read_user:
 		return &bpf_probe_read_user_proto;
 	case BPF_FUNC_probe_read_kernel:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_proto;
 	case BPF_FUNC_probe_read_user_str:
 		return &bpf_probe_read_user_str_proto;
 	case BPF_FUNC_probe_read_kernel_str:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_kernel_str_proto;
 #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
 	case BPF_FUNC_probe_read:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_compat_proto;
 	case BPF_FUNC_probe_read_str:
-		return security_locked_down(LOCKDOWN_BPF_READ) < 0 ?
+		return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ?
 		       NULL : &bpf_probe_read_compat_str_proto;
 #endif
 #ifdef CONFIG_CGROUPS
diff --git a/security/security.c b/security/security.c
index 09533cbb7221..6b83ab4e9d66 100644
--- a/security/security.c
+++ b/security/security.c
@@ -61,7 +61,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
 	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
 	[LOCKDOWN_KCORE] = "/proc/kcore access",
 	[LOCKDOWN_KPROBES] = "use of kprobes",
-	[LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM",
+	[LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM",
 	[LOCKDOWN_PERF] = "unsafe use of perf",
 	[LOCKDOWN_TRACEFS] = "use of tracefs",
 	[LOCKDOWN_XMON_RW] = "xmon read and write access",
-- 
cgit v1.2.3


From beb7f2de5728b0bd2140a652fa51f6ad85d159f7 Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Sun, 8 Aug 2021 09:52:42 +0300
Subject: psample: Add a fwd declaration for skbuff

Without this there is a warning if source files include psample.h
before skbuff.h or doesn't include it at all.

Fixes: 6ae0a6286171 ("net: Introduce psample, a new genetlink channel for packet sampling")
Signed-off-by: Roi Dayan <roid@nvidia.com>
Link: https://lore.kernel.org/r/20210808065242.1522535-1-roid@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/psample.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/psample.h b/include/net/psample.h
index e328c5127757..0509d2d6be67 100644
--- a/include/net/psample.h
+++ b/include/net/psample.h
@@ -31,6 +31,8 @@ struct psample_group *psample_group_get(struct net *net, u32 group_num);
 void psample_group_take(struct psample_group *group);
 void psample_group_put(struct psample_group *group);
 
+struct sk_buff;
+
 #if IS_ENABLED(CONFIG_PSAMPLE)
 
 void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
-- 
cgit v1.2.3


From 563476ae0c5e48a028cbfa38fa9d2fc0418eb88f Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Sun, 11 Apr 2021 15:32:55 +0300
Subject: net/mlx5: Synchronize correct IRQ when destroying CQ

The CQ destroy is performed based on the IRQ number that is stored in
cq->irqn. That number wasn't set explicitly during CQ creation and as
expected some of the API users of mlx5_core_create_cq() forgot to update
it.

This caused to wrong synchronization call of the wrong IRQ with a number
0 instead of the real one.

As a fix, set the IRQ number directly in the mlx5_core_create_cq() and
update all users accordingly.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Fixes: ef1659ade359 ("IB/mlx5: Add DEVX support for CQ events")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/cq.c                      |  4 +---
 drivers/infiniband/hw/mlx5/devx.c                    |  3 +--
 drivers/net/ethernet/mellanox/mlx5/core/cq.c         |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c    | 13 ++-----------
 drivers/net/ethernet/mellanox/mlx5/core/eq.c         | 20 ++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c  |  4 +---
 drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h     |  2 ++
 .../ethernet/mellanox/mlx5/core/steering/dr_send.c   |  4 +---
 drivers/vdpa/mlx5/net/mlx5_vnet.c                    |  3 +--
 include/linux/mlx5/driver.h                          |  3 +--
 10 files changed, 27 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 7abeb576b3c5..b8e5e371bb19 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -945,7 +945,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 	u32 *cqb = NULL;
 	void *cqc;
 	int cqe_size;
-	unsigned int irqn;
 	int eqn;
 	int err;
 
@@ -984,7 +983,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
 	}
 
-	err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn);
+	err = mlx5_vector2eqn(dev->mdev, vector, &eqn);
 	if (err)
 		goto err_cqb;
 
@@ -1007,7 +1006,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		goto err_cqb;
 
 	mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
-	cq->mcq.irqn = irqn;
 	if (udata)
 		cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
 	else
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index eb9b0a2707f8..c869b2a91a28 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -975,7 +975,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)(
 	struct mlx5_ib_dev *dev;
 	int user_vector;
 	int dev_eqn;
-	unsigned int irqn;
 	int err;
 
 	if (uverbs_copy_from(&user_vector, attrs,
@@ -987,7 +986,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)(
 		return PTR_ERR(c);
 	dev = to_mdev(c->ibucontext.device);
 
-	err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn);
+	err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn);
 	if (err < 0)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
index df3e4938ecdd..360e093874d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -134,6 +134,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
 			      cq->cqn);
 
 	cq->uar = dev->priv.uar;
+	cq->irqn = eq->core.irqn;
 
 	return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index fd250f7bcd88..24f919ef9b8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1535,15 +1535,9 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv,
 {
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5_core_cq *mcq = &cq->mcq;
-	int eqn_not_used;
-	unsigned int irqn;
 	int err;
 	u32 i;
 
-	err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn);
-	if (err)
-		return err;
-
 	err = mlx5_cqwq_create(mdev, &param->wq, param->cqc, &cq->wq,
 			       &cq->wq_ctrl);
 	if (err)
@@ -1557,7 +1551,6 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv,
 	mcq->vector     = param->eq_ix;
 	mcq->comp       = mlx5e_completion_event;
 	mcq->event      = mlx5e_cq_error_event;
-	mcq->irqn       = irqn;
 
 	for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
 		struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
@@ -1605,11 +1598,10 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 	void *in;
 	void *cqc;
 	int inlen;
-	unsigned int irqn_not_used;
 	int eqn;
 	int err;
 
-	err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used);
+	err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn);
 	if (err)
 		return err;
 
@@ -1983,9 +1975,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
 	struct mlx5e_channel *c;
 	unsigned int irq;
 	int err;
-	int eqn;
 
-	err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq);
+	err = mlx5_vector2irqn(priv->mdev, ix, &irq);
 	if (err)
 		return err;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 6e074cc457de..605c8ecc3610 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -855,8 +855,8 @@ clean:
 	return err;
 }
 
-int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
-		    unsigned int *irqn)
+static int vector2eqnirqn(struct mlx5_core_dev *dev, int vector, int *eqn,
+			  unsigned int *irqn)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	struct mlx5_eq_comp *eq, *n;
@@ -865,8 +865,10 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 
 	list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) {
 		if (i++ == vector) {
-			*eqn = eq->core.eqn;
-			*irqn = eq->core.irqn;
+			if (irqn)
+				*irqn = eq->core.irqn;
+			if (eqn)
+				*eqn = eq->core.eqn;
 			err = 0;
 			break;
 		}
@@ -874,8 +876,18 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
 
 	return err;
 }
+
+int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn)
+{
+	return vector2eqnirqn(dev, vector, eqn, NULL);
+}
 EXPORT_SYMBOL(mlx5_vector2eqn);
 
+int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn)
+{
+	return vector2eqnirqn(dev, vector, NULL, irqn);
+}
+
 unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev)
 {
 	return dev->priv.eq_table->num_comp_eqs;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
index bd66ab2af5b5..d5da4ab65766 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c
@@ -417,7 +417,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	struct mlx5_wq_param wqp;
 	struct mlx5_cqe64 *cqe;
 	int inlen, err, eqn;
-	unsigned int irqn;
 	void *cqc, *in;
 	__be64 *pas;
 	u32 i;
@@ -446,7 +445,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 		goto err_cqwq;
 	}
 
-	err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn);
+	err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn);
 	if (err) {
 		kvfree(in);
 		goto err_cqwq;
@@ -476,7 +475,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size)
 	*conn->cq.mcq.arm_db    = 0;
 	conn->cq.mcq.vector     = 0;
 	conn->cq.mcq.comp       = mlx5_fpga_conn_cq_complete;
-	conn->cq.mcq.irqn       = irqn;
 	conn->cq.mcq.uar        = fdev->conn_res.uar;
 	tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
index 624cedebb510..d3d628b862f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h
@@ -104,4 +104,6 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev);
 struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev);
 #endif
 
+int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn);
+
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
index 12cf323a5943..9df0e73d1c35 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c
@@ -749,7 +749,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
 	struct mlx5_cqe64 *cqe;
 	struct mlx5dr_cq *cq;
 	int inlen, err, eqn;
-	unsigned int irqn;
 	void *cqc, *in;
 	__be64 *pas;
 	int vector;
@@ -782,7 +781,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
 		goto err_cqwq;
 
 	vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev);
-	err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn);
+	err = mlx5_vector2eqn(mdev, vector, &eqn);
 	if (err) {
 		kvfree(in);
 		goto err_cqwq;
@@ -818,7 +817,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev,
 	*cq->mcq.arm_db = cpu_to_be32(2 << 28);
 
 	cq->mcq.vector = 0;
-	cq->mcq.irqn = irqn;
 	cq->mcq.uar = uar;
 
 	return cq;
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 2a31467f7ac5..379a19144a25 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -526,7 +526,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
-	unsigned int irqn;
 	__be64 *pas;
 	int inlen;
 	void *cqc;
@@ -566,7 +565,7 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
 	/* Use vector 0 by default. Consider adding code to choose least used
 	 * vector.
 	 */
-	err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn);
+	err = mlx5_vector2eqn(mdev, 0, &eqn);
 	if (err)
 		goto err_vec;
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1efe37466969..25a8be58d289 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1044,8 +1044,7 @@ void mlx5_unregister_debugfs(void);
 void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas);
 void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm);
 void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas);
-int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
-		    unsigned int *irqn);
+int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn);
 int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
 int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
 
-- 
cgit v1.2.3


From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 9 Aug 2021 12:43:17 +0200
Subject: bpf: Add lockdown check for probe_write_user helper

Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper
to be called in tracers") added the bpf_probe_write_user() helper in order
to allow to override user space memory. Its original goal was to have a
facility to "debug, divert, and manipulate execution of semi-cooperative
processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed
since it would otherwise tamper with its integrity.

One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of
using bpf_probe_write_user bpf helper") where the program DNATs traffic
at the time of connect(2) syscall, meaning, it rewrites the arguments to
a syscall while they're still in userspace, and before the syscall has a
chance to copy the argument into kernel space. These days we have better
mechanisms in BPF for achieving the same (e.g. for load-balancers), but
without having to write to userspace memory.

Of course the bpf_probe_write_user() helper can also be used to abuse
many other things for both good or bad purpose. Outside of BPF, there is
a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and
PTRACE_POKE{TEXT,DATA}, but would likely require some more effort.
Commit 96ae52279594 explicitly dedicated the helper for experimentation
purpose only. Thus, move the helper's availability behind a newly added
LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under
the "integrity" mode. More fine-grained control can be implemented also
from LSM side with this change.

Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
---
 include/linux/security.h | 1 +
 kernel/trace/bpf_trace.c | 5 +++--
 security/security.c      | 1 +
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/security.h b/include/linux/security.h
index 724d7a4a0c91..5b7288521300 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -120,6 +120,7 @@ enum lockdown_reason {
 	LOCKDOWN_MMIOTRACE,
 	LOCKDOWN_DEBUGFS,
 	LOCKDOWN_XMON_WR,
+	LOCKDOWN_BPF_WRITE_USER,
 	LOCKDOWN_INTEGRITY_MAX,
 	LOCKDOWN_KCORE,
 	LOCKDOWN_KPROBES,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1836591197a5..fdd14072fc3b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -990,12 +990,13 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_numa_node_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
-	case BPF_FUNC_probe_write_user:
-		return bpf_get_probe_write_proto();
 	case BPF_FUNC_current_task_under_cgroup:
 		return &bpf_current_task_under_cgroup_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_probe_write_user:
+		return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+		       NULL : bpf_get_probe_write_proto();
 	case BPF_FUNC_probe_read_user:
 		return &bpf_probe_read_user_proto;
 	case BPF_FUNC_probe_read_kernel:
diff --git a/security/security.c b/security/security.c
index 6b83ab4e9d66..9ffa9e9c5c55 100644
--- a/security/security.c
+++ b/security/security.c
@@ -58,6 +58,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = {
 	[LOCKDOWN_MMIOTRACE] = "unsafe mmio",
 	[LOCKDOWN_DEBUGFS] = "debugfs access",
 	[LOCKDOWN_XMON_WR] = "xmon write access",
+	[LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
 	[LOCKDOWN_INTEGRITY_MAX] = "integrity",
 	[LOCKDOWN_KCORE] = "/proc/kcore access",
 	[LOCKDOWN_KPROBES] = "use of kprobes",
-- 
cgit v1.2.3


From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 9 Aug 2021 18:04:13 -0700
Subject: bpf: Fix potentially incorrect results with bpf_get_local_storage()

Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage()
helper") fixed a bug for bpf_get_local_storage() helper so different tasks
won't mess up with each other's percpu local storage.

The percpu data contains 8 slots so it can hold up to 8 contexts (same or
different tasks), for 8 different program runs, at the same time. This in
general is sufficient. But our internal testing showed the following warning
multiple times:

  [...]
  warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193
     __cgroup_bpf_run_filter_sock_ops+0x13e/0x180
  RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180
  <IRQ>
   tcp_call_bpf.constprop.99+0x93/0xc0
   tcp_conn_request+0x41e/0xa50
   ? tcp_rcv_state_process+0x203/0xe00
   tcp_rcv_state_process+0x203/0xe00
   ? sk_filter_trim_cap+0xbc/0x210
   ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160
   tcp_v6_do_rcv+0x181/0x3e0
   tcp_v6_rcv+0xc65/0xcb0
   ip6_protocol_deliver_rcu+0xbd/0x450
   ip6_input_finish+0x11/0x20
   ip6_input+0xb5/0xc0
   ip6_sublist_rcv_finish+0x37/0x50
   ip6_sublist_rcv+0x1dc/0x270
   ipv6_list_rcv+0x113/0x140
   __netif_receive_skb_list_core+0x1a0/0x210
   netif_receive_skb_list_internal+0x186/0x2a0
   gro_normal_list.part.170+0x19/0x40
   napi_complete_done+0x65/0x150
   mlx5e_napi_poll+0x1ae/0x680
   __napi_poll+0x25/0x120
   net_rx_action+0x11e/0x280
   __do_softirq+0xbb/0x271
   irq_exit_rcu+0x97/0xa0
   common_interrupt+0x7f/0xa0
   </IRQ>
   asm_common_interrupt+0x1e/0x40
  RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac
   ? __cgroup_bpf_run_filter_skb+0x378/0x4e0
   ? do_softirq+0x34/0x70
   ? ip6_finish_output2+0x266/0x590
   ? ip6_finish_output+0x66/0xa0
   ? ip6_output+0x6c/0x130
   ? ip6_xmit+0x279/0x550
   ? ip6_dst_check+0x61/0xd0
  [...]

Using drgn [0] to dump the percpu buffer contents showed that on this CPU
slot 0 is still available, but slots 1-7 are occupied and those tasks in
slots 1-7 mostly don't exist any more. So we might have issues in
bpf_cgroup_storage_unset().

Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset().
Currently, it tries to unset "current" slot with searching from the start.
So the following sequence is possible:

  1. A task is running and claims slot 0
  2. Running BPF program is done, and it checked slot 0 has the "task"
     and ready to reset it to NULL (not yet).
  3. An interrupt happens, another BPF program runs and it claims slot 1
     with the *same* task.
  4. The unset() in interrupt context releases slot 0 since it matches "task".
  5. Interrupt is done, the task in process context reset slot 0.

At the end, slot 1 is not reset and the same process can continue to occupy
slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF
program won't be able to claim an empty slot and a warning will be issued.

To fix the issue, for unset() function, we should traverse from the last slot
to the first. This way, the above issue can be avoided.

The same reverse traversal should also be done in bpf_get_local_storage() helper
itself. Otherwise, incorrect local storage may be returned to BPF program.

  [0] https://github.com/osandov/drgn

Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper")
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com
---
 include/linux/bpf-cgroup.h | 4 ++--
 kernel/bpf/helpers.c       | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8b77d08d4b47..6c9b10d82c80 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void)
 {
 	int i;
 
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
 			continue;
 
 		this_cpu_write(bpf_cgroup_storage_info[i].task, NULL);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 0b04553e8c44..7a97b2f4747d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
 	void *ptr;
 	int i;
 
-	for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) {
-		if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
+	for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) {
+		if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current))
 			continue;
 
 		storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]);
-- 
cgit v1.2.3


From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jul 2021 23:51:47 +0200
Subject: PCI/MSI: Protect msi_desc::masked for multi-MSI

Multi-MSI uses a single MSI descriptor and there is a single mask register
when the device supports per vector masking. To avoid reading back the mask
register the value is cached in the MSI descriptor and updates are done by
clearing and setting bits in the cache and writing it to the device.

But nothing protects msi_desc::masked and the mask register from being
modified concurrently on two different CPUs for two different Linux
interrupts which belong to the same multi-MSI descriptor.

Add a lock to struct device and protect any operation on the mask and the
mask register with it.

This makes the update of msi_desc::masked unconditional, but there is no
place which requires a modification of the hardware register without
updating the masked cache.

msi_mask_irq() is now an empty wrapper which will be cleaned up in follow
up changes.

The problem goes way back to the initial support of multi-MSI, but picking
the commit which introduced the mask cache is a valid cut off point
(2.6.30).

Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de
---
 drivers/base/core.c    |  1 +
 drivers/pci/msi.c      | 19 ++++++++++---------
 include/linux/device.h |  1 +
 include/linux/msi.h    |  2 +-
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f6360490a4a3..6c0ef9d55a34 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2837,6 +2837,7 @@ void device_initialize(struct device *dev)
 	device_pm_init(dev);
 	set_dev_node(dev, -1);
 #ifdef CONFIG_GENERIC_MSI_IRQ
+	raw_spin_lock_init(&dev->msi_lock);
 	INIT_LIST_HEAD(&dev->msi_list);
 #endif
 	INIT_LIST_HEAD(&dev->links.consumers);
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index f0f7026b7ac0..e5e75331b415 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -143,24 +143,25 @@ static inline __attribute_const__ u32 msi_mask(unsigned x)
  * reliably as devices without an INTx disable bit will then generate a
  * level IRQ which will never be cleared.
  */
-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
+void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 {
-	u32 mask_bits = desc->masked;
+	raw_spinlock_t *lock = &desc->dev->msi_lock;
+	unsigned long flags;
 
 	if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit)
-		return 0;
+		return;
 
-	mask_bits &= ~mask;
-	mask_bits |= flag;
+	raw_spin_lock_irqsave(lock, flags);
+	desc->masked &= ~mask;
+	desc->masked |= flag;
 	pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos,
-			       mask_bits);
-
-	return mask_bits;
+			       desc->masked);
+	raw_spin_unlock_irqrestore(lock, flags);
 }
 
 static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
 {
-	desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
+	__pci_msi_desc_mask_irq(desc, mask, flag);
 }
 
 static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
diff --git a/include/linux/device.h b/include/linux/device.h
index 59940f1744c1..e53aa5065f58 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -506,6 +506,7 @@ struct device {
 	struct dev_pin_info	*pins;
 #endif
 #ifdef CONFIG_GENERIC_MSI_IRQ
+	raw_spinlock_t		msi_lock;
 	struct list_head	msi_list;
 #endif
 #ifdef CONFIG_DMA_OPS
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 6aff469e511d..e8bdcb83172b 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 
 u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag);
-u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
+void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag);
 void pci_msi_mask_irq(struct irq_data *data);
 void pci_msi_unmask_irq(struct irq_data *data);
 
-- 
cgit v1.2.3


From 826da771291fc25a428e871f9e7fb465e390f852 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 29 Jul 2021 23:51:48 +0200
Subject: genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP

X86 IO/APIC and MSI interrupts (when used without interrupts remapping)
require that the affinity setup on startup is done before the interrupt is
enabled for the first time as the non-remapped operation mode cannot safely
migrate enabled interrupts from arbitrary contexts. Provide a new irq chip
flag which allows affected hardware to request this.

This has to be opt-in because there have been reports in the past that some
interrupt chips cannot handle affinity setting before startup.

Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)")
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marc Zyngier <maz@kernel.org>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de
---
 include/linux/irq.h | 2 ++
 kernel/irq/chip.c   | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8e9a9ae471a6..c8293c817646 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -569,6 +569,7 @@ struct irq_chip {
  * IRQCHIP_SUPPORTS_NMI:              Chip can deliver NMIs, only for root irqchips
  * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND:  Invokes __enable_irq()/__disable_irq() for wake irqs
  *                                    in the suspend path if they are in disabled state
+ * IRQCHIP_AFFINITY_PRE_STARTUP:      Default affinity update before startup
  */
 enum {
 	IRQCHIP_SET_TYPE_MASKED			= (1 <<  0),
@@ -581,6 +582,7 @@ enum {
 	IRQCHIP_SUPPORTS_LEVEL_MSI		= (1 <<  7),
 	IRQCHIP_SUPPORTS_NMI			= (1 <<  8),
 	IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND	= (1 <<  9),
+	IRQCHIP_AFFINITY_PRE_STARTUP		= (1 << 10),
 };
 
 #include <linux/irqdesc.h>
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 7f04c7d8296e..a98bcfc4be7b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 	} else {
 		switch (__irq_startup_managed(desc, aff, force)) {
 		case IRQ_STARTUP_NORMAL:
+			if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)
+				irq_setup_affinity(desc);
 			ret = __irq_startup(desc);
-			irq_setup_affinity(desc);
+			if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP))
+				irq_setup_affinity(desc);
 			break;
 		case IRQ_STARTUP_MANAGED:
 			irq_do_set_affinity(d, aff, false);
-- 
cgit v1.2.3


From 0e566c8f0f2e8325e35f6f97e13cde5356b41814 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Wed, 21 Jul 2021 17:26:47 +0300
Subject: virtio: Protect vqs list access

VQs may be accessed to mark the device broken while they are
created/destroyed. Hence protect the access to the vqs list.

Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues broken.")
Signed-off-by: Parav Pandit <parav@nvidia.com>
Link: https://lore.kernel.org/r/20210721142648.1525924-4-parav@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio.c      | 1 +
 drivers/virtio/virtio_ring.c | 8 ++++++++
 include/linux/virtio.h       | 1 +
 3 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 4b15c00c0a0a..49984d2cba24 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -355,6 +355,7 @@ int register_virtio_device(struct virtio_device *dev)
 	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 
 	INIT_LIST_HEAD(&dev->vqs);
+	spin_lock_init(&dev->vqs_list_lock);
 
 	/*
 	 * device_add() causes the bus infrastructure to look for a matching
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index d5934c2e5a89..c2aaa0eff6df 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1755,7 +1755,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
 			cpu_to_le16(vq->packed.event_flags_shadow);
 	}
 
+	spin_lock(&vdev->vqs_list_lock);
 	list_add_tail(&vq->vq.list, &vdev->vqs);
+	spin_unlock(&vdev->vqs_list_lock);
 	return &vq->vq;
 
 err_desc_extra:
@@ -2229,7 +2231,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 	memset(vq->split.desc_state, 0, vring.num *
 			sizeof(struct vring_desc_state_split));
 
+	spin_lock(&vdev->vqs_list_lock);
 	list_add_tail(&vq->vq.list, &vdev->vqs);
+	spin_unlock(&vdev->vqs_list_lock);
 	return &vq->vq;
 
 err_extra:
@@ -2291,7 +2295,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 
+	spin_lock(&vq->vq.vdev->vqs_list_lock);
 	list_del(&_vq->list);
+	spin_unlock(&vq->vq.vdev->vqs_list_lock);
 
 	if (vq->we_own_ring) {
 		if (vq->packed_ring) {
@@ -2386,12 +2392,14 @@ void virtio_break_device(struct virtio_device *dev)
 {
 	struct virtqueue *_vq;
 
+	spin_lock(&dev->vqs_list_lock);
 	list_for_each_entry(_vq, &dev->vqs, list) {
 		struct vring_virtqueue *vq = to_vvq(_vq);
 
 		/* Pairs with READ_ONCE() in virtqueue_is_broken(). */
 		WRITE_ONCE(vq->broken, true);
 	}
+	spin_unlock(&dev->vqs_list_lock);
 }
 EXPORT_SYMBOL_GPL(virtio_break_device);
 
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index b1894e0323fa..41edbc01ffa4 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -110,6 +110,7 @@ struct virtio_device {
 	bool config_enabled;
 	bool config_change_pending;
 	spinlock_t config_lock;
+	spinlock_t vqs_list_lock; /* Protects VQs list access */
 	struct device dev;
 	struct virtio_device_id id;
 	const struct virtio_config_ops *config;
-- 
cgit v1.2.3


From 45a687879b31caae4032abd1c2402e289d2b8083 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@nvidia.com>
Date: Tue, 10 Aug 2021 14:00:10 +0300
Subject: net: bridge: fix flags interpretation for extern learn fdb entries

Ignore fdb flags when adding port extern learn entries and always set
BR_FDB_LOCAL flag when adding bridge extern learn entries. This is
closest to the behaviour we had before and avoids breaking any use cases
which were allowed.

This patch fixes iproute2 calls which assume NUD_PERMANENT and were
allowed before, example:
$ bridge fdb add 00:11:22:33:44:55 dev swp1 extern_learn

Extern learn entries are allowed to roam, but do not expire, so static
or dynamic flags make no sense for them.

Also add a comment for future reference.

Fixes: eb100e0e24a2 ("net: bridge: allow to add externally learned entries from user-space")
Fixes: 0541a6293298 ("net: bridge: validate the NUD_PERMANENT bit when adding an extern_learn FDB entry")
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Tested-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://lore.kernel.org/r/20210810110010.43859-1-razor@blackwall.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/neighbour.h |  7 +++++--
 net/bridge/br.c                |  3 +--
 net/bridge/br_fdb.c            | 11 ++++-------
 net/bridge/br_private.h        |  2 +-
 4 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index dc8b72201f6c..00a60695fa53 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -66,8 +66,11 @@ enum {
 #define NUD_NONE	0x00
 
 /* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change
-   and make no address resolution or NUD.
-   NUD_PERMANENT also cannot be deleted by garbage collectors.
+ * and make no address resolution or NUD.
+ * NUD_PERMANENT also cannot be deleted by garbage collectors.
+ * When NTF_EXT_LEARNED is set for a bridge fdb entry the different cache entry
+ * states don't make sense and thus are ignored. Such entries don't age and
+ * can roam.
  */
 
 struct nda_cacheinfo {
diff --git a/net/bridge/br.c b/net/bridge/br.c
index bbab9984f24e..ef743f94254d 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -166,8 +166,7 @@ static int br_switchdev_event(struct notifier_block *unused,
 	case SWITCHDEV_FDB_ADD_TO_BRIDGE:
 		fdb_info = ptr;
 		err = br_fdb_external_learn_add(br, p, fdb_info->addr,
-						fdb_info->vid,
-						fdb_info->is_local, false);
+						fdb_info->vid, false);
 		if (err) {
 			err = notifier_from_errno(err);
 			break;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 835cec1e5a03..5dee30966ed3 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1044,10 +1044,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 					   "FDB entry towards bridge must be permanent");
 			return -EINVAL;
 		}
-
-		err = br_fdb_external_learn_add(br, p, addr, vid,
-						ndm->ndm_state & NUD_PERMANENT,
-						true);
+		err = br_fdb_external_learn_add(br, p, addr, vid, true);
 	} else {
 		spin_lock_bh(&br->hash_lock);
 		err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb);
@@ -1275,7 +1272,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
 }
 
 int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
-			      const unsigned char *addr, u16 vid, bool is_local,
+			      const unsigned char *addr, u16 vid,
 			      bool swdev_notify)
 {
 	struct net_bridge_fdb_entry *fdb;
@@ -1293,7 +1290,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 		if (swdev_notify)
 			flags |= BIT(BR_FDB_ADDED_BY_USER);
 
-		if (is_local)
+		if (!p)
 			flags |= BIT(BR_FDB_LOCAL);
 
 		fdb = fdb_create(br, p, addr, vid, flags);
@@ -1322,7 +1319,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
 		if (swdev_notify)
 			set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
 
-		if (is_local)
+		if (!p)
 			set_bit(BR_FDB_LOCAL, &fdb->flags);
 
 		if (modified)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aa64d8d63ca3..2b48b204205e 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -711,7 +711,7 @@ int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev,
 int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
 void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
 int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
-			      const unsigned char *addr, u16 vid, bool is_local,
+			      const unsigned char *addr, u16 vid,
 			      bool swdev_notify);
 int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
 			      const unsigned char *addr, u16 vid,
-- 
cgit v1.2.3


From c8d182bd387a09a8b95303c8086238e8bf61fcfc Mon Sep 17 00:00:00 2001
From: Xie Yongji <xieyongji@bytedance.com>
Date: Thu, 15 Jul 2021 16:00:26 +0800
Subject: vdpa: Add documentation for vdpa_alloc_device() macro

The return value of vdpa_alloc_device() macro is not very
clear, so that most of callers did the wrong check. Let's
add some comments to better document it.

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Link: https://lore.kernel.org/r/20210715080026.242-4-xieyongji@bytedance.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
---
 include/linux/vdpa.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 3357ac98878d..8cfe49d201dd 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -277,6 +277,17 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent,
 					const struct vdpa_config_ops *config,
 					size_t size, const char *name);
 
+/**
+ * vdpa_alloc_device - allocate and initilaize a vDPA device
+ *
+ * @dev_struct: the type of the parent structure
+ * @member: the name of struct vdpa_device within the @dev_struct
+ * @parent: the parent device
+ * @config: the bus operations that is supported by this device
+ * @name: name of the vdpa device
+ *
+ * Return allocated data structure or ERR_PTR upon error
+ */
 #define vdpa_alloc_device(dev_struct, member, parent, config, name)   \
 			  container_of(__vdpa_alloc_device( \
 				       parent, config, \
-- 
cgit v1.2.3


From ea2f6af16532511eb1cd8eb62845c37861f24ce8 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Tue, 10 Aug 2021 12:25:05 -0400
Subject: vringh: pull in spinlock header

we use a spinlock now pull in the correct header to
make vring.h self sufficient.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/vringh.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 84db7b8f912f..212892cf9822 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -14,6 +14,7 @@
 #include <linux/virtio_byteorder.h>
 #include <linux/uio.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #if IS_REACHABLE(CONFIG_VHOST_IOTLB)
 #include <linux/dma-direction.h>
 #include <linux/vhost_iotlb.h>
-- 
cgit v1.2.3


From 879753c816dbbdb2a9a395aa4448d29feee92d1a Mon Sep 17 00:00:00 2001
From: Eli Cohen <elic@nvidia.com>
Date: Wed, 11 Aug 2021 08:37:59 +0300
Subject: vdpa/mlx5: Fix queue type selection logic

get_queue_type() comments that splict virtqueue is preferred, however,
the actual logic preferred packed virtqueues. Since firmware has not
supported packed virtqueues we ended up using split virtqueues as was
desired.

Since we do not advertise support for packed virtqueues, we add a check
to verify split virtqueues are indeed supported.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen <elic@nvidia.com>
Link: https://lore.kernel.org/r/20210811053759.66752-1-elic@nvidia.com
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vdpa/mlx5/net/mlx5_vnet.c  | 14 ++++++++++----
 include/linux/mlx5/mlx5_ifc_vdpa.h | 10 ++++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 2a31467f7ac5..b1230fa2f5d1 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -753,12 +753,12 @@ static int get_queue_type(struct mlx5_vdpa_net *ndev)
 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
 
 	/* prefer split queue */
-	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)
-		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
+	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
+		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
 
-	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT));
+	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
 
-	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
+	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
 }
 
 static bool vq_is_tx(u16 idx)
@@ -2030,6 +2030,12 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name)
 		return -ENOSPC;
 
 	mdev = mgtdev->madev->mdev;
+	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
+	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
+		dev_warn(mdev->device, "missing support for split virtqueues\n");
+		return -EOPNOTSUPP;
+	}
+
 	/* we save one virtqueue for control virtqueue should we require it */
 	max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues);
 	max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS);
diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h
index 98b56b75c625..1a9c9d94cb59 100644
--- a/include/linux/mlx5/mlx5_ifc_vdpa.h
+++ b/include/linux/mlx5/mlx5_ifc_vdpa.h
@@ -11,13 +11,15 @@ enum {
 };
 
 enum {
-	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT   = 0x1, // do I check this caps?
-	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED  = 0x2,
+	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT   = 0,
+	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED  = 1,
 };
 
 enum {
-	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT   = 0,
-	MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED  = 1,
+	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT =
+		BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT),
+	MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED =
+		BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED),
 };
 
 struct mlx5_ifc_virtio_q_bits {
-- 
cgit v1.2.3


From 848378812e40152abe9b9baf58ce2004f76fb988 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Fri, 30 Jul 2021 19:31:08 -0700
Subject: vmlinux.lds.h: Handle clang's module.{c,d}tor sections

A recent change in LLVM causes module_{c,d}tor sections to appear when
CONFIG_K{A,C}SAN are enabled, which results in orphan section warnings
because these are not handled anywhere:

ld.lld: warning: arch/x86/pci/built-in.a(legacy.o):(.text.asan.module_ctor) is being placed in '.text.asan.module_ctor'
ld.lld: warning: arch/x86/pci/built-in.a(legacy.o):(.text.asan.module_dtor) is being placed in '.text.asan.module_dtor'
ld.lld: warning: arch/x86/pci/built-in.a(legacy.o):(.text.tsan.module_ctor) is being placed in '.text.tsan.module_ctor'

Fangrui explains: "the function asan.module_ctor has the SHF_GNU_RETAIN
flag, so it is in a separate section even with -fno-function-sections
(default)".

Place them in the TEXT_TEXT section so that these technologies continue
to work with the newer compiler versions. All of the KASAN and KCSAN
KUnit tests continue to pass after this change.

Cc: stable@vger.kernel.org
Link: https://github.com/ClangBuiltLinux/linux/issues/1432
Link: https://github.com/llvm/llvm-project/commit/7b789562244ee941b7bf2cefeb3fc08a59a01865
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Fangrui Song <maskray@google.com>
Acked-by: Marco Elver <elver@google.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20210731023107.1932981-1-nathan@kernel.org
---
 include/asm-generic/vmlinux.lds.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 17325416e2de..62669b36a772 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -586,6 +586,7 @@
 		NOINSTR_TEXT						\
 		*(.text..refcount)					\
 		*(.ref.text)						\
+		*(.text.asan.* .text.tsan.*)				\
 		TEXT_CFI_JT						\
 	MEM_KEEP(init.text*)						\
 	MEM_KEEP(exit.text*)						\
-- 
cgit v1.2.3


From b69dd5b3780a7298bd893816a09da751bc0636f7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 11 Aug 2021 12:57:15 -0700
Subject: net: igmp: increase size of mr_ifc_count

Some arches support cmpxchg() on 4-byte and 8-byte only.
Increase mr_ifc_count width to 32bit to fix this problem.

Fixes: 4a2b285e7e10 ("net: igmp: fix data-race in igmp_ifc_timer_expire()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20210811195715.3684218-1-eric.dumazet@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/inetdevice.h | 2 +-
 net/ipv4/igmp.c            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 53aa0343bf69..aaf4f1b4c277 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -41,7 +41,7 @@ struct in_device {
 	unsigned long		mr_qri;		/* Query Response Interval */
 	unsigned char		mr_qrv;		/* Query Robustness Variable */
 	unsigned char		mr_gq_running;
-	unsigned char		mr_ifc_count;
+	u32			mr_ifc_count;
 	struct timer_list	mr_gq_timer;	/* general query timer */
 	struct timer_list	mr_ifc_timer;	/* interface change timer */
 
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a51360087b19..00576bae183d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -803,7 +803,7 @@ static void igmp_gq_timer_expire(struct timer_list *t)
 static void igmp_ifc_timer_expire(struct timer_list *t)
 {
 	struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
-	u8 mr_ifc_count;
+	u32 mr_ifc_count;
 
 	igmpv3_send_cr(in_dev);
 restart:
-- 
cgit v1.2.3


From 7a3dc4f35bf8e1a07e5c3f8ecc8ac923f48493fe Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 13 Aug 2021 12:36:14 +0200
Subject: driver core: Add missing kernel doc for device::msi_lock

Fixes: 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/device.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index e53aa5065f58..65d84b67b024 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -407,6 +407,7 @@ struct dev_links_info {
  * @em_pd:	device's energy model performance domain
  * @pins:	For device pin management.
  *		See Documentation/driver-api/pin-control.rst for details.
+ * @msi_lock:	Lock to protect MSI mask cache and mask register
  * @msi_list:	Hosts MSI descriptors
  * @msi_domain: The generic MSI domain this device is using.
  * @numa_node:	NUMA node this device is close to.
-- 
cgit v1.2.3


From 3b844826b6c6affa80755254da322b017358a2f4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 5 Aug 2021 10:04:43 -0700
Subject: pipe: avoid unnecessary EPOLLET wakeups under normal loads

I had forgotten just how sensitive hackbench is to extra pipe wakeups,
and commit 3a34b13a88ca ("pipe: make pipe writes always wake up
readers") ended up causing a quite noticeable regression on larger
machines.

Now, hackbench isn't necessarily a hugely meaningful benchmark, and it's
not clear that this matters in real life all that much, but as Mel
points out, it's used often enough when comparing kernels and so the
performance regression shows up like a sore thumb.

It's easy enough to fix at least for the common cases where pipes are
used purely for data transfer, and you never have any exciting poll
usage at all.  So set a special 'poll_usage' flag when there is polling
activity, and make the ugly "EPOLLET has crazy legacy expectations"
semantics explicit to only that case.

I would love to limit it to just the broken EPOLLET case, but the pipe
code can't see the difference between epoll and regular select/poll, so
any non-read/write waiting will trigger the extra wakeup behavior.  That
is sufficient for at least the hackbench case.

Apart from making the odd extra wakeup cases more explicitly about
EPOLLET, this also makes the extra wakeup be at the _end_ of the pipe
write, not at the first write chunk.  That is actually much saner
semantics (as much as you can call any of the legacy edge-triggered
expectations for EPOLLET "sane") since it means that you know the wakeup
will happen once the write is done, rather than possibly in the middle
of one.

[ For stable people: I'm putting a "Fixes" tag on this, but I leave it
  up to you to decide whether you actually want to backport it or not.
  It likely has no impact outside of synthetic benchmarks  - Linus ]

Link: https://lore.kernel.org/lkml/20210802024945.GA8372@xsang-OptiPlex-9020/
Fixes: 3a34b13a88ca ("pipe: make pipe writes always wake up readers")
Reported-by: kernel test robot <oliver.sang@intel.com>
Tested-by: Sandeep Patil <sspatil@android.com>
Tested-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c                 | 15 +++++++++------
 include/linux/pipe_fs_i.h |  2 ++
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/pipe.c b/fs/pipe.c
index 8e6ef62aeb1c..678dee2a8228 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -444,9 +444,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 #endif
 
 	/*
-	 * Epoll nonsensically wants a wakeup whether the pipe
-	 * was already empty or not.
-	 *
 	 * If it wasn't empty we try to merge new data into
 	 * the last buffer.
 	 *
@@ -455,9 +452,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	 * spanning multiple pages.
 	 */
 	head = pipe->head;
-	was_empty = true;
+	was_empty = pipe_empty(head, pipe->tail);
 	chars = total_len & (PAGE_SIZE-1);
-	if (chars && !pipe_empty(head, pipe->tail)) {
+	if (chars && !was_empty) {
 		unsigned int mask = pipe->ring_size - 1;
 		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
 		int offset = buf->offset + buf->len;
@@ -590,8 +587,11 @@ out:
 	 * This is particularly important for small writes, because of
 	 * how (for example) the GNU make jobserver uses small writes to
 	 * wake up pending jobs
+	 *
+	 * Epoll nonsensically wants a wakeup whether the pipe
+	 * was already empty or not.
 	 */
-	if (was_empty) {
+	if (was_empty || pipe->poll_usage) {
 		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
@@ -654,6 +654,9 @@ pipe_poll(struct file *filp, poll_table *wait)
 	struct pipe_inode_info *pipe = filp->private_data;
 	unsigned int head, tail;
 
+	/* Epoll has some historical nasty semantics, this enables them */
+	pipe->poll_usage = 1;
+
 	/*
 	 * Reading pipe state only -- no need for acquiring the semaphore.
 	 *
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 5d2705f1d01c..fc5642431b92 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -48,6 +48,7 @@ struct pipe_buffer {
  *	@files: number of struct file referring this pipe (protected by ->i_lock)
  *	@r_counter: reader counter
  *	@w_counter: writer counter
+ *	@poll_usage: is this pipe used for epoll, which has crazy wakeups?
  *	@fasync_readers: reader side fasync
  *	@fasync_writers: writer side fasync
  *	@bufs: the circular array of pipe buffers
@@ -70,6 +71,7 @@ struct pipe_inode_info {
 	unsigned int files;
 	unsigned int r_counter;
 	unsigned int w_counter;
+	unsigned int poll_usage;
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;
-- 
cgit v1.2.3


From fa05bdb89b01b098aad19ec0ebc4d1cc7b11177e Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 19 Aug 2021 13:58:42 +0300
Subject: Revert "flow_offload: action should not be NULL when it is
 referenced"

This reverts commit 9ea3e52c5bc8bb4a084938dc1e3160643438927a.

Cited commit added a check to make sure 'action' is not NULL, but
'action' is already dereferenced before the check, when calling
flow_offload_has_one_action().

Therefore, the check does not make any sense and results in a smatch
warning:

include/net/flow_offload.h:322 flow_action_mixed_hw_stats_check() warn:
variable dereferenced before check 'action' (see line 319)

Fix by reverting this commit.

Cc: gushengxian <gushengxian@yulong.com>
Fixes: 9ea3e52c5bc8 ("flow_offload: action should not be NULL when it is referenced")
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Link: https://lore.kernel.org/r/20210819105842.1315705-1-idosch@idosch.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/flow_offload.h | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index f3c2841566a0..1b9d75aedb22 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -319,14 +319,12 @@ flow_action_mixed_hw_stats_check(const struct flow_action *action,
 	if (flow_offload_has_one_action(action))
 		return true;
 
-	if (action) {
-		flow_action_for_each(i, action_entry, action) {
-			if (i && action_entry->hw_stats != last_hw_stats) {
-				NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported");
-				return false;
-			}
-			last_hw_stats = action_entry->hw_stats;
+	flow_action_for_each(i, action_entry, action) {
+		if (i && action_entry->hw_stats != last_hw_stats) {
+			NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported");
+			return false;
 		}
+		last_hw_stats = action_entry->hw_stats;
 	}
 	return true;
 }
-- 
cgit v1.2.3


From b16ee0f9ed79fca2f2c31b13cac2ab9cf543525a Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Thu, 19 Aug 2021 19:04:15 -0700
Subject: mmflags.h: add missing __GFP_ZEROTAGS and __GFP_SKIP_KASAN_POISON
 names

printk("%pGg") outputs these two flags as hexadecimal number, rather
than as a string, e.g:

	GFP_KERNEL|0x1800000

Fix this by adding missing names of __GFP_ZEROTAGS and
__GFP_SKIP_KASAN_POISON flags to __def_gfpflag_names.

Link: https://lkml.kernel.org/r/20210816133502.590-1-rppt@kernel.org
Fixes: 013bb59dbb7c ("arm64: mte: handle tags zeroing at page allocation time")
Fixes: c275c5c6d50a ("kasan: disable freed user page poisoning with HW tags")
Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/mmflags.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 390270e00a1d..f160484afc5c 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -48,7 +48,9 @@
 	{(unsigned long)__GFP_WRITE,		"__GFP_WRITE"},		\
 	{(unsigned long)__GFP_RECLAIM,		"__GFP_RECLAIM"},	\
 	{(unsigned long)__GFP_DIRECT_RECLAIM,	"__GFP_DIRECT_RECLAIM"},\
-	{(unsigned long)__GFP_KSWAPD_RECLAIM,	"__GFP_KSWAPD_RECLAIM"}\
+	{(unsigned long)__GFP_KSWAPD_RECLAIM,	"__GFP_KSWAPD_RECLAIM"},\
+	{(unsigned long)__GFP_ZEROTAGS,		"__GFP_ZEROTAGS"},	\
+	{(unsigned long)__GFP_SKIP_KASAN_POISON,"__GFP_SKIP_KASAN_POISON"}\
 
 #define show_gfp_flags(flags)						\
 	(flags) ? __print_flags(flags, "|",				\
-- 
cgit v1.2.3


From f56ce412a59d7d938b81de8878faef128812482c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Thu, 19 Aug 2021 19:04:21 -0700
Subject: mm: memcontrol: fix occasional OOMs due to proportional memory.low
 reclaim

We've noticed occasional OOM killing when memory.low settings are in
effect for cgroups.  This is unexpected and undesirable as memory.low is
supposed to express non-OOMing memory priorities between cgroups.

The reason for this is proportional memory.low reclaim.  When cgroups
are below their memory.low threshold, reclaim passes them over in the
first round, and then retries if it couldn't find pages anywhere else.
But when cgroups are slightly above their memory.low setting, page scan
force is scaled down and diminished in proportion to the overage, to the
point where it can cause reclaim to fail as well - only in that case we
currently don't retry, and instead trigger OOM.

To fix this, hook proportional reclaim into the same retry logic we have
in place for when cgroups are skipped entirely.  This way if reclaim
fails and some cgroups were scanned with diminished pressure, we'll try
another full-force cycle before giving up and OOMing.

[akpm@linux-foundation.org: coding-style fixes]

Link: https://lkml.kernel.org/r/20210817180506.220056-1-hannes@cmpxchg.org
Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Leon Yang <lnyng@fb.com>
Reviewed-by: Rik van Riel <riel@surriel.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>		[5.4+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 29 +++++++++++++++--------------
 mm/vmscan.c                | 27 +++++++++++++++++++--------
 2 files changed, 34 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe5c486f4ad..24797929d8a1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -612,12 +612,15 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
-						  struct mem_cgroup *memcg,
-						  bool in_low_reclaim)
+static inline void mem_cgroup_protection(struct mem_cgroup *root,
+					 struct mem_cgroup *memcg,
+					 unsigned long *min,
+					 unsigned long *low)
 {
+	*min = *low = 0;
+
 	if (mem_cgroup_disabled())
-		return 0;
+		return;
 
 	/*
 	 * There is no reclaim protection applied to a targeted reclaim.
@@ -653,13 +656,10 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
 	 *
 	 */
 	if (root == memcg)
-		return 0;
-
-	if (in_low_reclaim)
-		return READ_ONCE(memcg->memory.emin);
+		return;
 
-	return max(READ_ONCE(memcg->memory.emin),
-		   READ_ONCE(memcg->memory.elow));
+	*min = READ_ONCE(memcg->memory.emin);
+	*low = READ_ONCE(memcg->memory.elow);
 }
 
 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
@@ -1147,11 +1147,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
 {
 }
 
-static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root,
-						  struct mem_cgroup *memcg,
-						  bool in_low_reclaim)
+static inline void mem_cgroup_protection(struct mem_cgroup *root,
+					 struct mem_cgroup *memcg,
+					 unsigned long *min,
+					 unsigned long *low)
 {
-	return 0;
+	*min = *low = 0;
 }
 
 static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4620df62f0ff..b0202ab5e136 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -100,9 +100,12 @@ struct scan_control {
 	unsigned int may_swap:1;
 
 	/*
-	 * Cgroups are not reclaimed below their configured memory.low,
-	 * unless we threaten to OOM. If any cgroups are skipped due to
-	 * memory.low and nothing was reclaimed, go back for memory.low.
+	 * Cgroup memory below memory.low is protected as long as we
+	 * don't threaten to OOM. If any cgroup is reclaimed at
+	 * reduced force or passed over entirely due to its memory.low
+	 * setting (memcg_low_skipped), and nothing is reclaimed as a
+	 * result, then go back for one more cycle that reclaims the protected
+	 * memory (memcg_low_reclaim) to avert OOM.
 	 */
 	unsigned int memcg_low_reclaim:1;
 	unsigned int memcg_low_skipped:1;
@@ -2537,15 +2540,14 @@ out:
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
 		unsigned long lruvec_size;
+		unsigned long low, min;
 		unsigned long scan;
-		unsigned long protection;
 
 		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
-		protection = mem_cgroup_protection(sc->target_mem_cgroup,
-						   memcg,
-						   sc->memcg_low_reclaim);
+		mem_cgroup_protection(sc->target_mem_cgroup, memcg,
+				      &min, &low);
 
-		if (protection) {
+		if (min || low) {
 			/*
 			 * Scale a cgroup's reclaim pressure by proportioning
 			 * its current usage to its memory.low or memory.min
@@ -2576,6 +2578,15 @@ out:
 			 * hard protection.
 			 */
 			unsigned long cgroup_size = mem_cgroup_size(memcg);
+			unsigned long protection;
+
+			/* memory.low scaling, make sure we retry before OOM */
+			if (!sc->memcg_low_reclaim && low > min) {
+				protection = low;
+				sc->memcg_low_skipped = 1;
+			} else {
+				protection = min;
+			}
 
 			/* Avoid TOCTOU with earlier protection check */
 			cgroup_size = max(cgroup_size, protection);
-- 
cgit v1.2.3


From a7cb5d23eaea148f8582229846f8dfff192f05c3 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 19 Aug 2021 19:04:30 -0700
Subject: kfence: fix is_kfence_address() for addresses below KFENCE_POOL_SIZE

Originally the addr != NULL check was meant to take care of the case
where __kfence_pool == NULL (KFENCE is disabled).  However, this does
not work for addresses where addr > 0 && addr < KFENCE_POOL_SIZE.

This can be the case on NULL-deref where addr > 0 && addr < PAGE_SIZE or
any other faulting access with addr < KFENCE_POOL_SIZE.  While the
kernel would likely crash, the stack traces and report might be
confusing due to double faults upon KFENCE's attempt to unprotect such
an address.

Fix it by just checking that __kfence_pool != NULL instead.

Link: https://lkml.kernel.org/r/20210818130300.2482437-1-elver@google.com
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Marco Elver <elver@google.com>
Reported-by: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
Acked-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <stable@vger.kernel.org>    [5.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kfence.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfence.h b/include/linux/kfence.h
index a70d1ea03532..3fe6dd8a18c1 100644
--- a/include/linux/kfence.h
+++ b/include/linux/kfence.h
@@ -51,10 +51,11 @@ extern atomic_t kfence_allocation_gate;
 static __always_inline bool is_kfence_address(const void *addr)
 {
 	/*
-	 * The non-NULL check is required in case the __kfence_pool pointer was
-	 * never initialized; keep it in the slow-path after the range-check.
+	 * The __kfence_pool != NULL check is required to deal with the case
+	 * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in
+	 * the slow-path after the range-check!
 	 */
-	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+	return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool);
 }
 
 /**
-- 
cgit v1.2.3


From a4ad42d28618eef83bee02e0a19af0d467bd9722 Mon Sep 17 00:00:00 2001
From: Kareem Shaik <kareem.m.shaik@intel.com>
Date: Wed, 18 Aug 2021 09:57:39 +0200
Subject: ASoC: Intel: Skylake: Support multiple format configs

A module can have two kinds of set params, as per topology requirements.
For example, one pre-init and one post-init. But currently, there is
support for just one type, as the format_config.

This patch extends the format_configs to 4, so as to be able to support
pre-init, post-init and post-bind type of set params, for the same
module, simultaneously.

Signed-off-by: Kareem Shaik <kareem.m.shaik@intel.com>
Signed-off-by: Gustaw Lewandowski <gustaw.lewandowski@linux.intel.com>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Tested-by: Lukasz Majczak <lma@semihalf.com>
Link: https://lore.kernel.org/r/20210818075742.1515155-9-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/snd_sst_tokens.h    |  6 ++-
 sound/soc/intel/skylake/skl-messages.c | 19 ++++---
 sound/soc/intel/skylake/skl-topology.c | 99 ++++++++++++++++++++++------------
 sound/soc/intel/skylake/skl-topology.h |  5 +-
 4 files changed, 82 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/snd_sst_tokens.h b/include/uapi/sound/snd_sst_tokens.h
index 8ba0112e5336..ff3748e9308a 100644
--- a/include/uapi/sound/snd_sst_tokens.h
+++ b/include/uapi/sound/snd_sst_tokens.h
@@ -233,6 +233,8 @@
  *
  * %SKL_TKN_U32_ASTATE_CLK_SRC: Clock source for A-State entry
  *
+ * %SKL_TKN_U32_FMT_CFG_IDX:    Format config index
+ *
  * module_id and loadable flags dont have tokens as these values will be
  * read from the DSP FW manifest
  *
@@ -324,7 +326,9 @@ enum SKL_TKNS {
 	SKL_TKN_U32_ASTATE_COUNT,
 	SKL_TKN_U32_ASTATE_KCPS,
 	SKL_TKN_U32_ASTATE_CLK_SRC,
-	SKL_TKN_MAX = SKL_TKN_U32_ASTATE_CLK_SRC,
+
+	SKL_TKN_U32_FMT_CFG_IDX = 96,
+	SKL_TKN_MAX = SKL_TKN_U32_FMT_CFG_IDX,
 };
 
 #endif
diff --git a/sound/soc/intel/skylake/skl-messages.c b/sound/soc/intel/skylake/skl-messages.c
index 79c6cf2c14bf..7257acf0b6ee 100644
--- a/sound/soc/intel/skylake/skl-messages.c
+++ b/sound/soc/intel/skylake/skl-messages.c
@@ -479,15 +479,15 @@ static void skl_set_base_module_format(struct skl_dev *skl,
 static void skl_copy_copier_caps(struct skl_module_cfg *mconfig,
 				struct skl_cpr_cfg *cpr_mconfig)
 {
-	if (mconfig->formats_config.caps_size == 0)
+	if (mconfig->formats_config[SKL_PARAM_INIT].caps_size == 0)
 		return;
 
 	memcpy(cpr_mconfig->gtw_cfg.config_data,
-			mconfig->formats_config.caps,
-			mconfig->formats_config.caps_size);
+			mconfig->formats_config[SKL_PARAM_INIT].caps,
+			mconfig->formats_config[SKL_PARAM_INIT].caps_size);
 
 	cpr_mconfig->gtw_cfg.config_length =
-			(mconfig->formats_config.caps_size) / 4;
+			(mconfig->formats_config[SKL_PARAM_INIT].caps_size) / 4;
 }
 
 #define SKL_NON_GATEWAY_CPR_NODE_ID 0xFFFFFFFF
@@ -749,13 +749,12 @@ static void skl_set_algo_format(struct skl_dev *skl,
 	struct skl_base_cfg *base_cfg = (struct skl_base_cfg *)algo_mcfg;
 
 	skl_set_base_module_format(skl, mconfig, base_cfg);
-
-	if (mconfig->formats_config.caps_size == 0)
+	if (mconfig->formats_config[SKL_PARAM_INIT].caps_size == 0)
 		return;
 
 	memcpy(algo_mcfg->params,
-			mconfig->formats_config.caps,
-			mconfig->formats_config.caps_size);
+			mconfig->formats_config[SKL_PARAM_INIT].caps,
+			mconfig->formats_config[SKL_PARAM_INIT].caps_size);
 
 }
 
@@ -786,7 +785,7 @@ static u16 skl_get_module_param_size(struct skl_dev *skl,
 	switch (mconfig->m_type) {
 	case SKL_MODULE_TYPE_COPIER:
 		param_size = sizeof(struct skl_cpr_cfg);
-		param_size += mconfig->formats_config.caps_size;
+		param_size += mconfig->formats_config[SKL_PARAM_INIT].caps_size;
 		return param_size;
 
 	case SKL_MODULE_TYPE_SRCINT:
@@ -797,7 +796,7 @@ static u16 skl_get_module_param_size(struct skl_dev *skl,
 
 	case SKL_MODULE_TYPE_ALGO:
 		param_size = sizeof(struct skl_base_cfg);
-		param_size += mconfig->formats_config.caps_size;
+		param_size += mconfig->formats_config[SKL_PARAM_INIT].caps_size;
 		return param_size;
 
 	case SKL_MODULE_TYPE_BASE_OUTFMT:
diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c
index 51fb594bb172..04fbf83f3835 100644
--- a/sound/soc/intel/skylake/skl-topology.c
+++ b/sound/soc/intel/skylake/skl-topology.c
@@ -292,7 +292,7 @@ static int skl_tplg_update_be_blob(struct snd_soc_dapm_widget *w,
 	struct skl_module_iface *m_iface = &m_cfg->module->formats[fmt_idx];
 
 	/* check if we already have blob */
-	if (m_cfg->formats_config.caps_size > 0)
+	if (m_cfg->formats_config[SKL_PARAM_INIT].caps_size > 0)
 		return 0;
 
 	dev_dbg(skl->dev, "Applying default cfg blob\n");
@@ -328,8 +328,8 @@ static int skl_tplg_update_be_blob(struct snd_soc_dapm_widget *w,
 	cfg = skl_get_ep_blob(skl, m_cfg->vbus_id, link_type,
 					s_fmt, ch, s_freq, dir, dev_type);
 	if (cfg) {
-		m_cfg->formats_config.caps_size = cfg->size;
-		m_cfg->formats_config.caps = (u32 *) &cfg->caps;
+		m_cfg->formats_config[SKL_PARAM_INIT].caps_size = cfg->size;
+		m_cfg->formats_config[SKL_PARAM_INIT].caps = (u32 *)&cfg->caps;
 	} else {
 		dev_err(skl->dev, "Blob NULL for id %x type %d dirn %d\n",
 					m_cfg->vbus_id, link_type, dir);
@@ -386,9 +386,9 @@ static int skl_tplg_set_module_params(struct snd_soc_dapm_widget *w,
 	struct skl_algo_data *bc;
 	struct skl_specific_cfg *sp_cfg;
 
-	if (mconfig->formats_config.caps_size > 0 &&
-		mconfig->formats_config.set_params == SKL_PARAM_SET) {
-		sp_cfg = &mconfig->formats_config;
+	if (mconfig->formats_config[SKL_PARAM_SET].caps_size > 0 &&
+	    mconfig->formats_config[SKL_PARAM_SET].set_params == SKL_PARAM_SET) {
+		sp_cfg = &mconfig->formats_config[SKL_PARAM_SET];
 		ret = skl_set_module_params(skl, sp_cfg->caps,
 					sp_cfg->caps_size,
 					sp_cfg->param_id, mconfig);
@@ -438,8 +438,10 @@ static int skl_tplg_set_module_init_data(struct snd_soc_dapm_widget *w)
 			if (bc->set_params != SKL_PARAM_INIT)
 				continue;
 
-			mconfig->formats_config.caps = (u32 *)bc->params;
-			mconfig->formats_config.caps_size = bc->size;
+			mconfig->formats_config[SKL_PARAM_INIT].caps =
+							(u32 *)bc->params;
+			mconfig->formats_config[SKL_PARAM_INIT].caps_size =
+								bc->size;
 
 			break;
 		}
@@ -798,9 +800,10 @@ static int skl_tplg_set_module_bind_params(struct snd_soc_dapm_widget *w,
 			return 0;
 	}
 
-	if (mconfig->formats_config.caps_size > 0 &&
-		mconfig->formats_config.set_params == SKL_PARAM_BIND) {
-		sp_cfg = &mconfig->formats_config;
+	if (mconfig->formats_config[SKL_PARAM_BIND].caps_size > 0 &&
+	    mconfig->formats_config[SKL_PARAM_BIND].set_params ==
+								SKL_PARAM_BIND) {
+		sp_cfg = &mconfig->formats_config[SKL_PARAM_BIND];
 		ret = skl_set_module_params(skl, sp_cfg->caps,
 					sp_cfg->caps_size,
 					sp_cfg->param_id, mconfig);
@@ -1496,7 +1499,8 @@ static int skl_tplg_mic_control_get(struct snd_kcontrol *kcontrol,
 static int skl_fill_mic_sel_params(struct skl_module_cfg *mconfig,
 	struct skl_mic_sel_config *mic_cfg, struct device *dev)
 {
-	struct skl_specific_cfg *sp_cfg = &mconfig->formats_config;
+	struct skl_specific_cfg *sp_cfg =
+				&mconfig->formats_config[SKL_PARAM_INIT];
 
 	sp_cfg->caps_size = sizeof(struct skl_mic_sel_config);
 	sp_cfg->set_params = SKL_PARAM_SET;
@@ -1849,8 +1853,8 @@ static int skl_tplg_be_fill_pipe_params(struct snd_soc_dai *dai,
 					pipe_fmt->freq, pipe->direction,
 					dev_type);
 	if (cfg) {
-		mconfig->formats_config.caps_size = cfg->size;
-		mconfig->formats_config.caps = (u32 *) &cfg->caps;
+		mconfig->formats_config[SKL_PARAM_INIT].caps_size = cfg->size;
+		mconfig->formats_config[SKL_PARAM_INIT].caps = (u32 *)&cfg->caps;
 	} else {
 		dev_err(dai->dev, "Blob NULL for id:%d type:%d dirn:%d ch:%d, freq:%d, fmt:%d\n",
 			mconfig->vbus_id, link_type, params->stream,
@@ -2567,19 +2571,26 @@ static int skl_tplg_get_token(struct device *dev,
 
 		break;
 
+	case SKL_TKN_U32_FMT_CFG_IDX:
+		if (tkn_elem->value > SKL_MAX_PARAMS_TYPES)
+			return -EINVAL;
+
+		mconfig->fmt_cfg_idx = tkn_elem->value;
+		break;
+
 	case SKL_TKN_U32_CAPS_SIZE:
-		mconfig->formats_config.caps_size =
+		mconfig->formats_config[mconfig->fmt_cfg_idx].caps_size =
 			tkn_elem->value;
 
 		break;
 
 	case SKL_TKN_U32_CAPS_SET_PARAMS:
-		mconfig->formats_config.set_params =
+		mconfig->formats_config[mconfig->fmt_cfg_idx].set_params =
 				tkn_elem->value;
 		break;
 
 	case SKL_TKN_U32_CAPS_PARAMS_ID:
-		mconfig->formats_config.param_id =
+		mconfig->formats_config[mconfig->fmt_cfg_idx].param_id =
 				tkn_elem->value;
 		break;
 
@@ -2793,6 +2804,7 @@ static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w,
 	struct skl_dfw_v4_module *dfw =
 				(struct skl_dfw_v4_module *)tplg_w->priv.data;
 	int ret;
+	int idx = mconfig->fmt_cfg_idx;
 
 	dev_dbg(dev, "Parsing Skylake v4 widget topology data\n");
 
@@ -2826,7 +2838,7 @@ static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w,
 	mconfig->dev_type = dfw->dev_type;
 	mconfig->hw_conn_type = dfw->hw_conn_type;
 	mconfig->time_slot = dfw->time_slot;
-	mconfig->formats_config.caps_size = dfw->caps.caps_size;
+	mconfig->formats_config[idx].caps_size = dfw->caps.caps_size;
 
 	mconfig->m_in_pin = devm_kcalloc(dev,
 				MAX_IN_QUEUE, sizeof(*mconfig->m_in_pin),
@@ -2847,21 +2859,39 @@ static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w,
 				    dfw->is_dynamic_out_pin,
 				    mconfig->module->max_output_pins);
 
-	if (mconfig->formats_config.caps_size) {
-		mconfig->formats_config.set_params = dfw->caps.set_params;
-		mconfig->formats_config.param_id = dfw->caps.param_id;
-		mconfig->formats_config.caps =
-		devm_kzalloc(dev, mconfig->formats_config.caps_size,
+	if (mconfig->formats_config[idx].caps_size) {
+		mconfig->formats_config[idx].set_params = dfw->caps.set_params;
+		mconfig->formats_config[idx].param_id = dfw->caps.param_id;
+		mconfig->formats_config[idx].caps =
+		devm_kzalloc(dev, mconfig->formats_config[idx].caps_size,
 			     GFP_KERNEL);
-		if (!mconfig->formats_config.caps)
+		if (!mconfig->formats_config[idx].caps)
 			return -ENOMEM;
-		memcpy(mconfig->formats_config.caps, dfw->caps.caps,
+		memcpy(mconfig->formats_config[idx].caps, dfw->caps.caps,
 		       dfw->caps.caps_size);
 	}
 
 	return 0;
 }
 
+static int skl_tplg_get_caps_data(struct device *dev, char *data,
+				  struct skl_module_cfg *mconfig)
+{
+	int idx = mconfig->fmt_cfg_idx;
+
+	if (mconfig->formats_config[idx].caps_size > 0) {
+		mconfig->formats_config[idx].caps =
+			devm_kzalloc(dev, mconfig->formats_config[idx].caps_size,
+				     GFP_KERNEL);
+		if (!mconfig->formats_config[idx].caps)
+			return -ENOMEM;
+		memcpy(mconfig->formats_config[idx].caps, data,
+		       mconfig->formats_config[idx].caps_size);
+	}
+
+	return mconfig->formats_config[idx].caps_size;
+}
+
 /*
  * Parse the private data for the token and corresponding value.
  * The private data can have multiple data blocks. So, a data block
@@ -2922,18 +2952,14 @@ static int skl_tplg_get_pvt_data(struct snd_soc_tplg_dapm_widget *tplg_w,
 		if (block_type == SKL_TYPE_TUPLE) {
 			ret = skl_tplg_get_tokens(dev, data,
 					skl, mconfig, block_size);
-
-			if (ret < 0)
-				return ret;
-
-			--num_blocks;
 		} else {
-			if (mconfig->formats_config.caps_size > 0)
-				memcpy(mconfig->formats_config.caps, data,
-					mconfig->formats_config.caps_size);
-			--num_blocks;
-			ret = mconfig->formats_config.caps_size;
+			ret = skl_tplg_get_caps_data(dev, data, mconfig);
 		}
+
+		if (ret < 0)
+			return ret;
+
+		--num_blocks;
 		off += ret;
 	}
 
@@ -3024,6 +3050,9 @@ static int skl_tplg_widget_load(struct snd_soc_component *cmpnt, int index,
 	 */
 	mconfig->id.module_id = -1;
 
+	/* To provide backward compatibility, set default as SKL_PARAM_INIT */
+	mconfig->fmt_cfg_idx = SKL_PARAM_INIT;
+
 	/* Parse private data for tuples */
 	ret = skl_tplg_get_pvt_data(tplg_w, skl, bus->dev, mconfig);
 	if (ret < 0)
diff --git a/sound/soc/intel/skylake/skl-topology.h b/sound/soc/intel/skylake/skl-topology.h
index ad230fd64758..ef332c83d5c0 100644
--- a/sound/soc/intel/skylake/skl-topology.h
+++ b/sound/soc/intel/skylake/skl-topology.h
@@ -81,6 +81,8 @@ enum skl_s_freq {
 	SKL_FS_INVALID
 };
 
+#define SKL_MAX_PARAMS_TYPES	4
+
 enum skl_widget_type {
 	SKL_WIDGET_VMIXER = 1,
 	SKL_WIDGET_MIXER = 2,
@@ -371,6 +373,7 @@ struct skl_module_cfg {
 	struct skl_module *module;
 	int res_idx;
 	int fmt_idx;
+	int fmt_cfg_idx;
 	u8 domain;
 	bool homogenous_inputs;
 	bool homogenous_outputs;
@@ -401,7 +404,7 @@ struct skl_module_cfg {
 	enum skl_hw_conn_type  hw_conn_type;
 	enum skl_module_state m_state;
 	struct skl_pipe *pipe;
-	struct skl_specific_cfg formats_config;
+	struct skl_specific_cfg formats_config[SKL_MAX_PARAMS_TYPES];
 	struct skl_pipe_mcfg mod_cfg[SKL_MAX_MODULES_IN_PIPE];
 };
 
-- 
cgit v1.2.3


From 6479f7588651cbc9c91e61c20ff39119cbc8feba Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 17 Aug 2021 11:40:54 -0500
Subject: ASoC: soc-pcm: test refcount before triggering

On start/pause_release/resume, when more than one FE is connected to
the same BE, it's possible that the trigger is sent more than
once. This is not desirable, we only want to trigger a BE once, which
is straightforward to implement with a refcount.

For stop/pause/suspend, the problem is more complicated: the check
implemented in snd_soc_dpcm_can_be_free_stop() may fail due to a
conceptual deadlock when we trigger the BE before the FE. In this
case, the FE states have not yet changed, so there are corner cases
where the TRIGGER_STOP is never sent - the dual case of start where
multiple triggers might be sent.

This patch suggests an unconditional trigger in all cases, without
checking the FE states, using a refcount protected by a spinlock.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Message-Id: <20210817164054.250028-3-pierre-louis.bossart@linux.intel.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dpcm.h |  2 ++
 sound/soc/soc-pcm.c      | 46 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 44 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dpcm.h b/include/sound/soc-dpcm.h
index e296a3949b18..6cc751002da7 100644
--- a/include/sound/soc-dpcm.h
+++ b/include/sound/soc-dpcm.h
@@ -101,6 +101,8 @@ struct snd_soc_dpcm_runtime {
 	enum snd_soc_dpcm_state state;
 
 	int trigger_pending; /* trigger cmd + 1 if pending, 0 if not */
+
+	int be_start; /* refcount protected by dpcm_lock */
 };
 
 #define for_each_dpcm_fe(be, stream, _dpcm)				\
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index 0717f39d2eec..b2440f2f9bf5 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -1534,7 +1534,7 @@ int dpcm_be_dai_startup(struct snd_soc_pcm_runtime *fe, int stream)
 			be->dpcm[stream].state = SND_SOC_DPCM_STATE_CLOSE;
 			goto unwind;
 		}
-
+		be->dpcm[stream].be_start = 0;
 		be->dpcm[stream].state = SND_SOC_DPCM_STATE_OPEN;
 		count++;
 	}
@@ -2001,6 +2001,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 	int ret = 0;
 	unsigned long flags;
 	enum snd_soc_dpcm_state state;
+	bool do_trigger;
 
 	for_each_dpcm_be(fe, stream, dpcm) {
 		struct snd_pcm_substream *be_substream;
@@ -2015,6 +2016,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 		dev_dbg(be->dev, "ASoC: trigger BE %s cmd %d\n",
 			be->dai_link->name, cmd);
 
+		do_trigger = false;
 		switch (cmd) {
 		case SNDRV_PCM_TRIGGER_START:
 			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
@@ -2025,13 +2027,20 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 				continue;
 			}
 			state = be->dpcm[stream].state;
+			if (be->dpcm[stream].be_start == 0)
+				do_trigger = true;
+			be->dpcm[stream].be_start++;
 			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
 			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
+			if (!do_trigger)
+				continue;
+
 			ret = soc_pcm_trigger(be_substream, cmd);
 			if (ret) {
 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 				be->dpcm[stream].state = state;
+				be->dpcm[stream].be_start--;
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				goto end;
 			}
@@ -2045,13 +2054,20 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 			}
 
 			state = be->dpcm[stream].state;
+			if (be->dpcm[stream].be_start == 0)
+				do_trigger = true;
+			be->dpcm[stream].be_start++;
 			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
 			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
+			if (!do_trigger)
+				continue;
+
 			ret = soc_pcm_trigger(be_substream, cmd);
 			if (ret) {
 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 				be->dpcm[stream].state = state;
+				be->dpcm[stream].be_start--;
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				goto end;
 			}
@@ -2065,13 +2081,20 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 			}
 
 			state = be->dpcm[stream].state;
+			if (be->dpcm[stream].be_start == 0)
+				do_trigger = true;
+			be->dpcm[stream].be_start++;
 			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
 			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
+			if (!do_trigger)
+				continue;
+
 			ret = soc_pcm_trigger(be_substream, cmd);
 			if (ret) {
 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 				be->dpcm[stream].state = state;
+				be->dpcm[stream].be_start--;
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				goto end;
 			}
@@ -2084,9 +2107,15 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				continue;
 			}
+			if ((be->dpcm[stream].state == SND_SOC_DPCM_STATE_START &&
+			     be->dpcm[stream].be_start == 1) ||
+			    (be->dpcm[stream].state == SND_SOC_DPCM_STATE_PAUSED &&
+			     be->dpcm[stream].be_start == 0))
+				do_trigger = true;
+			be->dpcm[stream].be_start--;
 			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
-			if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
+			if (!do_trigger)
 				continue;
 
 			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
@@ -2098,6 +2127,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 			if (ret) {
 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 				be->dpcm[stream].state = state;
+				be->dpcm[stream].be_start++;
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				goto end;
 			}
@@ -2109,9 +2139,12 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				continue;
 			}
+			if (be->dpcm[stream].be_start == 1)
+				do_trigger = true;
+			be->dpcm[stream].be_start--;
 			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
-			if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
+			if (!do_trigger)
 				continue;
 
 			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
@@ -2123,6 +2156,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 			if (ret) {
 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 				be->dpcm[stream].state = state;
+				be->dpcm[stream].be_start++;
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				goto end;
 			}
@@ -2134,9 +2168,12 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				continue;
 			}
+			if (be->dpcm[stream].be_start == 1)
+				do_trigger = true;
+			be->dpcm[stream].be_start--;
 			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
-			if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
+			if (!do_trigger)
 				continue;
 
 			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
@@ -2148,6 +2185,7 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 			if (ret) {
 				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 				be->dpcm[stream].state = state;
+				be->dpcm[stream].be_start++;
 				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 				goto end;
 			}
-- 
cgit v1.2.3


From 3202e2f5fac0032cb1128fb8d5b7f3368902c8d8 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 30 Aug 2021 12:13:46 +0100
Subject: ASoC: Revert PCM trigger changes

These have turned up some issues in further testing.

Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-dpcm.h |   2 -
 sound/soc/soc-pcm.c      | 151 ++++++++---------------------------------------
 2 files changed, 23 insertions(+), 130 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-dpcm.h b/include/sound/soc-dpcm.h
index 6cc751002da7..e296a3949b18 100644
--- a/include/sound/soc-dpcm.h
+++ b/include/sound/soc-dpcm.h
@@ -101,8 +101,6 @@ struct snd_soc_dpcm_runtime {
 	enum snd_soc_dpcm_state state;
 
 	int trigger_pending; /* trigger cmd + 1 if pending, 0 if not */
-
-	int be_start; /* refcount protected by dpcm_lock */
 };
 
 #define for_each_dpcm_fe(be, stream, _dpcm)				\
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index b2440f2f9bf5..48f71bb81a2f 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -1534,7 +1534,7 @@ int dpcm_be_dai_startup(struct snd_soc_pcm_runtime *fe, int stream)
 			be->dpcm[stream].state = SND_SOC_DPCM_STATE_CLOSE;
 			goto unwind;
 		}
-		be->dpcm[stream].be_start = 0;
+
 		be->dpcm[stream].state = SND_SOC_DPCM_STATE_OPEN;
 		count++;
 	}
@@ -1999,9 +1999,6 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 	struct snd_soc_pcm_runtime *be;
 	struct snd_soc_dpcm *dpcm;
 	int ret = 0;
-	unsigned long flags;
-	enum snd_soc_dpcm_state state;
-	bool do_trigger;
 
 	for_each_dpcm_be(fe, stream, dpcm) {
 		struct snd_pcm_substream *be_substream;
@@ -2016,180 +2013,78 @@ int dpcm_be_dai_trigger(struct snd_soc_pcm_runtime *fe, int stream,
 		dev_dbg(be->dev, "ASoC: trigger BE %s cmd %d\n",
 			be->dai_link->name, cmd);
 
-		do_trigger = false;
 		switch (cmd) {
 		case SNDRV_PCM_TRIGGER_START:
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 			if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_PREPARE) &&
 			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_STOP) &&
-			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED)) {
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-				continue;
-			}
-			state = be->dpcm[stream].state;
-			if (be->dpcm[stream].be_start == 0)
-				do_trigger = true;
-			be->dpcm[stream].be_start++;
-			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-
-			if (!do_trigger)
+			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED))
 				continue;
 
 			ret = soc_pcm_trigger(be_substream, cmd);
-			if (ret) {
-				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-				be->dpcm[stream].state = state;
-				be->dpcm[stream].be_start--;
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (ret)
 				goto end;
-			}
 
+			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
 			break;
 		case SNDRV_PCM_TRIGGER_RESUME:
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_SUSPEND) {
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-				continue;
-			}
-
-			state = be->dpcm[stream].state;
-			if (be->dpcm[stream].be_start == 0)
-				do_trigger = true;
-			be->dpcm[stream].be_start++;
-			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-
-			if (!do_trigger)
+			if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_SUSPEND))
 				continue;
 
 			ret = soc_pcm_trigger(be_substream, cmd);
-			if (ret) {
-				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-				be->dpcm[stream].state = state;
-				be->dpcm[stream].be_start--;
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (ret)
 				goto end;
-			}
 
+			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
 			break;
 		case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED) {
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-				continue;
-			}
-
-			state = be->dpcm[stream].state;
-			if (be->dpcm[stream].be_start == 0)
-				do_trigger = true;
-			be->dpcm[stream].be_start++;
-			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-
-			if (!do_trigger)
+			if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED))
 				continue;
 
 			ret = soc_pcm_trigger(be_substream, cmd);
-			if (ret) {
-				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-				be->dpcm[stream].state = state;
-				be->dpcm[stream].be_start--;
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (ret)
 				goto end;
-			}
 
+			be->dpcm[stream].state = SND_SOC_DPCM_STATE_START;
 			break;
 		case SNDRV_PCM_TRIGGER_STOP:
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
 			if ((be->dpcm[stream].state != SND_SOC_DPCM_STATE_START) &&
-			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED)) {
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-				continue;
-			}
-			if ((be->dpcm[stream].state == SND_SOC_DPCM_STATE_START &&
-			     be->dpcm[stream].be_start == 1) ||
-			    (be->dpcm[stream].state == SND_SOC_DPCM_STATE_PAUSED &&
-			     be->dpcm[stream].be_start == 0))
-				do_trigger = true;
-			be->dpcm[stream].be_start--;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-
-			if (!do_trigger)
+			    (be->dpcm[stream].state != SND_SOC_DPCM_STATE_PAUSED))
 				continue;
 
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			state = be->dpcm[stream].state;
-			be->dpcm[stream].state = SND_SOC_DPCM_STATE_STOP;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
+				continue;
 
 			ret = soc_pcm_trigger(be_substream, cmd);
-			if (ret) {
-				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-				be->dpcm[stream].state = state;
-				be->dpcm[stream].be_start++;
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (ret)
 				goto end;
-			}
 
+			be->dpcm[stream].state = SND_SOC_DPCM_STATE_STOP;
 			break;
 		case SNDRV_PCM_TRIGGER_SUSPEND:
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_START) {
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_START)
 				continue;
-			}
-			if (be->dpcm[stream].be_start == 1)
-				do_trigger = true;
-			be->dpcm[stream].be_start--;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
-			if (!do_trigger)
+			if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
 				continue;
 
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			state = be->dpcm[stream].state;
-			be->dpcm[stream].state = SND_SOC_DPCM_STATE_STOP;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-
 			ret = soc_pcm_trigger(be_substream, cmd);
-			if (ret) {
-				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-				be->dpcm[stream].state = state;
-				be->dpcm[stream].be_start++;
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (ret)
 				goto end;
-			}
 
+			be->dpcm[stream].state = SND_SOC_DPCM_STATE_SUSPEND;
 			break;
 		case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_START) {
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (be->dpcm[stream].state != SND_SOC_DPCM_STATE_START)
 				continue;
-			}
-			if (be->dpcm[stream].be_start == 1)
-				do_trigger = true;
-			be->dpcm[stream].be_start--;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
 
-			if (!do_trigger)
+			if (!snd_soc_dpcm_can_be_free_stop(fe, be, stream))
 				continue;
 
-			spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-			state = be->dpcm[stream].state;
-			be->dpcm[stream].state = SND_SOC_DPCM_STATE_PAUSED;
-			spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
-
 			ret = soc_pcm_trigger(be_substream, cmd);
-			if (ret) {
-				spin_lock_irqsave(&fe->card->dpcm_lock, flags);
-				be->dpcm[stream].state = state;
-				be->dpcm[stream].be_start++;
-				spin_unlock_irqrestore(&fe->card->dpcm_lock, flags);
+			if (ret)
 				goto end;
-			}
 
+			be->dpcm[stream].state = SND_SOC_DPCM_STATE_PAUSED;
 			break;
 		}
 	}
-- 
cgit v1.2.3